A TF-IDF-Based Chinese Text Clustering Method

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

documents = [
    "我喜欢吃苹果",    # "I like eating apples"
    "她不喜欢吃水果",  # "She does not like eating fruit"
    "她喜欢吃水果",    # "She likes eating fruit"
    "他们很喜欢运动",  # "They really like sports"
    "他喜欢吃香蕉",    # "He likes eating bananas"
    "他很喜欢跑步"     # "He really likes running"
]

def chinese_word_cut(mytext):
    # Segment Chinese text with jieba and join the tokens with spaces,
    # so the result can be tokenized by TfidfVectorizer.
    return " ".join(jieba.cut(mytext))

# Segment each document into words
document_cut = [chinese_word_cut(doc) for doc in documents]
print(document_cut)

# token_pattern keeps single-character words; sklearn's default pattern
# drops tokens shorter than two characters, which discards many Chinese words.
vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
X = vectorizer.fit_transform(document_cut)
print(X)
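
# Not in the original post: listing the learned vocabulary shows which
# segmented words became TF-IDF features.
print(vectorizer.get_feature_names_out())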

# Cluster the TF-IDF vectors with k-means (k is chosen manually here)
num_clusters = 2
km = KMeans(n_clusters=num_clusters, random_state=0)
km.fit(X)

# Output the cluster each sample was assigned to
labels = km.labels_
print("Cluster assignments:", labels)

# Silhouette score ranges from -1 to 1; higher means better-separated clusters.
score = silhouette_score(X, labels)
print("Silhouette Score:", score)