import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Sample documents in Chinese (short statements about liking fruit, sports, and running)
documents = [
    "我喜欢吃苹果",
    "她不喜欢吃水果",
    "她喜欢吃水果",
    "他们很喜欢运动",
    "他喜欢吃香蕉",
    "他很喜欢跑步"
]

def chinese_word_cut(mytext):
    # Segment Chinese text with jieba and join the tokens with spaces
    return " ".join(jieba.cut(mytext))

# Segment every document
document_cut = [chinese_word_cut(doc) for doc in documents]
print(document_cut)

# Build the TF-IDF matrix from the segmented documents
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(document_cut)
print(X)

# Cluster the documents with K-Means
num_clusters = 2
km = KMeans(n_clusters=num_clusters, random_state=0)
km.fit(X)

# Print the cluster assignment of each sample
labels = km.labels_
print("Cluster assignments:", labels)

# Evaluate clustering quality with the silhouette score
score = silhouette_score(X, labels)
print("Silhouette Score: ", score)
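
# Optional follow-up (a sketch, not part of the original script): inspect which
# segmented terms weigh most heavily in each K-Means cluster center, to make the
# cluster assignments easier to interpret. Assumes scikit-learn >= 1.0, where
# TfidfVectorizer exposes get_feature_names_out().
terms = vectorizer.get_feature_names_out()
# Sort feature indices of each cluster center by weight, descending
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
for i in range(num_clusters):
    top_terms = [terms[ind] for ind in order_centroids[i, :3]]
    print(f"Cluster {i} top terms:", top_terms)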