LDA Topic Clustering Algorithm

# jieba handles Chinese word segmentation; gensim provides the dictionary and the LDA model
import jieba
from gensim import corpora, models

### Step 3: Prepare the Data
documents = [
    "我喜欢阅读书籍",        # "I enjoy reading books"
    "这本书非常有趣",        # "This book is very interesting"
    "他喜欢跑步和游泳",      # "He likes running and swimming"
    "游泳是一项很好的运动"   # "Swimming is a very good sport"
]
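In practice, the corpus usually comes from a file rather than a hard-coded list. A minimal sketch, assuming a hypothetical docs.csv with one document per row in a text column:

import pandas as pd

df = pd.read_csv("docs.csv")        # hypothetical file name
documents = df["text"].tolist()     # assumed column name "text"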
### Step 4: Preprocess the Text
def preprocess(texts):
    # Define a small stop-word list of common Chinese function words
    stop_words = {'的', '是', '在', '一', '有', '这', '和', '对', '它'}

    # Segment each document with jieba and drop the stop words
    texts_cut = [
        [word for word in jieba.lcut(doc) if word not in stop_words]
        for doc in texts
    ]
    return texts_cut

texts = preprocess(documents)
print(texts)  # each document is now a list of content-bearing tokens
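Stop-word filtering alone often leaves punctuation and single-character particles in the token lists. A common refinement, sketched here as an optional variant rather than part of the original pipeline, is to also drop very short tokens:

def preprocess_strict(texts, stop_words, min_len=2):
    # Keep tokens that are not stop words and have at least min_len characters;
    # this discards most punctuation and stray particles
    return [
        [w for w in jieba.lcut(doc) if w not in stop_words and len(w) >= min_len]
        for doc in texts
    ]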

### Step 5: Build the Dictionary and Corpus
# Map each unique token to an integer id, then represent every document
# as a bag-of-words list of (token_id, count) pairs
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Inspect the bag-of-words representation of every document
for i in range(len(corpus)):
    print(f"Document {i}:")
    for token_id, freq in corpus[i]:
        print(f"{dictionary[token_id]}: {freq}")
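On a larger corpus, the raw vocabulary usually needs pruning before training, which gensim's Dictionary supports directly. A sketch with illustrative, untuned thresholds:

# Inspect the token -> integer id mapping
print(dictionary.token2id)

# Drop tokens that appear in fewer than 2 documents or in more than 50% of them
dictionary.filter_extremes(no_below=2, no_above=0.5)

On this four-sentence toy corpus such pruning would remove most of the vocabulary, so it is shown only for reference.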

### Step 6: Train the LDA Model
# Train a 2-topic LDA model; passes sets how many times the corpus is scanned,
# and a fixed random_state makes the resulting topics reproducible
lda_model = models.LdaModel(corpus, num_topics=2, id2word=dictionary,
                            passes=10, random_state=42)

# Print the top keywords of each topic
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic: {idx} \nWords: {topic}\n")
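num_topics=2 is hard-coded here because the toy corpus clearly mixes two themes (reading and sports). On real data, the topic count is usually chosen by comparing candidate models, for example with gensim's coherence score. A minimal sketch:

from gensim.models import CoherenceModel

# Higher c_v coherence generally indicates more interpretable topics
for k in (2, 3, 4):
    model_k = models.LdaModel(corpus, num_topics=k, id2word=dictionary,
                              passes=10, random_state=42)
    cm = CoherenceModel(model=model_k, texts=texts,
                        dictionary=dictionary, coherence='c_v')
    print(k, cm.get_coherence())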

### Step 7: Classify a New Document
new_doc = "我经常去图书馆看书"  # "I often go to the library to read"
# Segment the new document and map it into the existing dictionary space
new_vec = dictionary.doc2bow(jieba.lcut(new_doc))
# Infer the topic distribution: a list of (topic_id, probability) pairs
topics = lda_model[new_vec]
print(topics)
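The model returns a probability distribution over topics rather than a single label. If a hard assignment is needed, one simple option is to take the highest-probability topic:

# Pick the (topic_id, probability) pair with the largest probability
best_topic, best_prob = max(topics, key=lambda pair: pair[1])
print(f"Assigned to topic {best_topic} with probability {best_prob:.2f}")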