```python
import jieba
from gensim import corpora, models
```

### Step 3: Prepare the data

```python
documents = [
    "我喜欢阅读书籍",
    "这本书非常有趣",
    "他喜欢跑步和游泳",
    "游泳是一项很好的运动",
]
```

### Step 4: Preprocess the text

```python
def preprocess(texts):
    # Define a stop-word list
    stop_words = set(['的', '是', '在', '一', '有', '这', '和', '对', '它'])
    # Segment each document with jieba and drop the stop words
    texts_cut = [
        [word for word in jieba.lcut(doc) if word not in stop_words]
        for doc in texts
    ]
    return texts_cut

texts = preprocess(documents)
print(texts)
```

### Step 5: Build the dictionary and corpus

```python
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Print the bag-of-words counts for each document
for i in range(len(corpus)):
    print(f"Document {i}:")
    for token_id, freq in corpus[i]:
        print(f"{dictionary[token_id]}: {freq}")
```

### Step 6: Train the LDA model

```python
lda_model = models.LdaModel(corpus, num_topics=2, id2word=dictionary, passes=10)

# Print the keywords for each topic
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic: {idx} \nWords: {topic}\n")
```

### Step 7: Classify a new document

```python
new_doc = "我经常去图书馆看书"
new_vec = dictionary.doc2bow(jieba.lcut(new_doc))
topics = lda_model[new_vec]
print(topics)
```
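`lda_model[new_vec]` returns a list of `(topic_id, probability)` pairs for the new document. As a minimal follow-up sketch (reusing the `topics` variable from step 7 above), the dominant topic can be read off with `max`:

```python
# Each entry in `topics` is a (topic_id, probability) pair,
# so the best-matching topic is the pair with the largest probability.
best_topic, best_prob = max(topics, key=lambda pair: pair[1])
print(f"Dominant topic: {best_topic} (probability {best_prob:.3f})")
```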
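One caveat: LDA training is stochastic, so the learned topics (and hence the classification above) can change between runs. gensim's `LdaModel` accepts a `random_state` argument to fix the seed; here is a sketch of the step 6 call with a seed added (the value 42 is an arbitrary choice):

```python
# Fixing random_state makes the topic assignments reproducible across runs,
# which matters here because four short documents give a very unstable fit.
lda_model = models.LdaModel(
    corpus,
    num_topics=2,
    id2word=dictionary,
    passes=10,
    random_state=42,
)
```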