"""Toy Chinese text-classification demo: jieba segmentation + TF-IDF + LinearSVC."""

import jieba
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

# 1. Data preparation (sample data).
data = {
    "text": ["这家餐厅很好吃", "手机质量很差", "电影非常精彩", "服务态度恶劣"],
    "label": ["餐饮", "数码", "影视", "服务"],
}
df = pd.DataFrame(data)


# 2. Chinese word segmentation.
def chinese_word_cut(text: str) -> str:
    """Segment *text* with jieba and return the tokens joined by single spaces.

    The space-separated form is what ``TfidfVectorizer`` expects, since its
    tokenizer splits on word boundaries rather than understanding Chinese.
    """
    return " ".join(jieba.cut(text))


df["text_cut"] = df["text"].apply(chinese_word_cut)
print(df["text_cut"])

# 3. Feature extraction (TF-IDF).
# The default token_pattern, r"(?u)\b\w\w+\b", silently drops every token that
# is a single character. Many Chinese words produced by jieba are exactly one
# character long, so the pattern is relaxed to keep tokens of length >= 1.
tfidf = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
# NOTE(review): the vectorizer is fitted on the full data set before the split,
# so IDF statistics leak from the test rows into training. Harmless for this
# toy demo; for real data, split first and fit on the training portion only.
X = tfidf.fit_transform(df["text_cut"])
print(type(X), X)
y = df["label"]

# 4. Train/test split.
# random_state pins the shuffle so repeated runs print the same report.
# NOTE(review): with 4 rows, each carrying a unique label, the held-out row's
# label never appears in the training set, so the report below is only a
# smoke test of the pipeline, not a meaningful evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 5. Train the classifier.
clf = LinearSVC()
clf.fit(X_train, y_train)

# 6. Evaluate the model.
y_pred = clf.predict(X_test)
# zero_division=0 reports 0.0 for labels with no predicted/true samples
# instead of emitting an undefined-metric warning for each of them.
print(classification_report(y_test, y_pred, zero_division=0))


# 7. Predict the label of new text.
def predict(text: str) -> str:
    """Return the label predicted for the raw (unsegmented) string *text*.

    The text is segmented with ``chinese_word_cut``, vectorized with the
    module-level fitted ``tfidf`` and classified with the module-level
    trained ``clf``.
    """
    text_cut = chinese_word_cut(text)
    text_tfidf = tfidf.transform([text_cut])
    return clf.predict(text_tfidf)[0]


print(predict("这个电子表很好用"))  # example prediction