中文文本自动分类示例(SVM)

root
abc abc
  • 23 May
import jieba
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

# 1. Data preparation (toy sample data)
# Each entry pairs a short review sentence with its category label.
samples = [
    ("这家餐厅很好吃", "餐饮"),
    ("手机质量很差", "数码"),
    ("电影非常精彩", "影视"),
    ("服务态度恶劣", "服务"),
]
texts, labels = zip(*samples)
data = {"text": list(texts), "label": list(labels)}
df = pd.DataFrame(data)

# 2. Chinese word segmentation
def chinese_word_cut(text):
    """Segment ``text`` with jieba and return the tokens joined by spaces."""
    tokens = jieba.cut(text)
    return " ".join(tokens)

# Segment every document once; the vectorizer below tokenizes on the spaces
# that chinese_word_cut inserts between words.
df["text_cut"] = df.text.apply(chinese_word_cut)
print(df["text_cut"])

y = df["label"]

# 3. Train/test split
# Split the segmented text *before* fitting TF-IDF so that the vocabulary and
# IDF weights are learned from the training rows only (no test-set leakage).
# A fixed random_state makes the split, and therefore every printed result,
# reproducible between runs.
text_train, text_test, y_train, y_test = train_test_split(
    df["text_cut"], y, test_size=0.2, random_state=42
)

# 4. Feature extraction (TF-IDF)
# The default token_pattern r"(?u)\b\w\w+\b" only keeps tokens of two or more
# characters, which silently discards any single-character word produced by
# the segmenter. Accept tokens of any length instead.
tfidf = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# TF-IDF matrix of the whole corpus, kept for inspection only; it is not
# used for training or evaluation.
X = tfidf.transform(df["text_cut"])
print(type(X), X)

# 5. Train the classifier
clf = LinearSVC()
clf.fit(X_train, y_train)

# 6. Evaluate the model
# NOTE: if df only holds the small demo data, where each label occurs once,
# the held-out label is never seen during training, so this report is just a
# smoke test of the pipeline rather than a meaningful score.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

# 7. Predict the label of new text
def predict(text):
    """Return the predicted label for a raw, unsegmented piece of text.

    The text is segmented with ``chinese_word_cut``, vectorized with the
    already fitted module-level ``tfidf`` and classified by the module-level
    ``clf``; the label of the single input document is returned.
    """
    features = tfidf.transform([chinese_word_cut(text)])
    predictions = clf.predict(features)
    return predictions[0]

# Example prediction on a sentence that is not part of the training data
sample_text = "这个电子表很好用"
print(predict(sample_text))