Files
c/backend/app/ml/spam_categorizer.py
刘正航 cedfd066c4 feat: 垃圾信息分类标签功能
新增垃圾信息细分类标签,在朴素贝叶斯二分类基础上对spam进行细分:
- 新增 spam_categorizer.py 分类模块(诈骗/骚扰/广告)
- SpamPredictionLog 和 ContentPost 模型添加 category 字段
- content_routes 和 spam_routes 接口返回分类标签

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-22 21:52:08 +08:00

68 lines
2.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""垃圾信息分类标签模块
在朴素贝叶斯二分类(spam/ham)的基础上,对判定为 spam 的文本进一步打上细分类标签。
分类优先级:诈骗 > 骚扰 > 广告(按危害程度排序)
"""
# Keyword lists per spam sub-category, keyed by category code.
# categorize_spam() treats a text as belonging to a category when any keyword
# of that category occurs in the text as a substring.
# NOTE(review): the advertisement keyword "百分百中奖" contains the fraud keyword
# "中奖"; because fraud is checked first, that advertisement entry can never
# decide the result on its own -- confirm whether this is intended.
CATEGORY_KEYWORDS = {
    "fraud": [
        "中奖",
        "幸运粉丝",
        "幸运用户",
        "银行卡异常",
        "社保异常",
        "账号冻结",
        "解封",
        "立即验证",
        "验证码",
        "欠费停机",
        "退款待确认",
        "违章信息",
        "紧急通知",
        "账户异常",
        "风险",
        "核验",
        "被冻结",
        "将被冻结",
    ],
    "harassment": [
        "兼职",
        "日结",
        "高薪",
        "刷单",
        "赚钱",
        "外快",
        "宝妈",
        "学生都能做",
        "添加微信",
        "扫码进群",
        "进群立刻",
        "想赚",
        "零花钱",
        "在家办公",
        "无需面试",
        "火热招募",
        "秒赚",
        "招募",
    ],
    "advertisement": [
        "领取",
        "优惠",
        "红包",
        "优惠券",
        "秒杀",
        "返现",
        "补贴",
        "会员",
        "特价",
        "低价",
        "点击链接",
        "扫码",
        "免费领取",
        "无门槛",
        "现金券",
        "盲盒",
        "百分百中奖",
        "隐藏优惠券",
        "内部价",
        "货到付款",
        "限时",
        "最后",
        "名额",
        "先到先得",
    ],
}

# Display label (Chinese, shown to users) for each category code.
CATEGORY_LABELS = {
    "fraud": "疑似诈骗",  # suspected fraud
    "harassment": "疑似骚扰",  # suspected harassment
    "advertisement": "疑似广告",  # suspected advertisement
    "spam": "疑似垃圾",  # generic spam: no sub-category keyword matched
    "ham": "",  # not spam: empty label
}

# Order in which categories are tried; the first category with a keyword hit
# wins, so earlier entries take precedence over later ones.
CATEGORY_PRIORITY = [
    "fraud",
    "harassment",
    "advertisement",
]
def categorize_spam(text: str) -> tuple[str, str]:
    """Pick the fine-grained category of a message by keyword matching.

    Categories are tried in ``CATEGORY_PRIORITY`` order. The first category
    that has at least one keyword occurring as a substring of ``text`` wins.
    Both the text and the keywords are lower-cased before comparison, so the
    match is case-insensitive.

    Args:
        text: Message content to categorize. ``None`` is treated like an
            empty string instead of raising ``AttributeError``.

    Returns:
        A ``(category_code, category_label)`` pair. ``category_code`` is the
        matching entry of ``CATEGORY_PRIORITY``, or ``"spam"`` when no keyword
        matches; ``category_label`` is the ``CATEGORY_LABELS`` entry for that
        code.
    """
    # ``text or ""`` keeps every str input unchanged while letting a missing
    # value (None) fall through to the generic "spam" result.
    haystack = (text or "").lower()

    for category in CATEGORY_PRIORITY:
        keywords = CATEGORY_KEYWORDS.get(category, [])
        # Any single keyword hit decides the category; later categories are
        # not consulted once a higher-priority one matches.
        if any(kw.lower() in haystack for kw in keywords):
            return category, CATEGORY_LABELS[category]

    return "spam", CATEGORY_LABELS["spam"]
def get_category_label(category: str) -> str:
    """Look up the display label for a category code.

    Args:
        category: Category code to look up in ``CATEGORY_LABELS``.

    Returns:
        The label stored in ``CATEGORY_LABELS`` for ``category``, or an empty
        string when the code is not a key of that mapping.
    """
    try:
        return CATEGORY_LABELS[category]
    except KeyError:
        # Unknown codes map to an empty label rather than an error.
        return ""