"""垃圾信息分类标签模块 在朴素贝叶斯二分类(spam/ham)基础上,对判定为 spam 的文本进行细分类标签。 分类优先级:诈骗 > 骚扰 > 广告(按危害程度排序) """ CATEGORY_KEYWORDS = { "fraud": [ "中奖", "幸运粉丝", "幸运用户", "银行卡异常", "社保异常", "账号冻结", "解封", "立即验证", "验证码", "欠费停机", "退款待确认", "违章信息", "紧急通知", "账户异常", "风险", "核验", "被冻结", "将被冻结", ], "harassment": [ "兼职", "日结", "高薪", "刷单", "赚钱", "外快", "宝妈", "学生都能做", "添加微信", "扫码进群", "进群立刻", "想赚", "零花钱", "在家办公", "无需面试", "火热招募", "秒赚", "招募", ], "advertisement": [ "领取", "优惠", "红包", "优惠券", "秒杀", "返现", "补贴", "会员", "特价", "低价", "点击链接", "扫码", "免费领取", "无门槛", "现金券", "盲盒", "百分百中奖", "隐藏优惠券", "内部价", "货到付款", "限时", "最后", "名额", "先到先得", ], } CATEGORY_LABELS = { "fraud": "疑似诈骗", "harassment": "疑似骚扰", "advertisement": "疑似广告", "spam": "疑似垃圾", "ham": "", } CATEGORY_PRIORITY = ["fraud", "harassment", "advertisement"] def categorize_spam(text: str) -> tuple[str, str]: """根据关键词匹配判定垃圾信息的具体分类标签 Args: text: 待分类的文本内容 Returns: tuple[str, str]: (category_code, category_label) - category_code: fraud | harassment | advertisement | spam - category_label: 疑似诈骗 | 疑似骚扰 | 疑似广告 | 疑似垃圾 """ text_lower = text.lower() for category in CATEGORY_PRIORITY: keywords = CATEGORY_KEYWORDS.get(category, []) for kw in keywords: if kw.lower() in text_lower: return category, CATEGORY_LABELS[category] return "spam", CATEGORY_LABELS["spam"] def get_category_label(category: str) -> str: """获取分类标签的中文显示文本 Args: category: 分类代码 Returns: str: 中文标签文本 """ return CATEGORY_LABELS.get(category, "")