Files
c/backend/app/routes/content_routes.py
刘正航 cedfd066c4 feat: 垃圾信息分类标签功能
新增垃圾信息细分类标签,在朴素贝叶斯二分类基础上对spam进行细分:
- 新增 spam_categorizer.py 分类模块(诈骗/骚扰/广告)
- SpamPredictionLog 和 ContentPost 模型添加 category 字段
- content_routes 和 spam_routes 接口返回分类标签

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-22 21:52:08 +08:00

399 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from datetime import datetime
from flask import Blueprint, current_app, request
from flask_jwt_extended import jwt_required
from app.extensions import db
from app.ml.naive_bayes_classifier import NaiveBayesSpamClassifier
from app.ml.spam_categorizer import categorize_spam, get_category_label
from app.models import ContentPost, DetectionConfig, SpamPredictionLog, SpamTrainingSample, User
from app.utils.auth import current_user
from app.utils.response import fail, ok
content_bp = Blueprint("content", __name__)
def _classifier() -> NaiveBayesSpamClassifier:
return NaiveBayesSpamClassifier(current_app.config["NB_MODEL_PATH"])
def _active_samples() -> list[dict]:
rows = SpamTrainingSample.query.filter_by(is_active=True).order_by(SpamTrainingSample.id.asc()).all()
return [{"text": row.text, "label": row.label} for row in rows]
def _ensure_ready() -> NaiveBayesSpamClassifier:
clf = _classifier()
clf.ensure_ready(_active_samples())
return clf
def _get_config() -> DetectionConfig:
cfg = DetectionConfig.query.order_by(DetectionConfig.id.asc()).first()
if cfg:
return cfg
cfg = DetectionConfig(spam_threshold=0.75)
db.session.add(cfg)
db.session.commit()
return cfg
def _serialize_post(row: ContentPost) -> dict:
payload = row.to_dict()
payload["username"] = row.author.username if row.author else ""
payload["nickname"] = row.author.nickname if row.author else ""
payload["recipient_username"] = row.recipient.username if row.recipient else ""
payload["recipient_nickname"] = row.recipient.nickname if row.recipient else ""
payload["reviewer_username"] = row.reviewer.username if row.reviewer else ""
return payload
def _resolve_visibility(value: str) -> str:
key = (value or "public").strip().lower()
return key if key in {"public", "private", "direct"} else "public"
def _resolve_recipient(payload: dict, visibility: str, current_user_id: int):
if visibility != "direct":
return None, None
recipient = None
raw_id = payload.get("recipient_user_id")
username = (payload.get("recipient_username") or "").strip()
if raw_id is not None and str(raw_id).strip() != "":
try:
recipient = User.query.get(int(raw_id))
except Exception:
return None, "recipient_user_id 无效"
elif username:
recipient = User.query.filter_by(username=username).first()
if not recipient:
return None, "私信发布必须指定有效接收人"
if recipient.id == current_user_id:
return None, "不能给自己发送私信"
return recipient, None
def _predict_and_decide(text: str, user_credit: int = 100) -> tuple[dict, float, bool, str, str]:
"""根据用户信誉分调整阈值系数。信誉分越高,阈值越高(降低敏感度)"""
clf = _ensure_ready()
result = clf.predict(text)
base_threshold = float(_get_config().spam_threshold)
# 信誉分影响阈值系数credit 0-200默认100
# credit > 100阈值提高降低敏感度减少误判
# credit < 100阈值降低提高敏感度加强拦截
# 系数范围0.85 - 1.15
credit_factor = 1.0 + (user_credit - 100) * 0.0015 # 每10分变化1.5%
credit_factor = max(0.85, min(1.15, credit_factor))
adjusted_threshold = base_threshold * credit_factor
blocked = float(result["spam_probability"]) >= adjusted_threshold
# 分类标签
category = ""
category_label = ""
if blocked:
category, category_label = categorize_spam(result["text"])
return result, adjusted_threshold, blocked, category, category_label
@content_bp.post("/publish")
@jwt_required()
def publish_text():
user = current_user()
if not user:
return fail("用户不存在", 404)
payload = request.get_json(silent=True) or {}
text = (payload.get("text") or "").strip()
visibility = _resolve_visibility(payload.get("visibility"))
if len(text) < 2:
return fail("发布文本至少2个字符", 400)
recipient, err = _resolve_recipient(payload, visibility, user.id)
if err:
return fail(err, 400)
result, threshold, blocked, category, category_label = _predict_and_decide(text, user.credit_score or 100)
post = ContentPost(
user_id=user.id,
recipient_user_id=recipient.id if recipient else None,
text=result["text"],
visibility=visibility,
status="blocked" if blocked else "published",
prediction=result["prediction"],
category=category,
spam_probability=result["spam_probability"],
ham_probability=result["ham_probability"],
confidence=result["confidence"],
threshold=threshold,
reason_tokens=result["reason_tokens"],
model_version=result.get("model_version", ""),
manual_review_status="pending" if blocked else "none",
)
detect_log = SpamPredictionLog(
user_id=user.id,
text=result["text"],
prediction=result["prediction"],
category=category,
spam_probability=result["spam_probability"],
ham_probability=result["ham_probability"],
confidence=result["confidence"],
reason_tokens=result["reason_tokens"],
model_version=result.get("model_version", ""),
)
db.session.add(post)
db.session.add(detect_log)
# 发布成功(未被拦截),小幅增加信誉分;被拦截则小幅减少
if not blocked:
user.credit_score = min(200, (user.credit_score or 100) + 1)
else:
user.credit_score = max(0, (user.credit_score or 100) - 2)
db.session.commit()
feedback = "发布成功" if not blocked else f"{category_label or '疑似垃圾信息'},系统已拦截,可提交申诉"
return ok(
{
"publish_allowed": not blocked,
"action": "published" if not blocked else "blocked",
"feedback": feedback,
"post": _serialize_post(post),
"detect": {
**result,
"category": category,
"category_label": category_label,
},
},
feedback,
)
@content_bp.put("/posts/<int:post_id>")
@jwt_required()
def edit_post(post_id: int):
user = current_user()
if not user:
return fail("用户不存在", 404)
post = ContentPost.query.filter_by(id=post_id, user_id=user.id).first()
if not post:
return fail("发布记录不存在", 404)
payload = request.get_json(silent=True) or {}
text = (payload.get("text") or post.text).strip()
visibility = _resolve_visibility(payload.get("visibility") or post.visibility)
if len(text) < 2:
return fail("发布文本至少2个字符", 400)
recipient, err = _resolve_recipient(payload, visibility, user.id)
if err:
return fail(err, 400)
result, threshold, blocked, category, category_label = _predict_and_decide(text, user.credit_score or 100)
post.text = result["text"]
post.visibility = visibility
post.recipient_user_id = recipient.id if recipient else None
post.status = "blocked" if blocked else "published"
post.prediction = result["prediction"]
post.category = category
post.spam_probability = result["spam_probability"]
post.ham_probability = result["ham_probability"]
post.confidence = result["confidence"]
post.threshold = threshold
post.reason_tokens = result["reason_tokens"]
post.model_version = result.get("model_version", "")
post.manual_review_status = "pending" if blocked else "none"
post.manual_review_by = None
post.manual_review_note = ""
post.manual_review_at = None
post.appeal_status = "none"
post.appeal_reason = ""
post.appeal_admin_note = ""
post.appeal_submitted_at = None
post.appeal_processed_at = None
post.appeal_processed_by = None
db.session.commit()
feedback = "更新并重新发布成功" if not blocked else "更新后触发拦截,可提交申诉"
return ok(
{
"publish_allowed": not blocked,
"action": "published" if not blocked else "blocked",
"feedback": feedback,
"post": _serialize_post(post),
"detect": result,
},
feedback,
)
@content_bp.get("/posts/history")
@jwt_required()
def my_posts():
user = current_user()
if not user:
return fail("用户不存在", 404)
status = (request.args.get("status") or "").strip().lower()
visibility = (request.args.get("visibility") or "").strip().lower()
page = max(int(request.args.get("page", 1) or 1), 1)
page_size = min(max(int(request.args.get("page_size", 20) or 20), 1), 100)
query = ContentPost.query.filter_by(user_id=user.id)
if status in {"published", "blocked"}:
query = query.filter(ContentPost.status == status)
if visibility in {"public", "private", "direct"}:
query = query.filter(ContentPost.visibility == visibility)
pagination = query.order_by(ContentPost.id.desc()).paginate(page=page, per_page=page_size, error_out=False)
return ok(
{
"items": [_serialize_post(item) for item in pagination.items],
"total": pagination.total,
"page": page,
"page_size": page_size,
}
)
@content_bp.get("/posts/inbox")
@jwt_required()
def my_inbox():
user = current_user()
if not user:
return fail("用户不存在", 404)
page = max(int(request.args.get("page", 1) or 1), 1)
page_size = min(max(int(request.args.get("page_size", 20) or 20), 1), 100)
pagination = (
ContentPost.query.filter_by(recipient_user_id=user.id, visibility="direct", status="published")
.order_by(ContentPost.id.desc())
.paginate(page=page, per_page=page_size, error_out=False)
)
return ok(
{
"items": [_serialize_post(item) for item in pagination.items],
"total": pagination.total,
"page": page,
"page_size": page_size,
}
)
@content_bp.delete("/posts/<int:post_id>")
@jwt_required()
def delete_post(post_id: int):
user = current_user()
if not user:
return fail("用户不存在", 404)
row = ContentPost.query.filter_by(id=post_id, user_id=user.id).first()
if not row:
return fail("记录不存在", 404)
db.session.delete(row)
db.session.commit()
return ok({}, "记录已删除")
@content_bp.post("/posts/<int:post_id>/appeal")
@jwt_required()
def submit_appeal(post_id: int):
user = current_user()
if not user:
return fail("用户不存在", 404)
post = ContentPost.query.filter_by(id=post_id, user_id=user.id).first()
if not post:
return fail("发布记录不存在", 404)
if post.status != "blocked":
return fail("仅被拦截的信息可申诉", 400)
payload = request.get_json(silent=True) or {}
reason_type = (payload.get("reason_type") or "").strip()
reason = (payload.get("reason") or "").strip()
evidence_urls = payload.get("evidence_urls") or []
# 如果选择了快捷理由reason 可为空;否则至少 2 字符
if not reason_type and len(reason) < 2:
return fail("申诉理由至少2个字符", 400)
if post.appeal_status == "pending":
return fail("该记录已在申诉处理中", 400)
post.appeal_status = "pending"
post.appeal_reason_type = reason_type
post.appeal_reason = reason
post.appeal_evidence_urls = evidence_urls
post.appeal_submitted_at = datetime.utcnow()
post.appeal_admin_note = ""
post.appeal_processed_at = None
post.appeal_processed_by = None
post.manual_review_status = "pending"
db.session.commit()
return ok(_serialize_post(post), "申诉提交成功")
@content_bp.get("/appeals/my")
@jwt_required()
def my_appeals():
user = current_user()
if not user:
return fail("用户不存在", 404)
page = max(int(request.args.get("page", 1) or 1), 1)
page_size = min(max(int(request.args.get("page_size", 20) or 20), 1), 100)
pagination = (
ContentPost.query.filter(ContentPost.user_id == user.id, ContentPost.appeal_status != "none")
.order_by(ContentPost.id.desc())
.paginate(page=page, per_page=page_size, error_out=False)
)
return ok(
{
"items": [_serialize_post(item) for item in pagination.items],
"total": pagination.total,
"page": page,
"page_size": page_size,
}
)
@content_bp.get("/posts/public")
@jwt_required(optional=True)
def public_feed():
page = max(int(request.args.get("page", 1) or 1), 1)
page_size = min(max(int(request.args.get("page_size", 20) or 20), 1), 100)
pagination = (
ContentPost.query.filter_by(visibility="public", status="published")
.order_by(ContentPost.id.desc())
.paginate(page=page, per_page=page_size, error_out=False)
)
return ok(
{
"items": [_serialize_post(item) for item in pagination.items],
"total": pagination.total,
"page": page,
"page_size": page_size,
}
)