1

2026-04-21 22:45:19 +08:00
commit b5237f9038
159 changed files with 7769 additions and 0 deletions
--- a/backend/app/ml/__init__.py
+++ b/backend/app/ml/__init__.py
@@ -0,0 +1 @@
+
--- a/backend/app/ml/__pycache__/__init__.cpython-310.pyc
+++ b/backend/app/ml/__pycache__/__init__.cpython-310.pyc
--- a/backend/app/ml/__pycache__/__init__.cpython-311.pyc
+++ b/backend/app/ml/__pycache__/__init__.cpython-311.pyc
--- a/backend/app/ml/__pycache__/naive_bayes_classifier.cpython-310.pyc
+++ b/backend/app/ml/__pycache__/naive_bayes_classifier.cpython-310.pyc
--- a/backend/app/ml/__pycache__/naive_bayes_classifier.cpython-311.pyc
+++ b/backend/app/ml/__pycache__/naive_bayes_classifier.cpython-311.pyc
--- a/backend/app/ml/__pycache__/rf_recommender.cpython-311.pyc
+++ b/backend/app/ml/__pycache__/rf_recommender.cpython-311.pyc
--- a/backend/app/ml/naive_bayes_classifier.py
+++ b/backend/app/ml/naive_bayes_classifier.py
@@ -0,0 +1,156 @@
+import hashlib
+from collections import Counter
+from datetime import datetime
+from pathlib import Path
+
+import joblib
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.naive_bayes import MultinomialNB
+
+
+# The only labels the classifier accepts; normalize_label() maps anything else to "".
+VALID_LABELS = {"spam", "ham"}
+
+
+class NaiveBayesSpamClassifier:
+    """Spam/ham text classifier: character n-gram TF-IDF + multinomial Naive Bayes.
+
+    The fitted vectorizer, the model and a small metadata dict are stored
+    together in a single joblib file at ``model_path``.
+    """
+
+    def __init__(self, model_path: str):
+        """Record where the model file lives; nothing is loaded or trained yet."""
+        self.model_path = Path(model_path)
+        self.vectorizer = None
+        self.model = None
+        self.metadata = {}
+
+    @staticmethod
+    def normalize_label(label: str) -> str:
+        """Return the stripped, lower-cased label, or "" if it is not in VALID_LABELS."""
+        value = (label or "").strip().lower()
+        return value if value in VALID_LABELS else ""
+
+    @staticmethod
+    def normalize_text(text: str) -> str:
+        """Collapse every whitespace run to one space and trim both ends."""
+        # str.split() with no argument already discards leading/trailing whitespace.
+        return " ".join((text or "").split())
+
+    @staticmethod
+    def _to_metadata(samples: list[dict], version_seed: str) -> dict:
+        """Build the metadata dict stored next to the model.
+
+        ``version`` is "nb-" followed by the first 16 hex digits of the MD5 of
+        ``version_seed``, so identical seeds always give the same version.
+        """
+        label_counts = Counter(item["label"] for item in samples)
+        # MD5 is only a content fingerprint here, not a security measure; saying
+        # so explicitly keeps the call usable on FIPS-restricted Python builds.
+        digest = hashlib.md5(
+            version_seed.encode("utf-8"), usedforsecurity=False
+        ).hexdigest()[:16]
+        return {
+            # Naive timestamp in UTC (no tzinfo suffix in the ISO string).
+            "trained_at": datetime.utcnow().isoformat(),
+            "sample_count": len(samples),
+            "label_distribution": dict(label_counts),
+            "version": f"nb-{digest}",
+        }
+
+    def train(self, samples: list[dict]) -> dict:
+        """Fit vectorizer and model on ``samples``, save them, and return the metadata.
+
+        Each sample is a dict read through its "text" and "label" keys. Rows
+        whose text is empty after normalisation or whose label is not in
+        VALID_LABELS are dropped.
+
+        Raises:
+            ValueError: if fewer than 10 usable samples remain.
+        """
+        clean_samples = []
+        for row in samples:
+            text = self.normalize_text(row.get("text"))
+            label = self.normalize_label(row.get("label"))
+            if not text or not label:
+                continue
+            clean_samples.append({"text": text, "label": label})
+
+        if len(clean_samples) < 10:
+            raise ValueError("训练样本太少，至少需要10条有效样本")
+
+        texts = [item["text"] for item in clean_samples]
+        labels = [item["label"] for item in clean_samples]
+
+        # Character uni- and bi-grams, so no word tokenizer is required.
+        vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(1, 2), min_df=1)
+        x = vectorizer.fit_transform(texts)
+
+        model = MultinomialNB(alpha=0.4)
+        model.fit(x, labels)
+
+        # The version fingerprint covers every cleaned sample, in order.
+        version_seed = "||".join(f"{item['label']}::{item['text']}" for item in clean_samples)
+        metadata = self._to_metadata(clean_samples, version_seed)
+
+        self.model_path.parent.mkdir(parents=True, exist_ok=True)
+        joblib.dump({"vectorizer": vectorizer, "model": model, "metadata": metadata}, self.model_path)
+
+        self.vectorizer = vectorizer
+        self.model = model
+        self.metadata = metadata
+        return metadata
+
+    def load(self) -> bool:
+        """Restore a saved model from ``model_path``.
+
+        Returns True only when both the vectorizer and the model were restored.
+        If the file is missing or its payload is incomplete, the instance state
+        is left untouched and False is returned.
+        """
+        if not self.model_path.exists():
+            return False
+
+        # NOTE(security): joblib.load unpickles the file and can therefore run
+        # arbitrary code; model_path must only point at files this app wrote.
+        payload = joblib.load(self.model_path)
+        if not isinstance(payload, dict):
+            return False
+
+        vectorizer = payload.get("vectorizer")
+        model = payload.get("model")
+        if vectorizer is None or model is None:
+            return False
+
+        self.vectorizer = vectorizer
+        self.model = model
+        # Guard against a stored None so later .get() calls keep working.
+        self.metadata = payload.get("metadata") or {}
+        return True
+
+    def ensure_ready(self, samples: list[dict]) -> dict:
+        """Load the saved model if possible, otherwise train on ``samples``; return metadata."""
+        if self.load():
+            return self.metadata
+        return self.train(samples)
+
+    def predict(self, text: str) -> dict:
+        """Classify ``text`` and return the label, probabilities and explanation tokens.
+
+        Raises:
+            RuntimeError: if no model has been loaded or trained.
+            ValueError: if the normalised text is shorter than 2 characters.
+        """
+        if self.vectorizer is None or self.model is None:
+            raise RuntimeError("模型未加载，请先训练")
+
+        cleaned = self.normalize_text(text)
+        if len(cleaned) < 2:
+            raise ValueError("待识别文本至少2个字符")
+
+        x = self.vectorizer.transform([cleaned])
+        probs = self.model.predict_proba(x)[0]
+        classes = list(self.model.classes_)
+
+        # A label the model never saw during training has probability 0. Falling
+        # back to index 0 would hand it the other class's probability instead.
+        spam_prob = float(probs[classes.index("spam")]) if "spam" in classes else 0.0
+        ham_prob = float(probs[classes.index("ham")]) if "ham" in classes else 0.0
+        prediction = "spam" if spam_prob >= ham_prob else "ham"
+
+        reason_tokens = self._extract_reason_tokens(cleaned, classes, x)
+        confidence = max(spam_prob, ham_prob)
+
+        return {
+            "text": cleaned,
+            "prediction": prediction,
+            "prediction_text": "垃圾信息" if prediction == "spam" else "正常信息",
+            "spam_probability": round(spam_prob, 4),
+            "ham_probability": round(ham_prob, 4),
+            "confidence": round(confidence, 4),
+            "reason_tokens": reason_tokens,
+            "model_version": self.metadata.get("version", ""),
+            "trained_at": self.metadata.get("trained_at"),
+        }
+
+    def _extract_reason_tokens(self, text: str, classes: list[str], x_row) -> list[str]:
+        """Return up to five n-grams of the input that separate spam from ham most.
+
+        Tokens are ranked by the absolute difference between their spam and ham
+        log-probabilities. This is a best-effort explanation: if the model
+        internals cannot be read, the first five characters of ``text`` are
+        returned instead.
+        """
+        try:
+            feature_names = self.vectorizer.get_feature_names_out()
+            log_prob = self.model.feature_log_prob_
+            active = list(x_row.nonzero()[1])  # feature columns present in this text
+
+            if "spam" in classes and "ham" in classes:
+                spam_row = log_prob[classes.index("spam")]
+                ham_row = log_prob[classes.index("ham")]
+                scored = [(feature_names[idx], spam_row[idx] - ham_row[idx]) for idx in active]
+            else:
+                # Single-class model: there is no contrast to rank by, so the
+                # tokens keep the order in which the vectorizer reported them.
+                scored = [(feature_names[idx], 0.0) for idx in active]
+
+            scored.sort(key=lambda pair: abs(pair[1]), reverse=True)
+            return [token for token, _ in scored[:5]]
+        except (AttributeError, IndexError, KeyError, TypeError, ValueError):
+            return list(text[:5])
+
+    def model_info(self) -> dict:
+        """Summarise readiness, file location and training metadata of the model."""
+        return {
+            "ready": self.vectorizer is not None and self.model is not None,
+            "model_path": str(self.model_path),
+            "version": self.metadata.get("version", ""),
+            "trained_at": self.metadata.get("trained_at"),
+            "sample_count": int(self.metadata.get("sample_count", 0) or 0),
+            "label_distribution": self.metadata.get("label_distribution", {}),
+        }
--- a/backend/app/ml/rf_recommender.py
+++ b/backend/app/ml/rf_recommender.py
@@ -0,0 +1,252 @@
+import hashlib
+import json
+from datetime import datetime
+from pathlib import Path
+
+import joblib
+import numpy as np
+from sklearn.ensemble import RandomForestRegressor
+
+
+# Integer codes for the dietary-goal feature; _encode_goal() uses 0 for unknown goals.
+GOAL_MAP = {
+    "maintain": 0,
+    "lose_fat": 1,
+    "gain_muscle": 2,
+    "keto": 3,
+}
+
+# Integer codes for the occupation feature. "通用" ("general") is the bucket
+# _encode_occupation() falls back to for missing or unknown occupations.
+OCCUPATION_MAP = {
+    "通用": 0,
+    "student": 1,
+    "office": 2,
+    "teacher": 3,
+    "developer": 4,
+    "healthcare": 5,
+    "fitness": 6,
+    "manual": 7,
+}
+
+
+class RandomForestDietRecommender:
+    """Ranks recipes for a user profile with a random-forest regressor.
+
+    The forest is fitted on randomly sampled profiles whose target scores come
+    from ``_heuristic_score``. The fitted model is saved with joblib together
+    with a signature of the recipe set and is retrained when that signature
+    no longer matches.
+    """
+
+    def __init__(self, model_path: str):
+        """Record where the model file lives; nothing is loaded or trained yet."""
+        self.model_path = Path(model_path)
+        self.model = None
+        self.recipe_signature = None
+
+    @staticmethod
+    def _encode_goal(goal: str) -> int:
+        """Map a goal name to its GOAL_MAP code (0 for missing or unknown goals)."""
+        return GOAL_MAP.get(goal or "maintain", 0)
+
+    @staticmethod
+    def _encode_occupation(occupation: str) -> int:
+        """Map an occupation to its OCCUPATION_MAP code, defaulting to "通用"."""
+        return OCCUPATION_MAP.get(occupation or "通用", OCCUPATION_MAP["通用"])
+
+    def _signature(self, recipes: list) -> str:
+        """Return an MD5 hex digest over the id, name, nutrition and update time of each recipe."""
+        raw = [
+            {
+                "id": item.id,
+                "name": item.name,
+                "calories": item.calories,
+                "protein": item.protein,
+                "fat": item.fat,
+                "carbs": item.carbs,
+                "fiber": item.fiber,
+                "updated": item.updated_at.isoformat() if item.updated_at else "",
+            }
+            for item in recipes
+        ]
+        raw_json = json.dumps(raw, ensure_ascii=False, sort_keys=True)
+        # MD5 is a change-detection fingerprint here, not a security measure;
+        # saying so keeps the call usable on FIPS-restricted Python builds.
+        return hashlib.md5(raw_json.encode("utf-8"), usedforsecurity=False).hexdigest()
+
+    @staticmethod
+    def _daily_target_kcal(profile: dict) -> float:
+        """Estimate the daily calorie target for ``profile``.
+
+        Starts at 1800 kcal plus 40% of ``exercise_kcal``, scales the result by
+        a goal-specific factor and never returns less than 1200.
+        """
+        goal = profile.get("goal", "maintain")
+        # NOTE(review): a missing exercise_kcal counts as 0 here, while
+        # _build_feature defaults it to 300 - confirm which one is intended.
+        baseline = 1800 + float(profile.get("exercise_kcal", 0)) * 0.4
+        if goal == "lose_fat":
+            baseline *= 0.82
+        elif goal == "gain_muscle":
+            baseline *= 1.12
+        elif goal == "keto":
+            baseline *= 0.9
+        return max(baseline, 1200)
+
+    @staticmethod
+    def _heuristic_score(profile: dict, recipe) -> float:
+        """Rule-based suitability score in [1, 100]; used as the training target.
+
+        Starts from 100, subtracts a penalty for the gap between the recipe's
+        calories and one third of the daily target, then applies goal-specific
+        bonuses and penalties for protein, fat, carbs and fiber.
+        """
+        goal = profile.get("goal", "maintain")
+        daily_target = RandomForestDietRecommender._daily_target_kcal(profile)
+        target_per_meal = daily_target / 3
+        # A missing fiber value counts as 0, the same way _build_feature reads
+        # it; multiplying None would raise for every "maintain" profile.
+        fiber = recipe.fiber or 0
+
+        cal_gap_ratio = abs(recipe.calories - target_per_meal) / max(target_per_meal, 1)
+        protein_ratio = recipe.protein / max(recipe.calories, 1)
+        carbs_ratio = recipe.carbs / max(recipe.calories, 1)
+        fat_ratio = recipe.fat / max(recipe.calories, 1)
+
+        score = 100.0
+        score -= min(cal_gap_ratio * 55, 50)
+
+        if goal == "lose_fat":
+            score += min(recipe.protein * 0.6, 18)
+            score -= max((recipe.fat - 20) * 0.7, 0)
+            score -= max((recipe.carbs - 55) * 0.3, 0)
+        elif goal == "gain_muscle":
+            score += min(recipe.protein * 0.8, 26)
+            score += min(recipe.carbs * 0.2, 10)
+        elif goal == "keto":
+            score += min(recipe.fat * 0.4, 18)
+            score -= max(recipe.carbs - 30, 0) * 0.8
+        else:
+            score += min(fiber * 1.2, 8)
+
+        # Higher body fat: penalise meals above 520 kcal.
+        body_fat = float(profile.get("body_fat", 20))
+        if body_fat > 28:
+            score -= max(recipe.calories - 520, 0) * 0.03
+
+        # Already eating above target: penalise meals above 460 kcal.
+        intake_kcal = float(profile.get("intake_kcal", 1800))
+        if intake_kcal > daily_target:
+            score -= max(recipe.calories - 460, 0) * 0.02
+
+        score += np.clip((protein_ratio - 0.12) * 100, -8, 8)
+        score += np.clip((0.08 - carbs_ratio) * 80 if goal == "keto" else 0, -6, 6)
+        score += np.clip((0.25 - fat_ratio) * 30 if goal == "lose_fat" else 0, -5, 5)
+
+        return float(np.clip(score, 1, 100))
+
+    def _build_feature(self, profile: dict, recipe) -> list:
+        """Return the 13-value feature row: 8 profile values followed by 5 recipe values."""
+        return [
+            float(profile.get("weight", 65)),
+            float(profile.get("body_fat", 20)),
+            float(profile.get("exercise_kcal", 300)),
+            float(profile.get("intake_kcal", 1800)),
+            float(profile.get("age", 25)),
+            float(profile.get("height_cm", 170)),
+            float(self._encode_goal(profile.get("goal", "maintain"))),
+            float(self._encode_occupation(profile.get("occupation", "通用"))),
+            float(recipe.calories),
+            float(recipe.protein),
+            float(recipe.fat),
+            float(recipe.carbs),
+            float(recipe.fiber or 0),
+        ]
+
+    def _sample_profiles(self, n: int = 600) -> list:
+        """Generate ``n`` random profiles from a fixed seed (2026), so runs are repeatable."""
+        rng = np.random.default_rng(2026)
+        goals = list(GOAL_MAP.keys())
+        occupations = list(OCCUPATION_MAP.keys())
+
+        profiles = []
+        for _ in range(n):
+            # The order of the draws below fixes the random sequence; changing
+            # it would change the training data and therefore the model.
+            goal = goals[int(rng.integers(0, len(goals)))]
+            occupation = occupations[int(rng.integers(0, len(occupations)))]
+            profiles.append(
+                {
+                    "weight": float(rng.uniform(45, 100)),
+                    "body_fat": float(rng.uniform(10, 38)),
+                    "exercise_kcal": float(rng.uniform(50, 850)),
+                    "intake_kcal": float(rng.uniform(1200, 3200)),
+                    "age": float(rng.uniform(18, 55)),
+                    "height_cm": float(rng.uniform(150, 190)),
+                    "goal": goal,
+                    "occupation": occupation,
+                }
+            )
+        return profiles
+
+    def train(self, recipes: list) -> None:
+        """Fit the forest on every (sampled profile, recipe) pair and save it.
+
+        Raises:
+            ValueError: if ``recipes`` is empty.
+        """
+        if not recipes:
+            raise ValueError("训练随机森林前至少需要 1 条食谱数据")
+
+        x_rows = []
+        y_rows = []
+        sampled_profiles = self._sample_profiles()
+
+        for profile in sampled_profiles:
+            for recipe in recipes:
+                x_rows.append(self._build_feature(profile, recipe))
+                y_rows.append(self._heuristic_score(profile, recipe))
+
+        x = np.array(x_rows)
+        y = np.array(y_rows)
+
+        model = RandomForestRegressor(
+            n_estimators=240,
+            random_state=2026,
+            max_depth=12,
+            min_samples_leaf=2,
+            n_jobs=-1,
+        )
+        model.fit(x, y)
+
+        self.model = model
+        self.recipe_signature = self._signature(recipes)
+
+        self.model_path.parent.mkdir(parents=True, exist_ok=True)
+        joblib.dump(
+            {
+                "model": model,
+                "recipe_signature": self.recipe_signature,
+                "trained_at": datetime.utcnow().isoformat(),
+            },
+            self.model_path,
+        )
+
+    def load_or_train(self, recipes: list) -> None:
+        """Make ``self.model`` match ``recipes``: reuse, load from disk, or retrain."""
+        current_signature = self._signature(recipes)
+
+        # Fast path: the model in memory was built for exactly this recipe
+        # set, so the file does not need to be read again.
+        if self.model is not None and self.recipe_signature == current_signature:
+            return
+
+        if self.model_path.exists():
+            # NOTE(security): joblib.load unpickles the file and can therefore
+            # run arbitrary code; model_path must only point at files this app wrote.
+            payload = joblib.load(self.model_path)
+            if (
+                isinstance(payload, dict)
+                and payload.get("recipe_signature") == current_signature
+                and payload.get("model") is not None
+            ):
+                self.model = payload["model"]
+                self.recipe_signature = current_signature
+                return
+
+        self.train(recipes)
+
+    @staticmethod
+    def _build_reason(profile: dict, recipe, score: float) -> str:
+        """Return the user-facing explanation; ``score`` only matters when no specific goal matches."""
+        goal = profile.get("goal", "maintain")
+        if goal == "lose_fat":
+            return f"热量适中，蛋白质 {recipe.protein}g，适合减脂期控热量和保肌。"
+        if goal == "gain_muscle":
+            return "蛋白质与碳水配置较高，适合增肌训练后的恢复。"
+        if goal == "keto":
+            return f"碳水 {recipe.carbs}g，偏低碳结构，适合生酮期参考。"
+        if score > 80:
+            return "营养均衡度高，适合作为日常轻食搭配。"
+        return "综合营养结构较均衡，可作为个性化备选方案。"
+
+    def recommend(self, profile: dict, recipes: list, top_k: int = 5) -> list:
+        """Return the ``top_k`` best-scoring recipes for ``profile``.
+
+        Each entry is ``recipe.to_dict()`` extended with "rf_score" (the
+        predicted score rounded to 2 decimals) and "reason". An empty recipe
+        list returns [] without loading or training anything.
+        """
+        if not recipes:
+            return []
+
+        self.load_or_train(recipes)
+        x = np.array([self._build_feature(profile, recipe) for recipe in recipes])
+        pred_scores = self.model.predict(x)
+
+        result = []
+        for recipe, score in zip(recipes, pred_scores):
+            row = recipe.to_dict()
+            row["rf_score"] = round(float(score), 2)
+            row["reason"] = self._build_reason(profile, recipe, float(score))
+            result.append(row)
+
+        result.sort(key=lambda item: item["rf_score"], reverse=True)
+        return result[:top_k]
+
+
+def merge_profile_with_history(base_profile: dict, history: list) -> dict:
+    """Overlay the averages of logged body measurements onto ``base_profile``.
+
+    For weight, body_fat, exercise_kcal and intake_kcal the mean over
+    ``history`` replaces the base value. Records where a field is None are
+    left out of that field's mean, and a field with no recorded value at all
+    keeps whatever ``base_profile`` had.
+
+    Returns ``base_profile`` itself when ``history`` is empty, otherwise a new
+    dict; the input dict is never modified.
+    """
+    if not history:
+        return base_profile
+
+    merged = dict(base_profile)
+    for field in ("weight", "body_fat", "exercise_kcal", "intake_kcal"):
+        recorded = [
+            value
+            for value in (getattr(item, field) for item in history)
+            if value is not None
+        ]
+        # np.mean raises TypeError on None entries, so average only what was logged.
+        if recorded:
+            merged[field] = float(np.mean(recorded))
+    return merged