# recommender.py — content-based internship recommender (TF-IDF over jieba-segmented text).
import pandas as pd
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import hashlib
import sys
def resource_path(relative_path):
    """Resolve *relative_path* against the application base directory.

    When running from a PyInstaller bundle, the base is the temporary
    extraction directory (``sys._MEIPASS``); otherwise it is the directory
    that contains this source file.
    """
    frozen_base = getattr(sys, '_MEIPASS', None)
    if frozen_base is None:
        frozen_base = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(frozen_base, relative_path)
class InternshipRecommender:
    """Content-based recommender over an Excel sheet of internship postings.

    Builds a TF-IDF matrix from jieba-segmented Chinese text at load time;
    ``recommend()`` applies hard filters then ranks by cosine similarity
    (plus a keyword-in-title boost).
    """

    def __init__(self, data_path=None):
        # Default to the bundled data.xlsx (also works under PyInstaller).
        self.data_path = data_path or resource_path('data.xlsx')
        self.df = None
        self.vectorizer = None
        self.tfidf_matrix = None
        self.load_data()

    @staticmethod
    def _clean_detail(text):
        """Turn a list-like job_detail string ("['a', 'b']") into readable lines."""
        if isinstance(text, str):
            return (text.strip("[]'\"\r\n ")
                        .replace("', '", "\n")
                        .replace("', \"", "\n")
                        .replace("\", '", "\n")
                        .replace("\\n", "\n"))
        return text

    def load_data(self):
        """Read the Excel file and build the TF-IDF model and derived columns.

        On any failure the recommender is left in an unloaded state
        (``self.df is None``) so ``recommend()`` safely returns ``[]``.
        """
        if not os.path.exists(self.data_path):
            print(f"Error: {self.data_path} not found.")
            return
        try:
            self.df = pd.read_excel(self.data_path)
            self.df = self.df.fillna('')

            # Combined free-text field used as the recommendation corpus.
            self.df['search_text'] = (
                self.df['job_title'] + " " +
                self.df['tag'] + " " +
                self.df['industry'] + " " +
                self.df['job_detail'] + " " +
                self.df['city']
            )

            # Pre-segment with jieba so TF-IDF tokenizes on Chinese words.
            print("Processing text data... (this might take a moment)")
            self.df['search_text_cut'] = self.df['search_text'].apply(
                lambda x: " ".join(jieba.cut(str(x))))

            self.df['job_detail_clean'] = self.df['job_detail'].apply(self._clean_detail)

            print("Vectorizing data...")
            self.vectorizer = TfidfVectorizer(max_features=5000)  # cap vocab for speed
            self.tfidf_matrix = self.vectorizer.fit_transform(self.df['search_text_cut'])

            # Stable per-row id: content hash of title + company + city.
            # (A previous variant also hashed 'processed_date', a column that
            # may not exist in the sheet — removed as dead code.)
            self.df['job_id'] = self.df.apply(lambda x: hashlib.md5(
                (str(x['job_title']) + str(x['com_fullname']) + str(x['city'])).encode('utf-8')
            ).hexdigest(), axis=1)
            print("Data loaded and model ready.")
        except Exception as e:
            # Best-effort load: report and stay unloaded rather than crash the UI.
            print(f"Error loading data: {e}")
            self.df = None
            self.vectorizer = None
            self.tfidf_matrix = None

    def recommend(self, query, filters=None, top_n=10):
        """Return up to *top_n* postings as a list of dicts.

        Parameters:
            query:   free-text query; empty/None skips similarity ranking.
            filters: optional dict with keys 'city', 'days_per_week',
                     'degree', 'min_wage' (all optional; unparseable numeric
                     values are ignored instead of raising).
            top_n:   maximum number of results.
        """
        import re  # hoisted out of the per-row helpers; local to keep module deps unchanged

        if self.df is None or self.vectorizer is None:
            return []

        # --- 1. Hard filters ------------------------------------------------
        idx = self.df.index
        if filters:
            if filters.get('city'):
                idx = idx[self.df.loc[idx, 'city'].str.contains(
                    filters['city'], na=False)]

            if filters.get('days_per_week'):
                try:
                    req_days = int(filters['days_per_week'])
                except (TypeError, ValueError):
                    req_days = None  # unparseable filter value: skip this filter
                if req_days is not None:
                    def check_days(val):
                        # 'day_per_week' looks like "4天/周"; keep jobs that
                        # require no more days than the user has available.
                        m = re.search(r'(\d+)', str(val))
                        return bool(m) and int(m.group(1)) <= req_days
                    idx = idx[self.df.loc[idx, 'day_per_week'].apply(check_days)]

            if filters.get('degree'):
                # 'job_academic' contains values like '本科', '硕士'.
                idx = idx[self.df.loc[idx, 'job_academic'].str.contains(
                    filters['degree'], na=False)]

            if filters.get('min_wage'):
                try:
                    min_w = int(filters['min_wage'])
                except (TypeError, ValueError):
                    min_w = None  # unparseable filter value: skip this filter
                if min_w is not None:
                    def check_wage(val):
                        # 'wage' looks like "150-200/天"; compare the upper bound.
                        nums = re.findall(r'(\d+)', str(val))
                        return bool(nums) and int(nums[-1]) >= min_w
                    idx = idx[self.df.loc[idx, 'wage'].apply(check_wage)]

        if len(idx) == 0:
            return []

        # --- 2. Content-based ranking ---------------------------------------
        results = self.df.loc[idx].copy()
        if query:
            query_vec = self.vectorizer.transform([" ".join(jieba.cut(query))])
            cosine_sim = cosine_similarity(query_vec, self.tfidf_matrix).flatten()
            # cosine_sim is positional (row order of tfidf_matrix); map index
            # labels to positions explicitly so a non-default DataFrame index
            # cannot silently mis-align scores (the old code indexed the numpy
            # array directly with labels).
            results['similarity_score'] = cosine_sim[self.df.index.get_indexer(idx)]

            # Heuristic boost: +0.5 for each query keyword found in the title,
            # so exact keyword matches outrank purely-similar postings.
            keywords = [k.lower() for k in query.split()]
            results['boost_score'] = results['job_title'].apply(
                lambda title: sum(0.5 for k in keywords if k in str(title).lower()))
            results['final_score'] = results['similarity_score'] + results['boost_score']
            results = results.sort_values(by='final_score', ascending=False)
        else:
            # No query: return the filtered rows with neutral scores.
            results['similarity_score'] = 0
            results['final_score'] = 0

        results['job_detail'] = results['job_detail_clean']  # surface the cleaned text
        return results.head(top_n).to_dict('records')
if __name__ == "__main__":
    # Manual smoke test: load the bundled data and run a single query.
    recommender = InternshipRecommender()
    print("Testing query 'python 实习' in '北京'...")
    hits = recommender.recommend("python 数据分析", filters={'city': '北京'})
    for hit in hits:
        print(f"[{hit['similarity_score']:.2f}] {hit['job_title']} - {hit['com_fullname']} ({hit['city']})")