# recommender.py — content-based internship recommender (TF-IDF over jieba-segmented text).
import pandas as pd
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import hashlib
import sys
def resource_path(relative_path):
    """Resolve *relative_path* against the application base directory.

    When running from a PyInstaller bundle, the base is the temporary
    extraction directory (``sys._MEIPASS``); otherwise it is the directory
    that contains this source file.
    """
    frozen_base = getattr(sys, '_MEIPASS', None)
    if frozen_base is None:
        frozen_base = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(frozen_base, relative_path)
class InternshipRecommender:
    """Content-based recommender over an Excel sheet of internship postings.

    Builds a TF-IDF matrix from jieba-segmented Chinese text at load time;
    ``recommend()`` applies hard filters then ranks by cosine similarity
    (plus a keyword-in-title boost).
    """

    def __init__(self, data_path=None):
        # Default to the bundled data.xlsx (also works under PyInstaller).
        self.data_path = data_path or resource_path('data.xlsx')
        self.df = None
        self.vectorizer = None
        self.tfidf_matrix = None
        self.load_data()

    @staticmethod
    def _clean_detail(text):
        """Turn a list-like job_detail string ("['a', 'b']") into readable lines."""
        if isinstance(text, str):
            return (text.strip("[]'\"\r\n ")
                        .replace("', '", "\n")
                        .replace("', \"", "\n")
                        .replace("\", '", "\n")
                        .replace("\\n", "\n"))
        return text

    def load_data(self):
        """Read the Excel file and build the TF-IDF model and derived columns.

        On any failure the recommender is left in an unloaded state
        (``self.df is None``) so ``recommend()`` safely returns ``[]``.
        """
        if not os.path.exists(self.data_path):
            print(f"Error: {self.data_path} not found.")
            return
        try:
            self.df = pd.read_excel(self.data_path)
            self.df = self.df.fillna('')

            # Combined free-text field used as the recommendation corpus.
            self.df['search_text'] = (
                self.df['job_title'] + " " +
                self.df['tag'] + " " +
                self.df['industry'] + " " +
                self.df['job_detail'] + " " +
                self.df['city']
            )

            # Pre-segment with jieba so TF-IDF tokenizes on Chinese words.
            print("Processing text data... (this might take a moment)")
            self.df['search_text_cut'] = self.df['search_text'].apply(
                lambda x: " ".join(jieba.cut(str(x))))

            self.df['job_detail_clean'] = self.df['job_detail'].apply(self._clean_detail)

            print("Vectorizing data...")
            self.vectorizer = TfidfVectorizer(max_features=5000)  # cap vocab for speed
            self.tfidf_matrix = self.vectorizer.fit_transform(self.df['search_text_cut'])

            # Stable per-row id: content hash of title + company + city.
            # (A previous variant also hashed 'processed_date', a column that
            # may not exist in the sheet — removed as dead code.)
            self.df['job_id'] = self.df.apply(lambda x: hashlib.md5(
                (str(x['job_title']) + str(x['com_fullname']) + str(x['city'])).encode('utf-8')
            ).hexdigest(), axis=1)
            print("Data loaded and model ready.")
        except Exception as e:
            # Best-effort load: report and stay unloaded rather than crash the UI.
            print(f"Error loading data: {e}")
            self.df = None
            self.vectorizer = None
            self.tfidf_matrix = None

    def recommend(self, query, filters=None, top_n=10):
        """Return up to *top_n* postings as a list of dicts.

        Parameters:
            query:   free-text query; empty/None skips similarity ranking.
            filters: optional dict with keys 'city', 'days_per_week',
                     'degree', 'min_wage' (all optional; unparseable numeric
                     values are ignored instead of raising).
            top_n:   maximum number of results.
        """
        import re  # hoisted out of the per-row helpers; local to keep module deps unchanged

        if self.df is None or self.vectorizer is None:
            return []

        # --- 1. Hard filters ------------------------------------------------
        idx = self.df.index
        if filters:
            if filters.get('city'):
                idx = idx[self.df.loc[idx, 'city'].str.contains(
                    filters['city'], na=False)]

            if filters.get('days_per_week'):
                try:
                    req_days = int(filters['days_per_week'])
                except (TypeError, ValueError):
                    req_days = None  # unparseable filter value: skip this filter
                if req_days is not None:
                    def check_days(val):
                        # 'day_per_week' looks like "4天/周"; keep jobs that
                        # require no more days than the user has available.
                        m = re.search(r'(\d+)', str(val))
                        return bool(m) and int(m.group(1)) <= req_days
                    idx = idx[self.df.loc[idx, 'day_per_week'].apply(check_days)]

            if filters.get('degree'):
                # 'job_academic' contains values like '本科', '硕士'.
                idx = idx[self.df.loc[idx, 'job_academic'].str.contains(
                    filters['degree'], na=False)]

            if filters.get('min_wage'):
                try:
                    min_w = int(filters['min_wage'])
                except (TypeError, ValueError):
                    min_w = None  # unparseable filter value: skip this filter
                if min_w is not None:
                    def check_wage(val):
                        # 'wage' looks like "150-200/天"; compare the upper bound.
                        nums = re.findall(r'(\d+)', str(val))
                        return bool(nums) and int(nums[-1]) >= min_w
                    idx = idx[self.df.loc[idx, 'wage'].apply(check_wage)]

        if len(idx) == 0:
            return []

        # --- 2. Content-based ranking ---------------------------------------
        results = self.df.loc[idx].copy()
        if query:
            query_vec = self.vectorizer.transform([" ".join(jieba.cut(query))])
            cosine_sim = cosine_similarity(query_vec, self.tfidf_matrix).flatten()
            # cosine_sim is positional (row order of tfidf_matrix); map index
            # labels to positions explicitly so a non-default DataFrame index
            # cannot silently mis-align scores (the old code indexed the numpy
            # array directly with labels).
            results['similarity_score'] = cosine_sim[self.df.index.get_indexer(idx)]

            # Heuristic boost: +0.5 for each query keyword found in the title,
            # so exact keyword matches outrank purely-similar postings.
            keywords = [k.lower() for k in query.split()]
            results['boost_score'] = results['job_title'].apply(
                lambda title: sum(0.5 for k in keywords if k in str(title).lower()))
            results['final_score'] = results['similarity_score'] + results['boost_score']
            results = results.sort_values(by='final_score', ascending=False)
        else:
            # No query: return the filtered rows with neutral scores.
            results['similarity_score'] = 0
            results['final_score'] = 0

        results['job_detail'] = results['job_detail_clean']  # surface the cleaned text
        return results.head(top_n).to_dict('records')
if __name__ == "__main__":
    # Manual smoke test: load the bundled data and run a single query.
    recommender = InternshipRecommender()
    print("Testing query 'python 实习' in '北京'...")
    hits = recommender.recommend("python 数据分析", filters={'city': '北京'})
    for hit in hits:
        print(f"[{hit['similarity_score']:.2f}] {hit['job_title']} - {hit['com_fullname']} ({hit['city']})")