Skip to content

Commit db7b0b1

Browse files
committed
Compute related papers for each paper.
1 parent 00c358a commit db7b0b1

File tree

2 files changed

+76
-1
lines changed

2 files changed

+76
-1
lines changed

.github/workflows/deploy.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,9 @@ jobs:
2828
python ${{ github.workspace }}/etc/compute_embeddings.py ${{ github.workspace }}/_site/paper-abstracts.json ${{ github.workspace }}/_site/tsne.json
2929
- name: Compute topics
3030
run: |
31-
python -m pip install nltk gensim
31+
python -m pip install nltk gensim scipy
3232
python ${{ github.workspace }}/etc/compute_topics.py ${{ github.workspace }}/_site/paper-abstracts.json ${{ github.workspace }}/_site/topics.json
33+
python ${{ github.workspace }}/etc/compute_related.py ${{ github.workspace }}/_site/paper-abstracts.json ${{ github.workspace }}/_site/publications-metadata/
3334
- name: Deploy
3435
uses: peaceiris/actions-gh-pages@v3
3536
with:

etc/compute_related.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
import argparse
import json
import os

import numpy as np
import scipy
# `import scipy` alone does not guarantee the `scipy.spatial` submodule is
# loaded; import it explicitly since cdist is used below.
import scipy.spatial.distance

import nltk

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from gensim.models import TfidfModel
from gensim.corpora import Dictionary

def parse_arguments():
    """Parse command-line arguments for the related-papers computation.

    Returns:
        argparse.Namespace with attributes ``json`` (path of the input
        papers JSON), ``outdir`` (target output directory) and
        ``num_relwork`` (int, number of related papers, default 4).
    """
    # Description previously read "TSNE Visualization of Papers in ML4Code" —
    # a copy-paste leftover from the embeddings script; corrected here.
    parser = argparse.ArgumentParser(description="Compute related papers for each paper in ML4Code")

    # `default=` is ignored for required positional arguments, so the stray
    # `default=False` values were dropped.
    parser.add_argument("json", help="the path of the json file containing all papers.")
    parser.add_argument("outdir", help="the target directory for the per-paper related-work files.")
    parser.add_argument("--num-relwork", default=4, help="Number of related work entries per paper.", type=int)
    return parser.parse_args()
28+
29+
30+
if __name__ == "__main__":
    args = parse_arguments()
    num_relworks = args.num_relwork

    # Load the list of paper records; each record is assumed to carry
    # "key", "title", "abstract" and "tags" fields (see loop below).
    with open(args.json) as f:
        data = json.load(f)

    print(f"Num papers: {len(data)}")

    lemmatizer = WordNetLemmatizer()
    # Renamed from `stopwords` — the original rebinding shadowed the imported
    # nltk.corpus.stopwords module, which is fragile and confusing.
    stop_words = set(stopwords.words('english'))
    stop_words.update(["one", "two", "using"])

    tokens_per_paper = []
    keys = []

    for paper_info in data:
        keys.append(paper_info["key"])
        # Concatenate title, abstract (HTML paragraph tags stripped) and tags.
        # Fix: a separating space is inserted before the joined tags so the
        # abstract's last word no longer fuses with the first tag into one
        # bogus token.
        text = (paper_info["title"] + " "
                + paper_info["abstract"].replace("<p>", " ").replace("</p>", " ")
                + " " + " ".join(paper_info["tags"]))
        # Keep alphabetic, non-stopword tokens; lemmatize, then lowercase.
        lemmatized_tokens = [
            lemmatizer.lemmatize(w).lower()
            for w in nltk.word_tokenize(text)
            if w.lower() not in stop_words and w.isalpha()
        ]
        tokens_per_paper.append(lemmatized_tokens)

    # Bag-of-words vocabulary, dropping very rare (<2 docs) and very common
    # (>50% of docs) terms.
    dictionary = Dictionary(tokens_per_paper)
    dictionary.filter_extremes(no_below=2, no_above=0.5)

    corpus = [dictionary.doc2bow(tokens) for tokens in tokens_per_paper]
    model = TfidfModel(corpus)

    # Densify the sparse tf-idf vectors into one preallocated matrix so we
    # can hand it to cdist below (avoids building a Python list of arrays).
    tf_idf_vectors = np.zeros((len(corpus), len(dictionary)), dtype=np.float64)
    for row, bow in enumerate(corpus):
        for term_id, weight in model[bow]:
            tf_idf_vectors[row, term_id] = weight

    # Pairwise cosine distances; column 0 of the argsort is the paper itself
    # (distance 0), so the related papers are columns 1..num_relworks.
    # NOTE(review): a paper whose tokens were all filtered out yields an
    # all-zero vector and NaN cosine distances, which makes argsort
    # unreliable for that row — assumed not to occur in practice; verify.
    distances = scipy.spatial.distance.cdist(tf_idf_vectors, tf_idf_vectors, metric='cosine')
    sorted_idxs = np.argsort(distances, axis=-1)[:, 1:num_relworks + 1]

    # Emit one JSON file per paper listing the keys of its related papers.
    os.makedirs(args.outdir, exist_ok=True)
    for i, key in enumerate(keys):
        with open(os.path.join(args.outdir, key + ".json"), "w") as f:
            json.dump([keys[j] for j in sorted_idxs[i]], f)

0 commit comments

Comments
 (0)