import argparse
import json
import os

import nltk
import numpy as np
import scipy
import scipy.spatial  # required: `import scipy` alone does not load the `spatial` subpackage

from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data required at runtime (no-op if already present):
# 'stopwords' for filtering, 'wordnet' for lemmatization, 'punkt' for tokenization.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
19+
20+
def parse_arguments():
    """Parse the command-line arguments for the related-work generator.

    Returns:
        argparse.Namespace with:
            json (str): path to the JSON file containing all papers.
            outdir (str): target directory for the per-paper output files.
            num_relwork (int): number of related works per paper (default 4).
    """
    parser = argparse.ArgumentParser(description="TSNE Visualization of Papers in ML4Code")

    # Positional arguments are always required; the original `default=False`
    # values were silently ignored by argparse, so they are dropped here.
    parser.add_argument("json", help="the path the json containing all papers.")
    parser.add_argument("outdir", help="the target path of the visualizations papers.")
    parser.add_argument("--num-relwork", default=4, type=int,
                        help="Number of related work per paper.")
    return parser.parse_args()
28+
29+
if __name__ == "__main__":
    args = parse_arguments()
    num_relworks = args.num_relwork

    # Load the list of paper records; each record is expected to provide
    # "key", "title", "abstract", and "tags" (assumed — TODO confirm schema).
    with open(args.json, encoding="utf-8") as f:
        data = json.load(f)

    print(f"Num papers: {len(data)}")

    lemmatizer = WordNetLemmatizer()
    # Named `stop_words` to avoid shadowing the imported `stopwords` corpus module.
    stop_words = set(stopwords.words('english'))
    stop_words.update(["one", "two", "using"])

    tokens_per_paper = []
    keys = []

    for paper_info in data:
        keys.append(paper_info["key"])
        # Join title, abstract (paragraph markup stripped), and tags with spaces.
        # The original concatenated the abstract and the first tag without a
        # separator, fusing two unrelated words into a single bogus token.
        abstract = paper_info["abstract"].replace("<p>", " ").replace("</p>", " ")
        text = " ".join([paper_info["title"], abstract, *paper_info["tags"]])
        # Keep lower-cased, lemmatized, purely-alphabetic, non-stopword tokens.
        lemmatized_tokens = [
            lemmatizer.lemmatize(w).lower()
            for w in nltk.word_tokenize(text)
            if w.lower() not in stop_words and w.isalpha()
        ]
        tokens_per_paper.append(lemmatized_tokens)

    # Vocabulary over all papers; drop terms in <2 documents or >50% of documents.
    dictionary = Dictionary(tokens_per_paper)
    dictionary.filter_extremes(no_below=2, no_above=0.5)

    corpus = [dictionary.doc2bow(tokens) for tokens in tokens_per_paper]
    model = TfidfModel(corpus)

    # Densify the sparse TF-IDF vectors into one (num_papers, vocab) matrix
    # so that cdist below can consume them.
    tf_idf_vectors = np.zeros((len(corpus), len(dictionary)), dtype=np.float64)
    for row, bow in enumerate(corpus):
        for term_id, weight in model[bow]:
            tf_idf_vectors[row, term_id] = weight

    # Pairwise cosine distances; column 0 of the argsort is each paper itself
    # (distance 0), so skip it and keep the next `num_relworks` neighbours.
    distances = scipy.spatial.distance.cdist(tf_idf_vectors, tf_idf_vectors, metric='cosine')
    sorted_idxs = np.argsort(distances, axis=-1)[:, 1:num_relworks + 1]

    # Write one JSON file per paper containing the keys of its related works.
    os.makedirs(args.outdir, exist_ok=True)
    for i, key in enumerate(keys):
        with open(os.path.join(args.outdir, key + ".json"), "w") as f:
            json.dump([keys[j] for j in sorted_idxs[i]], f)
73+
74+