Skip to content

Commit 89d843e

Browse files
committed
__other__
1 parent d7f77f5 commit 89d843e

1 file changed

Lines changed: 45 additions & 0 deletions

File tree

  • __other__/pubmed - download and textract
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# author: Bartlomiej "furas" Burek (https://blog.furas.pl)
2+
# date: 2022.07.19
3+
# [python - getting weird results from metapub and pubmed - Stack Overflow](https://stackoverflow.com/questions/73043246/getting-weird-results-from-metapub-and-pubmed/)
4+
5+
import os
6+
from urllib.request import urlretrieve
7+
import metapub
8+
import textract
9+
10+
# Directory that receives the downloaded PDFs and the extracted text files.
# (alternative location: '/content/Articles/')
another_path = './'

# PubMed IDs of the articles to fetch.
pmid_list = ['35566889', '33538053', '30848212']

# For every PMID: look up a full-text PDF URL with metapub, download the PDF,
# extract its text with textract (pdftotext backend) and save the text next to
# the PDF.  Each PMID is handled best-effort: a failure is printed and the loop
# moves on to the next PMID.
for query in pmid_list:

    print('query:', query)

    try:
        # The lookup itself can fail (network problem, unknown PMID), so it
        # lives inside the `try` - one bad PMID must not abort the whole batch.
        source = metapub.FindIt(query)
        url = source.url
        print('url:', url)

        if not url:
            # FindIt stores why no URL could be resolved; show it instead of
            # skipping silently.
            print('... skipped:', getattr(source, 'reason', None))
            continue

        out_file = os.path.join(another_path, query)
        print('out_file:', out_file)

        pdf_path = out_file + '.pdf'
        txt_path = out_file + '.txt'

        print('... downloading')
        urlretrieve(url, pdf_path)

        print('... processing')
        data = textract.process(pdf_path, extension='pdf', method='pdftotext', encoding="utf_8")

        print('... saving')
        # textract.process() returns bytes, hence binary mode.
        with open(txt_path, "wb") as textfile:
            textfile.write(data)

        print('... OK')

    except Exception as ex:
        # Per-item boundary: report the problem and continue with the next PMID.
        print('Exception:', ex)
45+

0 commit comments

Comments
 (0)