Skip to content

Commit 7944538

Browse files
authored
Update scrap_word_count_image.py
1 parent b1df7ee commit 7944538

File tree

1 file changed

+17 -7 lines changed

1 file changed

+17 -7 lines changed

scrap_word_count_image.py

Lines changed: 17 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -6,25 +6,35 @@
66
from collections import Counter
77
from wordcloud import WordCloud
88
import matplotlib.pyplot as plt
9-
9+
import nltk
10+
from nltk.corpus import stopwords
11+
import re
12+
from collections import Counter
1013

1114
# 스크래핑
1215
url = "https://healthguides.cnn.com/finding-the-right-migraine-treatment/how-to-ensure-your-doctor-understands-your-migraine-severity"
1316
webpage = requests.get(url)
1417
soup = BeautifulSoup(webpage.content, "html.parser")
1518
item = soup.select_one("article.ArticlePage-mainContent")
16-
data = item.text
19+
data = str(item.text).lower()
1720

1821
# 단어 분리
19-
word = str(data).lower().strip().split()
20-
count = Counter(word).most_common()
21-
words = dict(count)
22-
# print(words)
22+
nltk.download('all')
23+
cleaned_data = re.sub(r'[^\.\?\!\w\d\s]','',data)
24+
words = nltk.word_tokenize(cleaned_data)
25+
tokens_pos = nltk.pos_tag(words)
26+
27+
NN_words = []
28+
for word, pos in tokens_pos:
29+
if 'NN' in pos:
30+
NN_words.append(word)
31+
c = Counter(NN_words)
2332

2433
# 워드클라우드
2534
wordcloud = WordCloud(font_path='C:\\Windows\\Fonts\\Gothic.ttf',
26-
background_color="white", max_font_size=100).generate_from_frequencies(words)
35+
background_color="white", max_font_size=100).generate_from_frequencies(c)
2736

2837
plt.imshow(wordcloud, interpolation='bilinear')
2938
plt.axis('off')
3039
plt.show()
40+
wordcloud.to_file("wordcloud.png")

0 commit comments

Comments (0)