From 48024add6729ea2095eb34979163319f3c89794c Mon Sep 17 00:00:00 2001 From: kyh1126 Date: Sat, 23 Sep 2017 22:44:20 +0900 Subject: [PATCH 01/18] =?UTF-8?q?=EB=A6=AC=ED=8C=A9=ED=86=A0=EB=A7=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/KeywordExtractor/SentenceManager.py | 75 +++++++++++++------------ src/test/t_SentenceManager.py | 19 +++++-- 2 files changed, 53 insertions(+), 41 deletions(-) diff --git a/src/KeywordExtractor/SentenceManager.py b/src/KeywordExtractor/SentenceManager.py index 39fba27..1ff7ade 100644 --- a/src/KeywordExtractor/SentenceManager.py +++ b/src/KeywordExtractor/SentenceManager.py @@ -11,39 +11,42 @@ from konlpy.utils import pprint class SentenceManager() : - def run(self, pos) : #문장 처리 수행 - slist = [] - try : - slist = self.scd_parser('DOC1.SCD') # scd에서 문장 추출, #id, date는 별도 저장 - pos = self.pos_tagger(slist[0:10000]) # 문장 형태소 분석하여 명사 추출 - - return 1 - except : - return 0 - - - def scd_parser(self, scd) : - sentence = open(scd,'r').readlines() - slist = [] - - for i in sentence : - if i.startswith('') : - slist.append(i.replace('','')) - - return slist - - def pos_tagger(self, slist) : - twitter = Twitter("C:/Program Files/Java/jdk1.7.0_55/jre/bin/server/jvm.dll") - imsi = [] - - for one in slist : - imsi.append(twitter.nouns(one)) - - return imsi - - def id_manager(self) : - return - - def indexer(self) : - return - + + def run(self, pos, slist) : #문장 처리 수행 + path = "C:/Program Files/Java/jdk1.7.0_55/jre/bin/server/jvm.dll" + try : + self.scd_parser('DOC1.SCD', slist) # scd에서 문장 추출, #id, date는 별도 저장 + self.pos_tagger(slist[0:2], pos, path) # 문장 형태소 분석하여 명사 추출 + + return 1 + except : + return 0 + + + def scd_parser(self, scd, slist) : + sentence = open(scd,'r') + try: + for i in sentence : + if i.startswith('') : + slist.append(i[9:]) + # slist.append(i.replace('','')) + return 1 + except Exception as ex : + print(ex) + return 0 + + def pos_tagger(self, slist, pos, path) : + twitter = Twitter(path) + + try: + for one in slist : + pos.append(twitter.nouns(one)) + return 1 + except Exception as ex : + print(ex) + return 0 + + # def id_manager(self) : + # return + + # def indexer(self) : \ No newline at end of file diff --git a/src/test/t_SentenceManager.py b/src/test/t_SentenceManager.py index 43dea5d..16c296b 100644 --- a/src/test/t_SentenceManager.py +++ b/src/test/t_SentenceManager.py @@ -1,6 +1,6 @@ ''' @author: 신승식, 김윤희 -@date: 2017.9.19 +@date: 2017.9.23 @version: 1.0.1 @brief: sentence manager @@ -10,13 +10,22 @@ ''' import time import SentenceManager - # 클래스 호출 p = SentenceManager.SentenceManager() pos = [] - +slist = [] +path = "C:/Program Files/Java/jdk1.7.0_55/jre/bin/server/jvm.dll" start = time.time() -print(p.run(pos)) +# print(p.run(pos, slist)) +try: + print(p.scd_parser('DOC.SCD', slist)) +except Exception as ex: + print(ex) + +try: + print(p.pos_tagger('청와대 수석-보좌관 10명도 24일 재산을 신고했다. 이들의 평균재산은 12억8800만원이었다.', pos, path)) +except Exception as ex: + print(ex) end = time.time() - start -print(" end: [" + str(round(end,2)) + " sec]") +print(" end: [" + str(round(end,2)) + " sec]") \ No newline at end of file From 8118716c26a482c555b5326e4039b0f3b1765224 Mon Sep 17 00:00:00 2001 From: HeoJeaHyuk <31958876+HeoJeaHyuk@users.noreply.github.com> Date: Sat, 23 Sep 2017 22:46:23 +0900 Subject: [PATCH 02/18] =?UTF-8?q?main=20class=20=EC=88=98=EC=A0=95=20/=20?= =?UTF-8?q?=EC=84=A4=EB=AA=85=EC=B6=94=EA=B0=80,=20run=20time=20=EC=B6=9C?= =?UTF-8?q?=EB=A0=A5=20,=20=EC=B6=9C=EB=A0=A5=EA=B2=B0=EA=B3=BC=20?= =?UTF-8?q?=ED=8C=8C=EC=9D=BC=EC=A0=80=EC=9E=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/KeywordExtractor/KE.py | 50 +++++++++++++++++++++++++++++++------- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/src/KeywordExtractor/KE.py b/src/KeywordExtractor/KE.py index cdd9c5a..9a8bfbe 100644 --- a/src/KeywordExtractor/KE.py +++ b/src/KeywordExtractor/KE.py @@ -1,10 +1,16 @@ -# @author : 허재혁 -# @date : 2017. 9. -# @version : 3.5.2 -# -# @brief : +''' +@author : 김문종, 허재혁 +@date : 2017. 9. +@version : 3.5.2 +@brief : 키워드 추출 및 문장 처리 main class +- 파라미터 : pos(리스트), keyword(리스트), r_keyword(리스트) +- sm 클래스에 pos를 넘겨서 추출된 키워드를 넘겨받는다. + sm 클래스에서 받은 pos와 비어있는 keyword 리스트를 ka 클래스에 넘긴다. + ka 클래스에서 받은 pos와 sorting된 keyword 리스트를 출력. +''' +import time import SentenceManager as sm import KeywordAnalyzer as ka @@ -22,13 +28,13 @@ def __init__(self): def run(self): - sm.ret = self.sm.run(self.pos) - if sm.ret == 1 : + ret = self.sm.run(self.pos) + if ret is 1: print("SM complete") else : print("SM error") - ka.ret = self.ka.run(self.pos,self.keyword) - if ka.ret == 1 : + ret = self.ka.run(self.pos,self.keyword) + if ret is 1 : print("KA complete") else : print("KA error") @@ -38,9 +44,35 @@ def run(self): # sm.indexer(keyword,index) # ka.relational_keyword_analyzer(keyword,r_keyword) + def print_pos(self): + print(self.pos) + + def print_keyword(self): + print(self.keyword) + + def write_pos(self): + f = open("pos_result.txt", "w") + for sentence in self.pos: + f.write(str(sentence)) + + def write_keyword(self): + f = open("keyword_result.txt","w") + for sentence in self.keyword: + f.write(str(sentence)) if __name__ == "__main__" : + start = time.time() ke = Keyword_Ext() ke.run() + end = time.time() - start + print("complete: [" + str(round(end, 2)) + "second ]") + + # pos keyword 출력 + # ke.print_pos() + # ke.print_keyword() + + # file write + ke.write_pos() + ke.write_keyword() From 92c9f2afad446e0ddd2ef81fb1c6fd3c20885a24 Mon Sep 17 00:00:00 2001 From: kyh1126 Date: Sat, 23 Sep 2017 22:47:24 +0900 Subject: [PATCH 03/18] =?UTF-8?q?=ED=85=8C=EC=8A=A4=ED=8A=B8=EC=BD=94?= =?UTF-8?q?=EB=93=9C=20=EB=8B=A4=EC=8B=9C=EC=98=AC=EB=A6=BC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/test/t_SentenceManager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/t_SentenceManager.py b/src/test/t_SentenceManager.py index 16c296b..b3edc6d 100644 --- a/src/test/t_SentenceManager.py +++ b/src/test/t_SentenceManager.py @@ -18,7 +18,7 @@ start = time.time() # print(p.run(pos, slist)) try: - print(p.scd_parser('DOC.SCD', slist)) + print(p.scd_parser('DOC1.SCD', slist)) except Exception as ex: print(ex) From ee369f6c7d2a904fcf06b5f8f4c483ed06a468c3 Mon Sep 17 00:00:00 2001 From: penbell Date: Sat, 23 Sep 2017 23:24:04 +0900 Subject: [PATCH 04/18] =?UTF-8?q?sm,=20ka=20=ED=81=B4=EB=9E=98=EC=8A=A4=20?= =?UTF-8?q?=EC=98=A4=EB=A5=98=20=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/KeywordExtractor/KeywordAnalyzer.py | 8 ++++---- src/KeywordExtractor/SentenceManager.py | 5 +++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/KeywordExtractor/KeywordAnalyzer.py b/src/KeywordExtractor/KeywordAnalyzer.py index 596cca7..ba242ef 100644 --- a/src/KeywordExtractor/KeywordAnalyzer.py +++ b/src/KeywordExtractor/KeywordAnalyzer.py @@ -8,7 +8,7 @@ - check tf idf tfidf ''' -from operator +import operator from collections import Counter # class KeywordAnalyzer() : @@ -22,11 +22,11 @@ def run(self, list1, keyword) : # 모든 함수를 실행 시키는 함수. self.tfidf_calc(keyword) self.reset() - return 0 + return 1 except : - return 1 + return 0 def tf_calc(self, list1) : # tf를 계산하는 함수. @@ -71,7 +71,7 @@ def tfidf_calc(self, keyword) : for j in self._tf[i].keys() : each[j] = self._tf[i][j] / self._df[j] - keyword += [ sorted(each.items(), key=operator.itemgetter(1), reverce=True) ] + keyword += [ sorted(each.items(), key=operator.itemgetter(1), reverse=True) ] def reset(self) : self._tf = [] \ No newline at end of file diff --git a/src/KeywordExtractor/SentenceManager.py b/src/KeywordExtractor/SentenceManager.py index 1ff7ade..c542246 100644 --- a/src/KeywordExtractor/SentenceManager.py +++ b/src/KeywordExtractor/SentenceManager.py @@ -12,11 +12,12 @@ class SentenceManager() : - def run(self, pos, slist) : #문장 처리 수행 + def run(self, pos) : #문장 처리 수행 path = "C:/Program Files/Java/jdk1.7.0_55/jre/bin/server/jvm.dll" + slist = [] try : self.scd_parser('DOC1.SCD', slist) # scd에서 문장 추출, #id, date는 별도 저장 - self.pos_tagger(slist[0:2], pos, path) # 문장 형태소 분석하여 명사 추출 + self.pos_tagger(slist[0:10000], pos, path) # 문장 형태소 분석하여 명사 추출 return 1 except : From 4d744b71d3d850973e0c97db81dde0a6f13908ea Mon Sep 17 00:00:00 2001 From: "DESKTOP-T5K033P\\foryo" Date: Sat, 23 Sep 2017 23:34:51 +0900 Subject: [PATCH 05/18] test commit --- Readme.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Readme.txt b/Readme.txt index e69de29..30d74d2 100644 --- a/Readme.txt +++ b/Readme.txt @@ -0,0 +1 @@ +test \ No newline at end of file From 6be3bfa66063d92acbef764f5cfd6033f126d893 Mon Sep 17 00:00:00 2001 From: "DESKTOP-T5K033P\\foryo" Date: Sat, 23 Sep 2017 23:37:28 +0900 Subject: [PATCH 06/18] test --- Readme.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/Readme.txt b/Readme.txt index 30d74d2..e69de29 100644 --- a/Readme.txt +++ b/Readme.txt @@ -1 +0,0 @@ -test \ No newline at end of file From f0eb5c313627df69bb53659aa8dcd36da399d854 Mon Sep 17 00:00:00 2001 From: penbell Date: Thu, 28 Sep 2017 22:22:26 +0900 Subject: [PATCH 07/18] =?UTF-8?q?KE=20=ED=95=A8=EC=88=98=20=EA=B5=AC?= =?UTF-8?q?=EC=A1=B0=20=EB=B3=80=EA=B2=BD=20=EB=B0=8F=20=ED=95=A8=EC=88=98?= =?UTF-8?q?=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/KeywordExtractor/KE.py | 44 +++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/src/KeywordExtractor/KE.py b/src/KeywordExtractor/KE.py index 9a8bfbe..9f966ea 100644 --- a/src/KeywordExtractor/KE.py +++ b/src/KeywordExtractor/KE.py @@ -22,27 +22,25 @@ def __init__(self): self.pos = [] self.keyword = [] self.r_keyword = [] - # self.id = {} - # self.iid = {} - # self.index = {} + self.key_id = {} + self.key_iid = {} + self.index = {} def run(self): - ret = self.sm.run(self.pos) - if ret is 1: - print("SM complete") - else : - print("SM error") - ret = self.ka.run(self.pos,self.keyword) + self.check(self.sm.run(self.pos), 'sm.run') + self.check(self.ka.run(self.pos,self.keyword), 'ka.run') + #self.check(self.sm.id_manager(self.keyword, self.key_id, self.key_iid), 'id_manager') + #self.check(self.sm.id_indexer(self.keyword, self.index), 'indexer') + + + def check(self, ret, i) : if ret is 1 : - print("KA complete") + print("{}함수 성공!".format(i)) + else : - print("KA error") + print("{}함수 실패!".format(i)) - # def temp(self): - # sm.id_manager(keyword,id,iid) - # sm.indexer(keyword,index) - # ka.relational_keyword_analyzer(keyword,r_keyword) def print_pos(self): print(self.pos) @@ -53,26 +51,28 @@ def print_keyword(self): def write_pos(self): f = open("pos_result.txt", "w") for sentence in self.pos: - f.write(str(sentence)) + f.write("{}\n".format(str(sentence))) def write_keyword(self): f = open("keyword_result.txt","w") for sentence in self.keyword: - f.write(str(sentence)) + f.write("{}\n".format(str(sentence))) if __name__ == "__main__" : start = time.time() + ke = Keyword_Ext() ke.run() + end = time.time() - start print("complete: [" + str(round(end, 2)) + "second ]") - # pos keyword 출력 - # ke.print_pos() - # ke.print_keyword() + #pos keyword 출력 + ke.print_pos() + ke.print_keyword() # file write - ke.write_pos() - ke.write_keyword() + # ke.write_pos() + # ke.write_keyword() \ No newline at end of file From a7b96624fcb42b79a34533a60d3c2105159b2b4a Mon Sep 17 00:00:00 2001 From: "user-PC\\user" Date: Thu, 28 Sep 2017 22:25:01 +0900 Subject: [PATCH 08/18] =?UTF-8?q?normalizer=20=ED=95=A8=EC=88=98=20?= =?UTF-8?q?=EC=83=9D=EC=84=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/KeywordExtractor/KeywordAnalyzer.py | 38 ++++++++++++++++++------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/src/KeywordExtractor/KeywordAnalyzer.py b/src/KeywordExtractor/KeywordAnalyzer.py index ba242ef..e06f7e2 100644 --- a/src/KeywordExtractor/KeywordAnalyzer.py +++ b/src/KeywordExtractor/KeywordAnalyzer.py @@ -1,15 +1,20 @@ ''' @author : suhyun 이병욱 -@data : 2017-09-23 -@version : 1.0.3 +@data : 2017-09-28 +@version : 1.0.4 -@brief : check tf idf +@brief : check tf, idf, tfidf, normalizer - python version : 3.5.x -- check tf idf tfidf +- check tf, idf, tfidf, normalizer +- variables + - _tf : tf 계산된 값을 list안에 문서별로 counter된 값은 입력 + - _df : 중복된 단어를 제거하고 _df변수에 df값을 채워넣는다 + - each : tfidf가 계산된 dictionary ''' - +import time import operator from collections import Counter # +import copy class KeywordAnalyzer() : _tf = [] @@ -19,8 +24,9 @@ def run(self, list1, keyword) : # 모든 함수를 실행 시키는 함수. try : self.tf_calc(list1) self.idf_calc() - self.tfidf_calc(keyword) + self.tfidf_calc() self.reset() + self.normalizer(self.each,keyword) return 1 @@ -57,7 +63,7 @@ def idf_calc(self) : else : self._df[key] = 1 - def tfidf_calc(self, keyword) : + def tfidf_calc(self) : ''' tfidf를 계산하는 함수. 임시로 딕셔너리 받을 위치 생성(한 문서를 돌때마다 새로 생성.) 하고 @@ -66,12 +72,22 @@ def tfidf_calc(self, keyword) : for i in range(len(self._tf)) : - each = {} + self.each = {} for j in self._tf[i].keys() : - each[j] = self._tf[i][j] / self._df[j] + self.each[j] = self._tf[i][j] / self._df[j] + return self.each - keyword += [ sorted(each.items(), key=operator.itemgetter(1), reverse=True) ] def reset(self) : - self._tf = [] \ No newline at end of file + self._tf = [] + +# func normalizer # tf-idf 점수 평준화(0~100점) + def normalizer(self,dict,keyword): + max_value = max(dict.values()) + + for key in dict.keys(): + dict[key] = int((dict[key] / max_value) * 100) + + keyword += [sorted(dict.items(), key=operator.itemgetter(1), reverse=True)] + From 4248d4623ce1b00dd99ad35d76d1b0cd3f7c1e30 Mon Sep 17 00:00:00 2001 From: penbell Date: Thu, 28 Sep 2017 22:31:51 +0900 Subject: [PATCH 09/18] Merge remote-tracking branch 'origin/suhyun' # Conflicts: # src/KeywordExtractor/SentenceManager.py # src/test/t_SentenceManager.py --- .../__pycache__/KeywordAnalyzer.cpython-35.pyc | Bin 0 -> 2223 bytes .../__pycache__/SentenceManager.cpython-35.pyc | Bin 0 -> 1573 bytes 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 src/KeywordExtractor/__pycache__/KeywordAnalyzer.cpython-35.pyc create mode 100644 src/KeywordExtractor/__pycache__/SentenceManager.cpython-35.pyc diff --git a/src/KeywordExtractor/__pycache__/KeywordAnalyzer.cpython-35.pyc b/src/KeywordExtractor/__pycache__/KeywordAnalyzer.cpython-35.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ad11e6e0d4a3eff53b5035a3086a1c780dd10116 GIT binary patch literal 2223 zcmah~-ES0C6hC)nXLmbn1uY*INthTV&I21hA`Mtg4K;*d`cjj^res`aX4~CvciTI& zvTfSL*wr+a1TYkd6vc>&2@f>I1dY%B5%cO3o#7ww$=|u#E_FlVWbfH?KhBwR&hK~6 z?CeqftBE_!9Q!P_YqC)}?jE3L2AhjhdC*eH?GxNWNV+hrD7~#KwtZ#|MrcJAL%T zNpt8^Uk2rR4XelV$MPplbLbOU_Dh(%lfFA;g(a)(l`Qk9H8TglHS4*-$^6^-CN}l|9=T}u1%K{JU3%}=iq*M!1i_W&hyQyp&SC`TA?8le z0#Su%LC}&Q!4qiI)-+nsI?K9RMs%ZNwHyf{xcHZ9f-WM5QClB*D&SDEuv1YcEIDqm z>PA_VrLCE&L~Y%We&C0;9;E`mT8fNnIS7wOnW=W%zmm+s%|EAy&y3H=dc}9c@iq$M zfpo{WviqPJ%Ayo@qYmx4RCCM+db;)2z6Cx62m2+G4^0EPO7$ZnQklrum4|IpURb-co&JUmLg#H8* z#0@YP1vT@WK&WD_07?b-@Z|$44FnI|kp)s1z~R}$G}gSrNmWv8KYY1Ni#jzwexZ-^ z$_K1FKBm$H;xDw94C3kcYTMvNqaP$1&5QXKldlOtRr?>fhKojBAHe4 zWr!#R&IZ8LE7>UpXCYG#>eV9dKJzcqvEPj>L-Zy_z$Yl(qa70av_9<+#%{4!pMSL* z=j}Vx;V5$h?X=C&TTq(UwmFJ=s!OYA+GK3dR~Wr_RISyW?q>=xu19DwwPSf9Mg0s2 z)}Xz{2&QOo_7shQ3!#Fkl4eT)7iOq}?xWu2sXnzNG-f{&Xg^>yRB!$R!F7;P9pL}* z`>vI&D6r$3tH~EP^;qxXYZy2)Lo(BLi$5^{E}T(Uyc_F#c2*;*Ptc`!Ksc zWK-1^#r(eRSE>#^Vw+LcacaeB-*F<-ai;5Dqsn^Dab_FEYI{e$NEJxF#IIrcO1=e> zGnD+U9jU8GR14Ix!3F_~oJfnbmg~)>a|d#JB=6ZqlyMxd?mCX_$2{8Q)~i)~F7Ra! a+Siqjy~uZz9c;(?9%mnLFmNvRX#WCL;VE4J literal 0 HcmV?d00001 diff --git a/src/KeywordExtractor/__pycache__/SentenceManager.cpython-35.pyc b/src/KeywordExtractor/__pycache__/SentenceManager.cpython-35.pyc new file mode 100644 index 0000000000000000000000000000000000000000..94640f2282a5851f9c863a13457c0644c10e3c06 GIT binary patch literal 1573 zcmZ{k-)kF35XWclm*lHlo474?(t;ieMc|yQG$yr4Q-PfpTIv=YNGu$V%iYD6e7d{r zog7Knq2L5cl0Ts?eP|w90?nfXrGG*Hf_v?gWBEPwsoy!-iE5g)cRV}0d$aSI8NW9^ zZdxz@TK!We`kO|E3cZe*dcD1$Dw!9$d z7ni364p^%wnF+*p>`2_m%s5Ky)LBP#6)n>eKe(N#uw`28q#K!@ z1kEJE zF>Ju!6(pybatnz-f`Oli*<(Z8nWpQ=J;;ATTB_2fNphFOj!t+ErT7eb0O#`Ttl}bm zv3XCY8zTwIgzTG_7bC9GG`VsxhUZDNBIVR8dZtCOAA)%5$TUI0Ze)fN!{lXSSG%qs zr|6DM&&#)B$RchwRHR3~1uZL{6n`d~nc{czEDXgs`(QHXFq1P7Bqo4z9P%8Xo)M<# zpFNHBb>dx(uV54~U(uCVfbK3r)u>mc?ng{|>E;I#K zD~`sHqk=Oh91Xf>jBr$!N0~|PGC2j&(8H(`Ch{FNybDn& z$hX-a=R>lq5%>8+@@z42RuVo|Qc4(-ZS108+g{|_cEj9^!XWNhXvv&IqjYTOq|G3a Z+!Ohw|MQR)FAraLh0)}04D{Hf@-LT|X-WV9 literal 0 HcmV?d00001 From 492542c074f374ccbd76ef3199ebedd3088e7bff Mon Sep 17 00:00:00 2001 From: kyh1126 Date: Thu, 28 Sep 2017 22:41:06 +0900 Subject: [PATCH 10/18] =?UTF-8?q?2=EC=A3=BC=EC=B0=A8=20=EC=BB=A4=EB=B0=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/KeywordExtractor/SentenceManager.py | 56 ++++++++++++++++++------- src/test/t_SentenceManager.py | 39 ++++++++++++++--- 2 files changed, 75 insertions(+), 20 deletions(-) diff --git a/src/KeywordExtractor/SentenceManager.py b/src/KeywordExtractor/SentenceManager.py index c542246..5b0186e 100644 --- a/src/KeywordExtractor/SentenceManager.py +++ b/src/KeywordExtractor/SentenceManager.py @@ -1,36 +1,37 @@ +# - *- coding: utf- 8 - *- ''' -@author : 신승식, 김윤희 -@date : 2017. 9. 19 -@version : 1.0.0 +@author : 조수현, 김윤희 +@date : 2017. 9. 26 +@version : 2.0.0 @brief : 기능1 -- 파라미터 - scd:파일명, sentence:문장 - +- 파라미터 - scd:파일명, slist:문장, pos:명사목록, path:jvm.dll경로 + keyword:키워드,tfidf목록, id_m:키워드,채번 목록, iid_m:채번,키워드 목록 + index:키워드가 속한 문서들 목록 ''' from konlpy.tag import Twitter from konlpy.utils import pprint class SentenceManager() : - - def run(self, pos) : #문장 처리 수행 + _count = 0 + + def run(self, pos) : path = "C:/Program Files/Java/jdk1.7.0_55/jre/bin/server/jvm.dll" slist = [] try : self.scd_parser('DOC1.SCD', slist) # scd에서 문장 추출, #id, date는 별도 저장 - self.pos_tagger(slist[0:10000], pos, path) # 문장 형태소 분석하여 명사 추출 + self.pos_tagger(slist[0:10], pos, path) # 문장 형태소 분석하여 명사 추출 return 1 except : return 0 - def scd_parser(self, scd, slist) : sentence = open(scd,'r') try: for i in sentence : if i.startswith('') : - slist.append(i[9:]) - # slist.append(i.replace('','')) + slist.append(i.replace('','')) return 1 except Exception as ex : print(ex) @@ -38,8 +39,13 @@ def scd_parser(self, scd, slist) : def pos_tagger(self, slist, pos, path) : twitter = Twitter(path) - try: + # slist의 기사가 1개인경우 type이 str임으로 별도처리 + if type(slist) == str: + pos.append(twitter.nouns(slist)) + return 1 + + print("len(slist) : ",len(slist)) for one in slist : pos.append(twitter.nouns(one)) return 1 @@ -47,7 +53,27 @@ def pos_tagger(self, slist, pos, path) : print(ex) return 0 - # def id_manager(self) : - # return - # def indexer(self) : \ No newline at end of file + def id_manager(self, keyword, id_m, iid_m) : + for i in range(len(keyword)): + for j in range(len(keyword[i])): + # print(keyword[i][j][0]) + key = keyword[i][j][0] + if key in id_m: + continue + + id_m[key] = self._count + iid_m[self._count] = key + + self._count += 1 + + def indexer(self, keyword, index) : + for i in range(len(keyword)): + for j in range(len(keyword[i])): + key = keyword[i][j][0] + + if key in index: + index[key] += [i] + continue + + index[key] = [i] diff --git a/src/test/t_SentenceManager.py b/src/test/t_SentenceManager.py index b3edc6d..f005a62 100644 --- a/src/test/t_SentenceManager.py +++ b/src/test/t_SentenceManager.py @@ -1,10 +1,10 @@ +# - *- coding: utf- 8 - *- ''' -@author: 신승식, 김윤희 -@date: 2017.9.23 -@version: 1.0.1 +@author: 조수현, 김윤희 +@date: 2017.9.28 +@version: 2.0.1 @brief: sentence manager --python version:3.5.2 -Use class: SentenceManager -test parameter: arg1, arg2 ''' @@ -15,17 +15,46 @@ pos = [] slist = [] path = "C:/Program Files/Java/jdk1.7.0_55/jre/bin/server/jvm.dll" + +id_m = {} +iid_m = {} +keyword = [[('1,10,-1,10,3.4,-10.2', 1.0), ('안녕', 0.6666666666666666), ('하세요', 0.5)], [('suhyun', 1.0), ('안녕', 1.0)], [('반갑습니다.', 1.0), ('하세요', 0.5), ('안녕', 0.3333333333333333)], [('하세요', 0.25), ('안녕', 0.16666666666666666)], [('suhyun', 0.5), ('안녕', 0.5)], [('반갑습니다.', 0.5), ('하세요', 0.25), ('안녕', 0.16666666666666666)], [('하세요', 0.14285714285714285), ('안녕', 0.1111111111111111)], [('1,10,-1,10,3.4,-10.2', 0.5), ('안녕', 0.2222222222222222), ('하세요', 0.14285714285714285)], [('반갑습니다.', 0.3333333333333333), ('하세요', 0.14285714285714285), ('안녕', 0.1111111111111111)], [('하세요', 0.1111111111111111), ('안녕', 0.08333333333333333)], [('1,10,-1,10,3.4,-10.2', 0.3333333333333333), ('안녕', 0.16666666666666666), ('하세요', 0.1111111111111111)], [('suhyun', 0.3333333333333333), ('안녕', 0.25)]] +index = {} # 키워드 별 문서 id indexing + start = time.time() + # print(p.run(pos, slist)) + try: print(p.scd_parser('DOC1.SCD', slist)) + print(slist[0]) except Exception as ex: print(ex) +print("________________________________________") +print() + try: - print(p.pos_tagger('청와대 수석-보좌관 10명도 24일 재산을 신고했다. 이들의 평균재산은 12억8800만원이었다.', pos, path)) + print(p.pos_tagger(slist[0:1], pos, path)) + print(pos) except Exception as ex: print(ex) +# try: +# p.id_manager(keyword, id_m, iid_m) +# print(id_m) +# print(iid_m) +# except Exception as ex: +# print(ex) + +# print("________________________________________") +# print() + +# try: +# p.indexer(keyword, index) +# print(index) +# except Exception as ex: +# print(ex) + end = time.time() - start print(" end: [" + str(round(end,2)) + " sec]") \ No newline at end of file From 77067a90ef5b9746529c2a127b7256a931d970e6 Mon Sep 17 00:00:00 2001 From: kyh1126 Date: Thu, 28 Sep 2017 23:03:56 +0900 Subject: [PATCH 11/18] =?UTF-8?q?ex=EC=B6=9C=EB=A0=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/KeywordExtractor/SentenceManager.py | 134 +++++++----------------- 1 file changed, 35 insertions(+), 99 deletions(-) diff --git a/src/KeywordExtractor/SentenceManager.py b/src/KeywordExtractor/SentenceManager.py index 11e7466..d7ca8a7 100644 --- a/src/KeywordExtractor/SentenceManager.py +++ b/src/KeywordExtractor/SentenceManager.py @@ -13,136 +13,72 @@ from konlpy.utils import pprint class SentenceManager() : -<<<<<<< HEAD -<<<<<<< HEAD - - def run(self, pos) : #문장 처리 수행 -======= _count = 0 def run(self, pos) : ->>>>>>> suhyun -======= - _count = 0 - - def run(self, pos) : ->>>>>>> 492542c074f374ccbd76ef3199ebedd3088e7bff path = "C:/Program Files/Java/jdk1.7.0_55/jre/bin/server/jvm.dll" slist = [] try : self.scd_parser('DOC1.SCD', slist) # scd에서 문장 추출, #id, date는 별도 저장 -<<<<<<< HEAD -<<<<<<< HEAD - self.pos_tagger(slist[0:10000], pos, path) # 문장 형태소 분석하여 명사 추출 -======= self.pos_tagger(slist[0:10], pos, path) # 문장 형태소 분석하여 명사 추출 ->>>>>>> suhyun -======= - self.pos_tagger(slist[0:10], pos, path) # 문장 형태소 분석하여 명사 추출 ->>>>>>> 492542c074f374ccbd76ef3199ebedd3088e7bff - return 1 + return 1, "ok" except : - return 0 + return 0, ex -<<<<<<< HEAD -<<<<<<< HEAD - -======= ->>>>>>> suhyun -======= ->>>>>>> 492542c074f374ccbd76ef3199ebedd3088e7bff def scd_parser(self, scd, slist) : sentence = open(scd,'r') try: for i in sentence : if i.startswith('') : -<<<<<<< HEAD -<<<<<<< HEAD - slist.append(i[9:]) - # slist.append(i.replace('','')) -======= - slist.append(i.replace('','')) ->>>>>>> suhyun -======= slist.append(i.replace('','')) ->>>>>>> 492542c074f374ccbd76ef3199ebedd3088e7bff - return 1 + return 1, "ok" except Exception as ex : - print(ex) - return 0 + return 0, ex def pos_tagger(self, slist, pos, path) : twitter = Twitter(path) -<<<<<<< HEAD -<<<<<<< HEAD - -======= ->>>>>>> 492542c074f374ccbd76ef3199ebedd3088e7bff - try: - # slist의 기사가 1개인경우 type이 str임으로 별도처리 - if type(slist) == str: - pos.append(twitter.nouns(slist)) - return 1 - - print("len(slist) : ",len(slist)) - for one in slist : -======= try: # slist의 기사가 1개인경우 type이 str임으로 별도처리 if type(slist) == str: - print(slist) pos.append(twitter.nouns(slist)) - return 1 + return 1, "ok" - print("len(slist) : ",len(slist)) for one in slist : - print(one) ->>>>>>> suhyun pos.append(twitter.nouns(one)) - return 1 + return 1, "ok" except Exception as ex : - print(ex) - return 0 - -<<<<<<< HEAD -<<<<<<< HEAD - # def id_manager(self) : - # return + return 0, ex - # def indexer(self) : -======= def id_manager(self, keyword, id_m, iid_m) : - for i in range(len(keyword)): - for j in range(len(keyword[i])): -======= - - def id_manager(self, keyword, id_m, iid_m) : - for i in range(len(keyword)): - for j in range(len(keyword[i])): - # print(keyword[i][j][0]) ->>>>>>> 492542c074f374ccbd76ef3199ebedd3088e7bff - key = keyword[i][j][0] - if key in id_m: - continue - - id_m[key] = self._count - iid_m[self._count] = key - - self._count += 1 + try: + for i in range(len(keyword)): + for j in range(len(keyword[i])): + # print(keyword[i][j][0]) + key = keyword[i][j][0] + if key in id_m: + continue + + id_m[key] = self._count + iid_m[self._count] = key + + self._count += 1 + return 1, "ok" + except Exception as ex : + return 0, ex def indexer(self, keyword, index) : - for i in range(len(keyword)): - for j in range(len(keyword[i])): - key = keyword[i][j][0] - - if key in index: - index[key] += [i] - continue - - index[key] = [i] -<<<<<<< HEAD ->>>>>>> suhyun -======= ->>>>>>> 492542c074f374ccbd76ef3199ebedd3088e7bff + try: + for i in range(len(keyword)): + for j in range(len(keyword[i])): + key = keyword[i][j][0] + + if key in index: + index[key] += [i] + continue + + index[key] = [i] + return 1, "ok" + except Exception as ex : + return 0, ex From b56911e045b9b674f6b1a8d86af37914df40029d Mon Sep 17 00:00:00 2001 From: kyh1126 Date: Thu, 28 Sep 2017 23:06:11 +0900 Subject: [PATCH 12/18] =?UTF-8?q?indent=20=EB=A7=9E=EC=B6=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/KeywordExtractor/SentenceManager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/KeywordExtractor/SentenceManager.py b/src/KeywordExtractor/SentenceManager.py index d7ca8a7..76533ea 100644 --- a/src/KeywordExtractor/SentenceManager.py +++ b/src/KeywordExtractor/SentenceManager.py @@ -79,6 +79,6 @@ def indexer(self, keyword, index) : continue index[key] = [i] - return 1, "ok" + return 1, "ok" except Exception as ex : return 0, ex From 5d5dd6285387563d35ab9218d014f7c1afda6d4c Mon Sep 17 00:00:00 2001 From: penbell Date: Thu, 28 Sep 2017 23:08:19 +0900 Subject: [PATCH 13/18] =?UTF-8?q?KE=20=EB=A6=AC=ED=84=B4=EA=B0=92=20?= =?UTF-8?q?=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/KeywordExtractor/KE.py | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/src/KeywordExtractor/KE.py b/src/KeywordExtractor/KE.py index 9f966ea..d2fa5bb 100644 --- a/src/KeywordExtractor/KE.py +++ b/src/KeywordExtractor/KE.py @@ -26,21 +26,17 @@ def __init__(self): self.key_iid = {} self.index = {} - def run(self): self.check(self.sm.run(self.pos), 'sm.run') self.check(self.ka.run(self.pos,self.keyword), 'ka.run') - #self.check(self.sm.id_manager(self.keyword, self.key_id, self.key_iid), 'id_manager') - #self.check(self.sm.id_indexer(self.keyword, self.index), 'indexer') - + self.check(self.sm.id_manager(self.keyword, self.key_id, self.key_iid), 'id_manager') + self.check(self.sm.id_indexer(self.keyword, self.index), 'indexer') - def check(self, ret, i) : + def check(self, ret, ex, i) : if ret is 1 : - print("{}함수 성공!".format(i)) - - else : - print("{}함수 실패!".format(i)) - + print("{} is success!".format(i)) + elif ret is 0 : + print("{} is fail! :[{}]".format(i, ex)) def print_pos(self): print(self.pos) @@ -68,11 +64,9 @@ def write_keyword(self): print("complete: [" + str(round(end, 2)) + "second ]") #pos keyword 출력 - ke.print_pos() - ke.print_keyword() + #ke.print_pos() + #ke.print_keyword() # file write - - - # ke.write_pos() - # ke.write_keyword() \ No newline at end of file + ke.write_pos() + ke.write_keyword() \ No newline at end of file From ac0359ef4d119835d8cd01ab5d8d43148ee64fc1 Mon Sep 17 00:00:00 2001 From: "user-PC\\user" Date: Thu, 28 Sep 2017 23:21:01 +0900 Subject: [PATCH 14/18] =?UTF-8?q?=EB=A6=AC=ED=84=B4=20=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/KeywordExtractor/KeywordAnalyzer.py | 80 ++++++++++++++----------- 1 file changed, 44 insertions(+), 36 deletions(-) diff --git a/src/KeywordExtractor/KeywordAnalyzer.py b/src/KeywordExtractor/KeywordAnalyzer.py index e06f7e2..0de13a3 100644 --- a/src/KeywordExtractor/KeywordAnalyzer.py +++ b/src/KeywordExtractor/KeywordAnalyzer.py @@ -24,70 +24,78 @@ def run(self, list1, keyword) : # 모든 함수를 실행 시키는 함수. try : self.tf_calc(list1) self.idf_calc() - self.tfidf_calc() + self.tfidf_calc(keyword) self.reset() - self.normalizer(self.each,keyword) + return 1 , 'ok' - return 1 - - except : - - return 0 + except Exception as ex: + return 0, ex def tf_calc(self, list1) : # tf를 계산하는 함수. - - for i in range(len(list1)) : - self._tf += [ Counter(list1[i]) ] # 1개의 기사에서 tf값을 구하고 _tf에 dic를 리스트로 저장. - + try : + for i in range(len(list1)) : + self._tf += [ Counter(list1[i]) ] # 1개의 기사에서 tf값을 구하고 _tf에 dic를 리스트로 저장. + except Exception as ex: + print(ex) + return 0 def idf_calc(self) : ''' 실제로 df를 계산하는 함수. 중복된 단어를 제거하고 _df변수에 df값을 채워넣는다. ''' - - isolate = Counter() - - for i in range(len(self._tf)): - isolate += self._tf[i] - - for key in isolate.keys() : + try: + isolate = Counter() for i in range(len(self._tf)): + isolate += self._tf[i] + + for key in isolate.keys() : - if key in self._tf[i]: #isolate.keys()가 _tf[i]에 들어있으면 if 실행. - #즉, 단어가 n기사에 들어 있으면 카운팅 해주는 것. - if key in self._df : - self._df[key] += 1 + for i in range(len(self._tf)): - else : - self._df[key] = 1 + if key in self._tf[i]: #isolate.keys()가 _tf[i]에 들어있으면 if 실행. + #즉, 단어가 n기사에 들어 있으면 카운팅 해주는 것. + if key in self._df : + self._df[key] += 1 - def tfidf_calc(self) : + else : + self._df[key] = 1 + return 1, "ok" + except Exception as ex: + return 0, ex + def tfidf_calc(self,keyword) : ''' tfidf를 계산하는 함수. 임시로 딕셔너리 받을 위치 생성(한 문서를 돌때마다 새로 생성.) 하고 keyword에 값을 넣는다. ''' + try: + for i in range(len(self._tf)) : - for i in range(len(self._tf)) : + each = {} - self.each = {} - - for j in self._tf[i].keys() : - self.each[j] = self._tf[i][j] / self._df[j] - return self.each + for j in self._tf[i].keys() : + each[j] = self._tf[i][j] / self._df[j] + each = self.normalizer(each) + keyword += [sorted(each.items(), key=operator.itemgetter(1), reverse=True)] + return 1, "ok" + except Exception as ex: + print(ex) + return 0, ex def reset(self) : self._tf = [] # func normalizer # tf-idf 점수 평준화(0~100점) - def normalizer(self,dict,keyword): - max_value = max(dict.values()) + def normalizer(self,k_dict): + max_value = max(k_dict.values()) + + for key in k_dict.keys(): + k_dict[key] = int((k_dict[key] / max_value) * 100) + return k_dict + - for key in dict.keys(): - dict[key] = int((dict[key] / max_value) * 100) - keyword += [sorted(dict.items(), key=operator.itemgetter(1), reverse=True)] From bdc5139db4242f54006960b6579cd0498f2dd765 Mon Sep 17 00:00:00 2001 From: penbell Date: Thu, 28 Sep 2017 23:34:27 +0900 Subject: [PATCH 15/18] =?UTF-8?q?=ED=82=A4=EC=9B=8C=EB=93=9C=20=EC=B6=94?= =?UTF-8?q?=EC=B6=9C=20=EC=A0=95=EB=A6=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/KeywordExtractor/KE.py | 15 +++++++++++---- src/KeywordExtractor/SentenceManager.py | 4 ++-- .../KeywordAnalyzer.cpython-35.pyc | Bin 2223 -> 3152 bytes .../SentenceManager.cpython-35.pyc | Bin 1573 -> 2768 bytes 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/KeywordExtractor/KE.py b/src/KeywordExtractor/KE.py index d2fa5bb..d7e8f86 100644 --- a/src/KeywordExtractor/KE.py +++ b/src/KeywordExtractor/KE.py @@ -27,10 +27,17 @@ def __init__(self): self.index = {} def run(self): - self.check(self.sm.run(self.pos), 'sm.run') - self.check(self.ka.run(self.pos,self.keyword), 'ka.run') - self.check(self.sm.id_manager(self.keyword, self.key_id, self.key_iid), 'id_manager') - self.check(self.sm.id_indexer(self.keyword, self.index), 'indexer') + ret, ex = self.sm.run(self.pos) + self.check(ret, ex, 'sm.run') + + ret, ex = self.ka.run(self.pos,self.keyword) + self.check(ret, ex, 'ka.run') + + ret, ex = self.sm.id_manager(self.keyword, self.key_id, self.key_iid) + self.check(ret, ex, 'id_manager') + + ret, ex = self.sm.indexer(self.keyword, self.index) + self.check(ret, ex, 'indexer') def check(self, ret, ex, i) : if ret is 1 : diff --git a/src/KeywordExtractor/SentenceManager.py b/src/KeywordExtractor/SentenceManager.py index 76533ea..7d0296b 100644 --- a/src/KeywordExtractor/SentenceManager.py +++ b/src/KeywordExtractor/SentenceManager.py @@ -20,10 +20,10 @@ def run(self, pos) : slist = [] try : self.scd_parser('DOC1.SCD', slist) # scd에서 문장 추출, #id, date는 별도 저장 - self.pos_tagger(slist[0:10], pos, path) # 문장 형태소 분석하여 명사 추출 + self.pos_tagger(slist[0:100], pos, path) # 문장 형태소 분석하여 명사 추출 return 1, "ok" - except : + except Exception as ex : return 0, ex def scd_parser(self, scd, slist) : diff --git a/src/KeywordExtractor/__pycache__/KeywordAnalyzer.cpython-35.pyc b/src/KeywordExtractor/__pycache__/KeywordAnalyzer.cpython-35.pyc index ad11e6e0d4a3eff53b5035a3086a1c780dd10116..fc8f411ba72df4ccd363a7bd756640be5034b522 100644 GIT binary patch literal 3152 zcmb_e-)|gO6+U-?Ar_G(NueR7Afu#01<6tsB&RD^!*piWS$n-} z-1#&QJ@jD7E( zd+(WZXTI~@@66GO3Hz4$w>R|?(cfv$rNf>^i{3yN5Y^BN3N&g8s=>%@jcS^*b!zIg zWKhi@F<3LHX43Zqg94NKxZ9-LaJM+d<%-q$i6FA4y-s|!EuB}KsB^W~X*t=0Pt)!9 zvfn>;We@sJvk}GF#(MUfcb)W;KWF{Tbi1GaZqxDGomL#mk>@QZd-zWJ z=z~GLfa7J4-%Gdu#4hQbkFsBXntt$^lRetpd9w9vGrK`{XT~e>Vqw_ zC;jaGyJ`Oq=`Zf3cOK^vL(jje9uYX~Y~9-Pph3frfuSdR_EV&mW-1~nT+AQO=ZWE^ z!I(yikbMfl0HhqfM)T-(YM@7M6%UYJJ%GWXIeTTX)Q%n?#0notu(p#tK z1`vk-f<`gW!=grIEX@l#uelnn9HrUW)=ML>F5ucsjt8`B|lnE-{y7qZL;SInGlMFiJ#2KF0&bFz@(vq=oJjTEtGI>7p!N6o&!J z>aqQB407pnn)x!(Mv(jygWhM12Z(kE2n6- zd!4~rF!=sUZ#9@`-5(zq<8>7ScaLH$S|sZX9#5IX!`}~VOqyljd0u~YrOszEcfmA| zXIFkevpA3g`dfaO$uMbviVse>muD~cgAdyirvC&yu4oaF8Zy2QZoiol(qu4q#+ z6<=zFM$~S4(Bz}+q{Hbz>J+g0RkY|hIx=-Jrkxa1+N5?8?T|R4Pl+O^wEDt+A`P`* zbhwy=BPOx*_CBWW!{OmVPL*OQ0{xtm4}oQRd|M?`30j#s#>DwDr-*Z$Pp=tN<`L## zonVR>$ovJG9q|BR5e(dmEvC>yfdhdkagi3F#A@q6>Gy4+@hN7)mO+~$)8+yWvaM+I z575R0ZORmn(;7-TA7q3yhXhh`50{(ffItNhXsq*Byzht;N*wKSPrmYO14T2hw;*Kt%e(2u-Sm&^D2MP^XR79o z)d~p8wRH3>$U7GtuBd~0I;y05NLR+=E9NBy8S)oI_@mF^U`hTb{Ul4Vd;wiz?|zOX zR@9bp7$jyR4ws^&*j^5$2SHD4o|q3~5L3}rPT|&ym22Dt$6~(L3@-4ASU8ZgMR;69 zi`bPcU7XY`?YMXmd_AYw3gJ7xpRc5cBOhub{8XzDFufF7U6YHAS?1Yg@O``ACkeapV^5;=)3|_8s zk}C82UhD*7&7nmMG>OC7q;L*EbT{2ImGPwLx-D-hblt>u-KBQWX|i2%-Ib2l%vThc ziYxL92&~GMWC4=Akz_n2@4ctIsV7r4PR9a^o+|q#VF^nsjhBk0a%rk$NM5ZJl7j07 zZ6Es@abqczU&L%O?zfvwd}rXxBvJyJ7=C-Xr*1iDr=<95{`lWzN<>_WEm6YX^P;5v E8z2=akN^Mx delta 1043 zcmZuvO-vI}5T5tm{f6x?(Aq3M)O@PLvsogEK6dK+Z`ALm2 zH6B&+j2`vs$(!-$$+PFZdeNIl4@RBY8YRT0^JU(9vorI}e4hs2_PGPGn6WTTt)#g)D=@(xwfJF)F&U$zT~Rj9Fwn!Ub2~#K|}nH_*c3*8tS0 zHf<+hDJ;kGSmd=fFyRPE9ZCx!U^QveA%qxtI>HE=uOWaKQa`m*aq|CWevg^Y>zCMw z+R$(0D$o|t2B_<>Zh(CC$Hz;32p2IeX0|G{k!}cH(|KuY9^0ZF2$eOSJ;*zT?PC0-fEb>YOLG45{ zvyR@~R1e(}o1*5GD{BF(ALVS~;|!WlO9eB<4L;1qxcVMlNYfx4*3EQe*F^W0DqEu> z?^vqVlW{3HY(O3(k7goippq&`ia%8PpJ+&V^np)cGlLo2j@gEN!j4ONbM(M0mE2~f iRFXsLS5NX>c>TXzL1bp5>25c?>!ebjieoYs=f43NKD9vr diff --git a/src/KeywordExtractor/__pycache__/SentenceManager.cpython-35.pyc b/src/KeywordExtractor/__pycache__/SentenceManager.cpython-35.pyc index 94640f2282a5851f9c863a13457c0644c10e3c06..c12aae1d7dc2e585e0c79b09bd42b82ed39dcf26 100644 GIT binary patch literal 2768 zcmaKuQEwDg6vxk<*_l~}t%a(^qUfqb)1+)$3{)zTYKtToE3qcP+GaBBOv|?0+3lSv z>^6lU6haIpCdP!s)`SEeAS5CW2BSV06W{y-^Xe1q_6zXj|J>c~woPHWbLZZ3?%bLA z|IRtLoo#LQu=Vfhzip!bXx*WsKZG;<5RFHaMJveDs32%oqpU`P$2t{snoUqPLE@N~ zAkUyB%rxi<<{0FeG}dj+9~VUSkUJlpEz0y@x_0Y9?fT8tn@c_EC%;{)E#F>Uxo;17 zZsfzczpsBFlRlJ5?>}e{o%3awF9vMR^kw>Nd+3bJ`;!H>%$+_e7I{&ZBbcS$-U!d_p5V+84%rUDoO3(u=plgDwBl9d}H_1zqw@vje@@(?9 zt3He=U~P-r$NV7jgPi}F8@N-xl)y|mel8yd&K)?1afav7;JFtxnj*TOQ{@0H5>3OX z(W1`LB6uu#fY-tcUDc>-5o3+6=!i9o#!d#kl!!E%u}Ml27Z7p_?L2!iL6aK%r}2IG z!ErG`rvgEzFdFMljxM}EJlK0e7N?{;oBkwU@WbBY?m4%2+MDUm3}pJ80|$DirQdrd zAM}R4MAY>*LSi8~^7(LoW^8ywecWR#NRRn5W34#Ttw|Hz1@7vxjmUFKuEYYdjX-lE zcM77!+m4oVeknp2cN=jc^b3=*4tJd3(2ETfpt0tc?~-J>o*o(*952b@w4aN{OBJL{ zFdoX>_^0(iJX(&Vn~REae3ST@QYF^qe9(b4LSBf3i9cQJ5PP(w=nxApZoUJvbDZaV z@P{;MUBe(Eg8X0xHVo_|Dj#GzBNG``7@C6?=v@&cyBO0nNW!=lS-1@@G1o;WTksMJ z-q(O<)Zv-5Xl#UM&_!bdyjaKu`|xm)LoEuA#H2)*7T!@YSx*517fXHs7LhBX5czR7 zHr*0TUTjIfRB&^?M5&{0Q*Os4s{@*R0lhdGHi9`8dD+g3vmEtiC-;(Q_TX%Y+YV{l zM7!7{l(@S$i`$BZTmpCT!LQuZcm)I27{QOcNKoY~iV*J5TM!4iM)CtCtBUVwD6U+r zDY9d%yq66l6R#1ZWc8cVP37x4owlVq0W-cwZ#iGZI;huH@%C~O)AL^xr;$`N zVym&aWfQzlWsN*vYK9p7Ay} zTa=>MQZs#d3=_gmG!6Mhhn5mMks~kH#ox8LC^haI?cNqA9Vc*SeaDGy$C)j9^9Amw z9A|FcEz~_Sjg=*DD605LPDZ(xn|HWr)koD+MUqP$zj4S(o)V^(+Mcpg?WvuTJ)_-b zY&nis%sEcho+$=}QYC|JQq_-as$rOq@`X_H#-wiA>N#=paQ&5!a1{@E^D?o~?a=-Q D>x6|L delta 921 zcmZ9K&rcIk5XWcU+ugR$);1uf6-YSvBbyKmffz-CBw$QTKXhg21wMsF_ugF2yME;Q zHFx62+xMN9`yzgaqj$!sgbPqXaL6)n$YJo9d=3tMhV~6`OmHmilaqq7olf%Qqk^}p zes$HAHAc16W6%<`W@v@CV~n^+H5tOi+1*r#!z%2kC2>Bd~44LRaKmf^tht&IIZLqHyXUL;Xop86`UY zVhmO@*Bdpr9?^9rW?g!IWLq*#!3rlH@&tKFIy?fNpqDVWA+xH2HaX>4Wcq1^J!Am$ z1SLndpRn|xh;NuXMR3jmK$y7`~*ziMnN^SI!u~S-}RwgwMH8Pb{^vH`MS5|o7yRxXV z|5KTwJsOukm!Dxu4`mY bb-4S_ Date: Sat, 30 Sep 2017 21:36:28 +0900 Subject: [PATCH 16/18] =?UTF-8?q?=EC=A3=BC=EC=84=9D=EA=B3=BC=20=20return?= =?UTF-8?q?=20=EC=B2=98=EB=A6=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/KeywordExtractor/SentenceManager.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/KeywordExtractor/SentenceManager.py b/src/KeywordExtractor/SentenceManager.py index 7d0296b..7400434 100644 --- a/src/KeywordExtractor/SentenceManager.py +++ b/src/KeywordExtractor/SentenceManager.py @@ -24,6 +24,7 @@ def run(self, pos) : return 1, "ok" except Exception as ex : + ex += 'SentenceManager-run' return 0, ex def scd_parser(self, scd, slist) : @@ -34,6 +35,7 @@ def scd_parser(self, scd, slist) : slist.append(i.replace('','')) return 1, "ok" except Exception as ex : + ex += 'SentenceManager-scd_parser' return 0, ex def pos_tagger(self, slist, pos, path) : @@ -48,10 +50,15 @@ def pos_tagger(self, slist, pos, path) : pos.append(twitter.nouns(one)) return 1, "ok" except Exception as ex : + ex += 'SentenceManager-pos_tagger' return 0, ex def id_manager(self, keyword, id_m, iid_m) : + ''' + keyword 에서 key value 에 따른 + 딕셔너리 생성 + ''' try: for i in range(len(keyword)): for j in range(len(keyword[i])): @@ -66,9 +73,13 @@ def id_manager(self, keyword, id_m, iid_m) : self._count += 1 return 1, "ok" except Exception as ex : + ex += 'SentenceManager-id_manager' return 0, ex def indexer(self, keyword, index) : + ''' + key 가 있는 위치를 저장하는 위치 설정 + ''' try: for i in range(len(keyword)): for j in range(len(keyword[i])): @@ -81,4 +92,5 @@ def indexer(self, keyword, index) : index[key] = [i] return 1, "ok" except Exception as ex : + ex += 'SentenceManager-indexer' return 0, ex From ba61a7edce9fd812eba9fc43a9ead24a03980b44 Mon Sep 17 00:00:00 2001 From: kyh1126 Date: Sat, 30 Sep 2017 21:37:22 +0900 Subject: [PATCH 17/18] =?UTF-8?q?=ED=82=A4=EC=9B=8C=EB=93=9C=20=EB=84=A3?= =?UTF-8?q?=EC=9C=BC=EB=A9=B4=20=EC=95=84=EC=9D=B4=EB=94=94=20=EC=B6=9C?= =?UTF-8?q?=EB=A0=A5,=20=EB=93=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/test/t_SentenceManager.py | 120 +++++++++++++--------------------- 1 file changed, 47 insertions(+), 73 deletions(-) diff --git a/src/test/t_SentenceManager.py b/src/test/t_SentenceManager.py index 8687ca8..c351091 100644 --- a/src/test/t_SentenceManager.py +++ b/src/test/t_SentenceManager.py @@ -1,14 +1,8 @@ # - *- coding: utf- 8 - *- ''' -<<<<<<< HEAD -@author: 신승식, 김윤희 -@date: 2017.9.19 -@version: 1.0.1 -======= @author: 조수현, 김윤희 @date: 2017.9.28 @version: 2.0.1 ->>>>>>> 492542c074f374ccbd76ef3199ebedd3088e7bff @brief: sentence manager -Use class: SentenceManager @@ -21,93 +15,73 @@ pos = [] slist = [] path = "C:/Program Files/Java/jdk1.7.0_55/jre/bin/server/jvm.dll" -<<<<<<< HEAD -<<<<<<< HEAD -======= id_m = {} iid_m = {} keyword = [[('1,10,-1,10,3.4,-10.2', 1.0), ('안녕', 0.6666666666666666), ('하세요', 0.5)], [('suhyun', 1.0), ('안녕', 1.0)], [('반갑습니다.', 1.0), ('하세요', 0.5), ('안녕', 0.3333333333333333)], [('하세요', 0.25), ('안녕', 0.16666666666666666)], [('suhyun', 0.5), ('안녕', 0.5)], [('반갑습니다.', 0.5), ('하세요', 0.25), ('안녕', 0.16666666666666666)], [('하세요', 0.14285714285714285), ('안녕', 0.1111111111111111)], [('1,10,-1,10,3.4,-10.2', 0.5), ('안녕', 0.2222222222222222), ('하세요', 0.14285714285714285)], [('반갑습니다.', 0.3333333333333333), ('하세요', 0.14285714285714285), ('안녕', 0.1111111111111111)], [('하세요', 0.1111111111111111), ('안녕', 0.08333333333333333)], [('1,10,-1,10,3.4,-10.2', 0.3333333333333333), ('안녕', 0.16666666666666666), ('하세요', 0.1111111111111111)], [('suhyun', 0.3333333333333333), ('안녕', 0.25)]] -index = {} # 키워드 별 문서 id indexing +index = {} # 키워드 별 문서 id indexing ->>>>>>> 492542c074f374ccbd76ef3199ebedd3088e7bff start = time.time() # print(p.run(pos, slist)) -try: - print(p.scd_parser('DOC1.SCD', slist)) - print(slist[0]) -except Exception as ex: - print(ex) - -print("________________________________________") -print() - -try: - print(p.pos_tagger(slist[0:1], pos, path)) - print(pos) -except Exception as ex: - print(ex) -======= - -id_m = {} -iid_m = {} -keyword = [[('1,10,-1,10,3.4,-10.2', 1.0), ('안녕', 0.6666666666666666), ('하세요', 0.5)], [('suhyun', 1.0), ('안녕', 1.0)], [('반갑습니다.', 1.0), ('하세요', 0.5), ('안녕', 0.3333333333333333)], [('하세요', 0.25), ('안녕', 0.16666666666666666)], [('suhyun', 0.5), ('안녕', 0.5)], [('반갑습니다.', 0.5), ('하세요', 0.25), ('안녕', 0.16666666666666666)], [('하세요', 0.14285714285714285), ('안녕', 0.1111111111111111)], [('1,10,-1,10,3.4,-10.2', 0.5), ('안녕', 0.2222222222222222), ('하세요', 0.14285714285714285)], [('반갑습니다.', 0.3333333333333333), ('하세요', 0.14285714285714285), ('안녕', 0.1111111111111111)], [('하세요', 0.1111111111111111), ('안녕', 0.08333333333333333)], [('1,10,-1,10,3.4,-10.2', 0.3333333333333333), ('안녕', 0.16666666666666666), ('하세요', 0.1111111111111111)], [('suhyun', 0.3333333333333333), ('안녕', 0.25)]] -index = {} # 키워드 별 문서 id indexing - -start = time.time() - -# print(p.run(pos, slist)) - -try: - print(p.scd_parser('DOC1.SCD', slist)) - # print(slist[0:2]) -except Exception as ex: - print(ex) - -print("________________________________________") -print() - -try: - print(p.pos_tagger(slist[0:2], pos, path)) - print(pos) -except Exception as ex: - print(ex) - # try: -# p.id_manager(keyword, id_m, iid_m) -# print(id_m) -# print(iid_m) +# print(p.scd_parser('DOC1.SCD', slist)) +# print(slist[0]) # except Exception as ex: -# print(ex) +# print(ex) # print("________________________________________") # print() # try: -# p.indexer(keyword, index) -# print(index) +# print(p.pos_tagger(slist[0:1], pos, path)) +# print(pos) # except Exception as ex: -# print(ex) - ->>>>>>> suhyun +# print(ex) -# try: -# p.id_manager(keyword, id_m, iid_m) -# print(id_m) -# print(iid_m) -# except Exception as ex: -# print(ex) +try: + p.id_manager(keyword, id_m, iid_m) + print(id_m) + print() + while True : + inp = input('키워드를 입력하세요 >') + if inp not in id_m : + continue + else : + print(id_m[inp]) + break + print("-----------") + print() + print(iid_m) + print() + while True : + inp2 = int(input('아이디를 입력하세요 >')) + if inp2 not in iid_m : + continue + else : + print(iid_m[inp2]) + break + +except Exception as ex: + print(ex) -# print("________________________________________") -# print() +print("________________________________________") +print() -# try: -# p.indexer(keyword, index) -# print(index) -# except Exception as ex: -# print(ex) +try: + p.indexer(keyword, index) + print(index) + + while True : + inp3 = input('키워드를 입력하세요 >') + if inp3 not in index : + continue + else : + print(index[inp3]) + break +except Exception as ex: + print(ex) end = time.time() - start print(" end: [" + str(round(end,2)) + " sec]") \ No newline at end of file From 18e4b04e1ebbac945cb0c066aa721d74a901c027 Mon Sep 17 00:00:00 2001 From: kyh1126 Date: Sat, 30 Sep 2017 22:01:02 +0900 Subject: [PATCH 18/18] =?UTF-8?q?=EC=98=AC=EB=9D=BC=EA=B0=80=EB=9D=BC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/KeywordExtractor/SentenceManager.py | 27 +++++++------------------ src/test/t_SentenceManager.py | 14 ++++++------- 2 files changed, 14 insertions(+), 27 deletions(-) diff --git a/src/KeywordExtractor/SentenceManager.py b/src/KeywordExtractor/SentenceManager.py index 7400434..1095e6e 100644 --- a/src/KeywordExtractor/SentenceManager.py +++ b/src/KeywordExtractor/SentenceManager.py @@ -20,11 +20,10 @@ def run(self, pos) : slist = [] try : self.scd_parser('DOC1.SCD', slist) # scd에서 문장 추출, #id, date는 별도 저장 - self.pos_tagger(slist[0:100], pos, path) # 문장 형태소 분석하여 명사 추출 + self.pos_tagger(slist[0:10], pos, path) # 문장 형태소 분석하여 명사 추출 return 1, "ok" except Exception as ex : - ex += 'SentenceManager-run' return 0, ex def scd_parser(self, scd, slist) : @@ -35,7 +34,6 @@ def scd_parser(self, scd, slist) : slist.append(i.replace('','')) return 1, "ok" except Exception as ex : - ex += 'SentenceManager-scd_parser' return 0, ex def pos_tagger(self, slist, pos, path) : @@ -50,20 +48,14 @@ def pos_tagger(self, slist, pos, path) : pos.append(twitter.nouns(one)) return 1, "ok" except Exception as ex : - ex += 'SentenceManager-pos_tagger' return 0, ex def id_manager(self, keyword, id_m, iid_m) : - ''' - keyword 에서 key value 에 따른 - 딕셔너리 생성 - ''' try: - for i in range(len(keyword)): - for j in range(len(keyword[i])): - # print(keyword[i][j][0]) - key = keyword[i][j][0] + for i, l in enumerate(keyword): + for j in l: + key = j[0] if key in id_m: continue @@ -73,17 +65,13 @@ def id_manager(self, keyword, id_m, iid_m) : self._count += 1 return 1, "ok" except Exception as ex : - ex += 'SentenceManager-id_manager' return 0, ex def indexer(self, keyword, index) : - ''' - key 가 있는 위치를 저장하는 위치 설정 - ''' try: - for i in range(len(keyword)): - for j in range(len(keyword[i])): - key = keyword[i][j][0] + for i, l in enumerate(keyword) : + for j in l : + key = j[0] if key in index: index[key] += [i] @@ -92,5 +80,4 @@ def indexer(self, keyword, index) : index[key] = [i] return 1, "ok" except Exception as ex : - ex += 'SentenceManager-indexer' return 0, ex diff --git a/src/test/t_SentenceManager.py b/src/test/t_SentenceManager.py index c351091..66043fb 100644 --- a/src/test/t_SentenceManager.py +++ b/src/test/t_SentenceManager.py @@ -73,13 +73,13 @@ p.indexer(keyword, index) print(index) - while True : - inp3 = input('키워드를 입력하세요 >') - if inp3 not in index : - continue - else : - print(index[inp3]) - break + # while True : + # inp3 = input('키워드를 입력하세요 >') + # if inp3 not in index : + # continue + # else : + # print(index[inp3]) + # break except Exception as ex: print(ex)