|
| 1 | +# coding=UTF-8 |
| 2 | +import nltk |
| 3 | +from nltk.corpus import brown |
| 4 | +import os |
| 5 | + |
| 6 | +# This is a fast and simple noun phrase extractor (based on NLTK) |
| 7 | +# Feel free to use it, just keep a link back to this post |
| 8 | +# http://thetokenizer.com/2013/05/09/efficient-way-to-extract-the-main-topics-of-a-sentence/ |
| 9 | +# http://www.sharejs.com/codes/ |
| 10 | +# Created by Shlomi Babluki |
| 11 | +# May, 2013 |
| 12 | + |
| 13 | + |
| 14 | +# This is our fast Part of Speech tagger |
| 15 | +############################################################################# |
# Training data: the tagged sentences of the Brown corpus "news" category.
brown_train = brown.tagged_sents(categories='news')

# Last-resort tagger for words the corpus-trained taggers cannot tag.
# The patterns are tried in the order listed and the first match decides the
# tag, so the catch-all '.*' rule has to stay at the end of the list.
regexp_tagger = nltk.RegexpTagger(
    [
        # Integers and decimals. The dot is escaped so that only a literal
        # decimal point is accepted (an unescaped '.' also matched "1a2").
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),
        (r'(-|:|;)$', ':'),
        (r'\'*$', 'MD'),
        (r'(The|the|A|a|An|an)$', 'AT'),
        (r'.*able$', 'JJ'),
        (r'^[A-Z].*$', 'NNP'),
        (r'.*ness$', 'NN'),
        (r'.*ly$', 'RB'),
        (r'.*s$', 'NNS'),
        (r'.*ing$', 'VBG'),
        (r'.*ed$', 'VBD'),
        (r'.*', 'NN'),
    ])

# Backoff chain: bigram tagger -> unigram tagger -> regexp tagger.
# NOTE: both taggers are trained here, i.e. every time the module is imported.
unigram_tagger = nltk.UnigramTagger(brown_train, backoff=regexp_tagger)
bigram_tagger = nltk.BigramTagger(brown_train, backoff=unigram_tagger)
| 33 | +############################################################################# |
| 34 | + |
| 35 | + |
| 36 | +# This is our semi-CFG; Extend it according to your own needs |
| 37 | +############################################################################# |
# Merge rules, keyed by "<left tag>+<right tag>"; the value is the tag given
# to the phrase obtained by joining the two adjacent tokens.
cfg = {
    "NNP+NNP": "NNP",
    "NN+NN": "NNI",
    "NNI+NN": "NNI",
    "JJ+JJ": "JJ",
    "JJ+NN": "NNI",
}
| 44 | +############################################################################# |
| 45 | + |
| 46 | + |
class NPExtractor(object):
    """Extract noun phrases (the "main topics") from a sentence.

    The sentence is tokenized, tagged with the module-level ``bigram_tagger``,
    the tags are normalized, and adjacent tokens are then merged according to
    the module-level ``cfg`` rules.
    """

    def __init__(self, sentence):
        """Remember the raw text that ``extract`` will analyse."""
        self.sentence = sentence

    def tokenize_sentence(self, sentence):
        """Split ``sentence`` into single words/tokens."""
        return nltk.word_tokenize(sentence)

    def normalize_tags(self, tagged):
        """Collapse Brown corpus tag variants onto the tags used by ``cfg``.

        The steps are applied one after another, so compound tags are fully
        normalized as well: "NNS-TL" -> "NN", "NPS" -> "NNP".

        :param tagged: iterable of ``(word, tag)`` pairs.
        :return: new list of ``(word, normalized_tag)`` pairs.
        """
        normalized = []
        for word, tag in tagged:
            # Drop the "-TL" suffix first so the remaining rules see the
            # underlying tag.
            if tag.endswith("-TL"):
                tag = tag[:-3]
            # Drop one trailing "S" ("NNS" -> "NN", "NPS" -> "NP").
            if tag.endswith("S"):
                tag = tag[:-1]
            # "NP" is rewritten to the "NNP" name that the merge rules use.
            if tag == "NP":
                tag = "NNP"
            normalized.append((word, tag))
        return normalized

    @staticmethod
    def _merge_phrases(tags, grammar=None):
        """Repeatedly merge the first adjacent pair that matches a rule.

        :param tags: list of ``(text, tag)`` pairs; it is not modified.
        :param grammar: mapping ``"TAG1+TAG2" -> merged tag``; defaults to the
            module-level ``cfg``.
        :return: new list of ``(text, tag)`` pairs after all merges.
        """
        if grammar is None:
            grammar = cfg
        merged = list(tags)
        pos = 0
        while pos < len(merged) - 1:
            left, right = merged[pos], merged[pos + 1]
            new_tag = grammar.get("%s+%s" % (left[1], right[1]), '')
            if not new_tag:
                pos += 1
                continue
            merged[pos:pos + 2] = [("%s %s" % (left[0], right[0]), new_tag)]
            # Every pair before pos - 1 is unchanged and already known not to
            # match, so stepping back one position is enough; this yields the
            # same merges as rescanning from the start of the list.
            if pos:
                pos -= 1
        return merged

    def extract(self):
        """Return the main topics of the sentence.

        :return: list of strings, one per token or merged phrase whose final
            tag is "NNP" or "NNI".
        """
        tokens = self.tokenize_sentence(self.sentence)
        tags = self.normalize_tags(bigram_tagger.tag(tokens))
        tags = self._merge_phrases(tags)
        # Tokens still tagged plain "NN" are not reported; add "NN" to the
        # tuple below to include them.
        return [text for text, tag in tags if tag in ("NNP", "NNI")]
| 102 | + |
| 103 | + |
| 104 | +# Main method, just run "python np_extractor.py" |
def main(path='.'):
    """Print the main topics of every ``.txt`` file found in ``path``.

    Each matching file is read, its whitespace-separated words are joined
    into one string, and the noun phrases extracted from that string are
    printed on one line.

    :param path: directory to scan; defaults to the current directory.
    """
    for filename in os.listdir(path):
        if not filename.endswith('.txt'):
            continue
        # Join with ``path`` so the file is found even when the scanned
        # directory is not the current working directory.
        with open(os.path.join(path, filename), 'rt') as f:
            words = [word for line in f for word in line.split()]
        str_text = ' '.join(words)
        result = NPExtractor(str_text).extract()
        print("This file is about: %s" % ", ".join(result))


if __name__ == '__main__':
    main()
0 commit comments