一个非常高效的提取内容关键词的python代码
发布时间:2020-05-25 00:02:49 所属栏目:Python 来源:互联网
导读:一个非常高效的提取内容关键词的python代码
|
下面是脚本之家 jb51.cc 通过网络收集整理的代码片段。脚本之家小编现在分享给大家,也给大家做个参考。
# coding=UTF-8
import nltk
from nltk.corpus import brown
# This is a fast and simple noun phrase extractor (based on NLTK)
# Feel free to use it, just keep a link back to this post
# http://thetokenizer.com/2013/05/09/efficient-way-to-extract-the-main-topics-of-a-sentence/
# Created by Shlomi Babluki
# May, 2013
# This is our fast Part of Speech tagger
#############################################################################
# Training data: the tagged sentences of the Brown corpus "news" category.
brown_train = brown.tagged_sents(categories='news')

# Last-resort tagger: guesses a tag from the surface form of the token.
# The patterns are tried in order, so the catch-all '.*' must stay last.
regexp_tagger = nltk.RegexpTagger(
    [
        # Numbers; the decimal point is escaped so that it only matches '.'
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),
        (r'(-|:|;)$', ':'),
        # Apostrophe-only tokens (the escaping backslash was lost in the
        # original listing, which made this line a syntax error)
        (r"'*$", 'MD'),
        (r'(The|the|A|a|An|an)$', 'AT'),
        (r'.*able$', 'JJ'),
        (r'^[A-Z].*$', 'NNP'),
        (r'.*ness$', 'NN'),
        (r'.*ly$', 'RB'),
        (r'.*s$', 'NNS'),
        (r'.*ing$', 'VBG'),
        (r'.*ed$', 'VBD'),
        (r'.*', 'NN'),
    ]
)

# Backoff chain: bigram -> unigram -> regexp.
unigram_tagger = nltk.UnigramTagger(brown_train, backoff=regexp_tagger)
bigram_tagger = nltk.BigramTagger(brown_train, backoff=unigram_tagger)
#############################################################################
# This is our semi-CFG; Extend it according to your own needs
#############################################################################
# Merge rules for adjacent tags: "<left tag>+<right tag>" maps to the tag
# given to the merged phrase.
cfg = {
    "NNP+NNP": "NNP",
    "NN+NN": "NNI",
    "NNI+NN": "NNI",
    "JJ+JJ": "JJ",
    "JJ+NN": "NNI",
}
#############################################################################
class NPExtractor(object):
    """Extract the main noun-phrase topics of one sentence.

    The sentence is tokenized, tagged with the module-level
    ``bigram_tagger``, and adjacent tokens are then merged according to
    the module-level ``cfg`` rules.
    """

    def __init__(self, sentence):
        """Store the sentence that ``extract`` will analyse."""
        self.sentence = sentence

    def tokenize_sentence(self, sentence):
        """Split the sentence into single words/tokens."""
        return nltk.word_tokenize(sentence)

    def normalize_tags(self, tagged):
        """Normalize Brown corpus tags ("NN", "NN-TL", "NNS" > "NN").

        ``tagged`` is a sequence of ``(word, tag)`` pairs. A new list of
        pairs is returned; the input is not modified. Only the first
        matching rule is applied to each tag.
        """
        n_tagged = []
        for item in tagged:
            word, tag = item[0], item[1]
            if tag in ("NP-TL", "NP"):
                # Proper nouns are mapped onto the tag used by ``cfg``
                tag = "NNP"
            elif tag.endswith("-TL"):
                # Drop the "-TL" suffix
                tag = tag[:-3]
            elif tag.endswith("S"):
                # Drop the trailing "S" (e.g. "NNS" > "NN")
                tag = tag[:-1]
            n_tagged.append((word, tag))
        return n_tagged

    def extract(self):
        """Return the list of topic phrases found in ``self.sentence``."""
        tokens = self.tokenize_sentence(self.sentence)
        tags = self.normalize_tags(bigram_tagger.tag(tokens))

        # Repeatedly merge the first adjacent pair that has a rule in
        # ``cfg``, restarting the scan after every merge so that a merged
        # phrase can itself take part in further merges.
        merge = True
        while merge:
            merge = False
            for x in range(len(tags) - 1):
                t1 = tags[x]
                t2 = tags[x + 1]
                pos = cfg.get("%s+%s" % (t1[1], t2[1]), '')
                if pos:
                    match = "%s %s" % (t1[0], t2[0])
                    tags[x:x + 2] = [(match, pos)]
                    merge = True
                    break

        # Only proper nouns (NNP) and merged noun phrases (NNI) are
        # reported; add "NN" to the tuple to report single nouns as well.
        return [t[0] for t in tags if t[1] in ("NNP", "NNI")]
# Main method, just run "python np_extractor.py"
def main():
    """Run the extractor on a sample sentence and print the topics found."""
    sentence = "Swayy is a beautiful new dashboard for discovering and curating online content."
    np_extractor = NPExtractor(sentence)
    result = np_extractor.extract()
    # A single parenthesised argument prints the same text on both
    # Python 2 and Python 3; the bare print statement fails on Python 3.
    print("This sentence is about: %s" % ",".join(result))
# Run the demo only when this file is executed as a script.
if __name__ == '__main__':
    main()
以上是脚本之家(jb51.cc)为你收集整理的全部代码内容,希望文章能够帮你解决所遇到的程序开发问题。如果觉得脚本之家网站内容还不错,欢迎将脚本之家网站推荐给程序员好友。(编辑:安卓应用网)【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时联系站长删除相关内容!
