Python nltk 模块,RegexpParser() 实例源码


项目:PyRATA    作者:nicolashernandez    | 项目源码 | 文件源码
def nltk_parse_clause(sentence):
  """Chunk a POS-tagged sentence with NLTK's cascaded chunker grammar.

  Adapted from the Natural Language Toolkit book example
  ``code_cascaded_chunker``.

  Args:
    sentence: list of ``(word, tag)`` pairs, e.g.
      ``[("Mary", "NN"), ("saw", "VBD"), ("the", "DT"), ("cat", "NN"),
      ("sit", "VB"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]``.

  Returns:
    The chunk tree returned by ``nltk.RegexpParser.parse``.
  """
  # Each line is one chunking stage; later stages refer to the labels
  # (NP, PP, VP) introduced by earlier ones.
  grammar = r"""
  NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
  PP: {<IN><NP>}               # Chunk prepositions followed by NP
  VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
  CLAUSE: {<NP><VP>}           # Chunk NP, VP
  """
  cp = nltk.RegexpParser(grammar)
  return cp.parse(sentence)
项目:kpex    作者:christophfeinauer    | 项目源码 | 文件源码
def extract_chunks(text_string, max_words=3, lemmatize=False):
    """Extract candidate keyphrases from raw text via POS-pattern chunking.

    Args:
        text_string: the text to analyse.
        max_words: keep only candidates with at most this many
            whitespace-separated words.
        lemmatize: when true, pass every candidate through NLTK's
            ``WordNetLemmatizer``.

    Returns:
        A list of lower-cased candidate phrases (duplicates are kept).
    """
    # An optional "<adjectives> <nouns> <preposition>" prefix, followed by
    # any number of adjectives and at least one noun.
    grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
    chunker = nltk.RegexpParser(grammar)

    # Split into sentences, tokenize each one, then POS-tag them together.
    tagged_sents = nltk.pos_tag_sents(
        nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text_string)
    )

    # Chunk every sentence and flatten to (word, tag, IOB-label) triples.
    all_chunks = list(itertools.chain.from_iterable(
        nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
        for tagged_sent in tagged_sents
    ))

    # Consecutive triples whose IOB label is not 'O' form one phrase.
    candidates = [
        ' '.join(triple[0] for triple in group).lower()
        for inside, group in itertools.groupby(all_chunks, lambda t: t[2] != 'O')
        if inside
    ]

    # Honour the caller-supplied length limit.
    candidates = [c for c in candidates if len(c.split()) <= max_words]

    # Drop bare stopwords and phrases made only of punctuation characters.
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    candidates = [
        c for c in candidates
        if c not in stop_words and not all(ch in punct for ch in c)
    ]

    if lemmatize:
        lemmatizer = nltk.stem.WordNetLemmatizer().lemmatize
        candidates = [lemmatizer(c) for c in candidates]

    return candidates
项目:repeat-aft    作者:ripeta    | 项目源码 | 文件源码
def extract_chunks(sent, chunkGram = r"""Chunk: {<JJ|NN.*>*<NNP>+<JJ|NN.*|IN>*<NN.*>}"""):
    """Chunk a sentence and collect the text of every ``Chunk`` subtree.

    The default grammar requires at least one proper noun (NNP) inside the
    noun chunk and a trailing noun; a single NNP on its own is probably not
    enough information to identify a data source.  Possessives are not
    included in the pattern.

    Args:
        sent: the raw sentence string.
        chunkGram: a ``RegexpParser`` grammar whose stage label is ``Chunk``.

    Returns:
        A ``(chunked, chunks)`` tuple: the full chunk tree and a list with
        the space-joined words of each ``Chunk`` subtree.

    Errors raised by tokenizing, tagging or parsing propagate to the caller.
    """
    tagged = pos_tag(word_tokenize(sent))
    chunkParser = RegexpParser(chunkGram)
    chunked = chunkParser.parse(tagged)
    chunks = [
        ' '.join(word for word, _tag in subtree.leaves())
        for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk')
    ]
    return chunked, chunks
项目:StrepHit    作者:Wikidata    | 项目源码 | 文件源码
def setup_extractor(self):
    """Prepare the sentence splitter, the chunk parser and the token index.

    Reads ``self.language``, ``self.grammars`` and ``self.lemma_to_token``;
    sets ``self.splitter`` and ``self.parser`` and lower-cases every token
    set stored in ``self.lemma_to_token``.

    Raises:
        ValueError: if ``self.grammars`` has no (non-empty) grammar for
            ``self.language``.
    """
    self.splitter = PunktSentenceSplitter(self.language)
    grammar = self.grammars.get(self.language)
    if not grammar:
        raise ValueError(
            "Invalid or unsupported language: '%s'. Please use one of the currently supported ones: %s" % (
                self.language, list(self.grammars))
        )
    self.parser = RegexpParser(grammar)

    # Only values of existing keys are replaced, so iterating the dict while
    # assigning is safe; ``items()`` works on both Python 2 and 3.
    for lemma, match_tokens in self.lemma_to_token.items():
        self.lemma_to_token[lemma] = set([match.lower() for match in match_tokens])
项目:information-extraction-PT    作者:davidsbatista    | 项目源码 | 文件源码
def main():
    """Build the relational-phrase chunk grammar, print it and compile it.

    The grammar has a single ``REL_PHRASE`` stage: one or more repetitions
    of a verb group optionally followed by filler words and at least one
    preposition-like tag.
    """
    verb = "<ADV>*<AUX>*<VERB><PART>*<ADV>*"
    word = "<NOUN|ADJ|ADV|DET|ADP>"
    preposition = "<ADP|ADJ>"

    rel_pattern = "( %s (%s* (%s)+ )? )+ " % (verb, word, preposition)
    grammar_long = '''REL_PHRASE: {%s}''' % rel_pattern

    # Function-call form prints the same text on Python 2 and Python 3.
    print(grammar_long)
    reverb_pattern = nltk.RegexpParser(grammar_long)

    # test_patterns(reverb_pattern)

项目:patentdata    作者:benhoyle    | 项目源码 | 文件源码
def determine_entities(self):
    """Determine noun entities within a patent claim.

    Reads ``self.pos`` (a list of tuples from the nltk pos tagger) and
    returns the chunk tree produced by the noun-phrase grammar below.
    """
    # Define grammar for chunking.  Raw string so that ``\$`` reaches the
    # chunk parser as an escaped dollar sign without an invalid-escape
    # warning from Python.
    grammar = r'''
        NP: {<DT|PRP\$> <VBG> <NN.*>+}
            {<DT|PRP\$> <NN.*> <POS> <JJ>* <NN.*>+}
            {<DT|PRP\$>? <JJ>* <NN.*>+ }
    '''
    cp = nltk.RegexpParser(grammar)
    # Or store as part of claim object property?

    # Option: split into features / clauses, run over clauses and
    # then re-correlate
    return cp.parse(self.pos)
项目:chitti    作者:bhuvi8    | 项目源码 | 文件源码
def find_chunk(sent, chunk_rule=None):
    """Return the words of the first chunk matching ``chunk_rule``.

    Args:
        sent: a POS-tagged sentence, i.e. a list of ``(word, tag)`` pairs.
        chunk_rule: a ``RegexpParser`` grammar; defaults to the ``QWORD``
            rule below.

    Returns:
        The space-joined words of the first subtree labelled with the
        rule's stage name, or ``None`` when no such subtree exists.
    """
    if not chunk_rule:
        chunk_rule = 'QWORD: <W.*><V.*><DT>*{<.*>*?<N.*>+}'
    # The stage name (text before the first colon) is the label given to
    # the chunks this rule produces.
    label = chunk_rule.split(':', 1)[0].strip()
    cp = nltk.RegexpParser(chunk_rule)
    tree = cp.parse(sent)
    for subtree in tree.subtrees():
        if subtree.label() == label:
            return ' '.join([a[0] for a in subtree])
    return None
项目:chitti    作者:bhuvi8    | 项目源码 | 文件源码
def find_chunk(sent,chunk_rule=None):
    """Print and return the words of the first chunk matching ``chunk_rule``.

    Args:
        sent: a POS-tagged sentence, i.e. a list of ``(word, tag)`` pairs.
        chunk_rule: a ``RegexpParser`` grammar; defaults to the ``HCHUNK``
            rule below.

    Returns:
        The space-joined words of the first subtree labelled with the
        rule's stage name, or ``None`` when no such subtree exists.
    """
    if not chunk_rule:
        chunk_rule = 'HCHUNK: <W.*><.*>*?{<N.*>+}'
    # The stage name (text before the first colon) is the label given to
    # the chunks this rule produces.
    label = chunk_rule.split(':', 1)[0].strip()
    cp = nltk.RegexpParser(chunk_rule)
    tree = cp.parse(sent)
    for subtree in tree.subtrees():
        if subtree.label() == label:
            words = ' '.join([a[0] for a in subtree])
            print (words)
            return words
    return None
##this is required only once
项目:acl2017-interactive_summarizer    作者:UKPLab    | 项目源码 | 文件源码
def get_parse_info(parsestr, stemmer, language, stoplist):
    """Extract token/POS information and candidate phrases from a parse.

    Args:
        parsestr: a bracketed parse string readable by ``Tree.fromstring``.
        stemmer: object whose ``stem(word)`` method is used to key tokens.
        language: ``'german'`` or ``'english'``; selects the chunk grammar.
        stoplist: passed through to ``get_terms``.

    Returns:
        A ``(hash_token_pos, phrase_list)`` tuple: an ordered mapping from
        stemmed word to ``"word::POS"``, and the list of space-joined terms
        returned by ``get_terms``.

    Raises:
        ValueError: if ``language`` is neither ``'german'`` nor ``'english'``.
    """
    # NOTE(review): the stage labels (NBAR / VP / NP) and the bare {<NBAR>}
    # rule follow the usual form of this grammar; confirm them against the
    # labels that get_terms() expects.
    if language == 'german':
        grammar = r"""
            NBAR:
                {<N.*|ADJ.*>*<N.*>}  # Nouns and Adjectives, terminated with Nouns
            VP:
                {<V.*>}  # terminated with Verbs
            NP:
                {<NBAR>}
                {<NBAR><APPR><NBAR>}  # Above, connected with in/of/etc...
        """
    elif language == 'english':
        #Taken from Su Nam Kim Paper...
        grammar = r"""
            NBAR:
                {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
            VP:
                {<V.*>}  # terminated with Verbs
            NP:
                {<NBAR>}
                {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
        """
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError("Unsupported language: %r" % (language,))

    hash_token_pos = OrderedDict()
    chunker = RegexpParser(grammar)

    postoks = []
    for subtree in Tree.fromstring(parsestr).subtrees():
        # Height-2 subtrees are pre-terminals: a POS label over one word.
        if subtree.height() == 2:
            word, pos = subtree[0], subtree.label()
            hash_token_pos[stemmer.stem(word)] = word + u"::" + pos
            postoks.append((word, pos))

    chunk_tree = chunker.parse(postoks)
    phrases = get_terms(chunk_tree, stemmer, stoplist)
    phrase_list = [' '.join(term) for term in phrases if term]
    return hash_token_pos, phrase_list
项目:minetext    作者:gustavoaires    | 项目源码 | 文件源码
def drawNamedEntityTree(self, text):
    """Chunk the ``PESSOA``-tagged tokens of ``text`` into ``ENT`` subtrees.

    The text is tokenized with ``self.tokenizer`` and tagged with
    ``self.tagWords`` before being chunked.

    Returns:
        The chunk tree produced by ``RegexpParser.parse``.
    """
    tokenized_text = self.tokenizer.tokenize(text)
    tagged_text = self.tagWords(tokenized_text)
    grammar = "ENT: {<PESSOA>*}"
    cp = RegexpParser(grammar)
    # NOTE(review): the method name suggests the tree was meant to be
    # displayed (e.g. via Tree.draw()) -- confirm; the tree is returned so
    # the caller can decide how to render it.
    return cp.parse(tagged_text)

    # Tokenize sentences into words. Returns a list of the words that make up the text.
项目:minetext    作者:gustavoaires    | 项目源码 | 文件源码
def drawNamedEntityTree(self, text):
    """Chunk the ``PESSOA``-tagged tokens of ``text`` into ``ENT`` subtrees.

    The text is tokenized with ``self.tokenizer`` and tagged with
    ``self.tagWords`` before being chunked.

    Returns:
        The chunk tree produced by ``RegexpParser.parse``.
    """
    tokenized_text = self.tokenizer.tokenize(text)
    tagged_text = self.tagWords(tokenized_text)
    grammar = "ENT: {<PESSOA>*}"
    cp = RegexpParser(grammar)
    # NOTE(review): the method name suggests the tree was meant to be
    # displayed (e.g. via Tree.draw()) -- confirm; the tree is returned so
    # the caller can decide how to render it.
    return cp.parse(tagged_text)

    # Tokenize sentences into words. Returns a list of the words that make up the text.
项目:cvscan    作者:skcript    | 项目源码 | 文件源码
def fetch_all_organizations(resume_text):
  """Collect noun phrases that look like organization names in a resume.

  For every sentence, noun phrases are chunked with a custom grammar and
  NLTK's named-entity chunker is run on the same tagged words.  A noun
  phrase is kept when it contains the first token of an ``ORGANIZATION``
  entity that is not listed by ``utilities.get_avoid_organizations()``.

  Args:
    resume_text: the resume as a plain string.

  Returns:
    A set of noun-phrase strings.
  """
  organizations = set()
  tokenized_sentences = nltk.sent_tokenize(resume_text)

  # Custom grammar with NLTK
  # NP - Noun Phrase
  # NN - Noun
  # NNP - Proper Noun
  # V - Verb
  # JJ - Adjective

  # In a sentence that contains NN NNP V NN NN JJ NN.
  # The noun-phrases fetched are:
  # NP: NN NNP
  # NP: NN NN
  # NP: NN

  # Ex, "Application Developer at Delta Force"
  # => ["Application Developer", "Delta Force"]

  grammar = r"""NP: {<NN|NNP>+}"""
  parser = nltk.RegexpParser(grammar)

  avoid_organizations = utilities.get_avoid_organizations()

  for sentence in tokenized_sentences:

    # tags all parts of speech in the tokenized sentence
    tagged_words = nltk.pos_tag(nltk.word_tokenize(sentence))

    # then chunks with the custom grammar; 'NP' children of the result are
    # nltk.tree.Tree instances, everything else is a plain (word, tag) pair
    np_chunks = parser.parse(tagged_words)
    noun_phrases = [
      ' '.join(word for word, _tag in np_chunk.leaves())
      for np_chunk in np_chunks
      if isinstance(np_chunk, nltk.tree.Tree) and np_chunk.label() == 'NP'
    ]

    # Using the named-entity chunker to get all the organizations
    chunks = nltk.ne_chunk(tagged_words)
    for chunk in chunks:
      if isinstance(chunk, nltk.tree.Tree) and chunk.label() == 'ORGANIZATION':
        # only the first token of the entity is used for matching
        (organization, tag) = chunk[0]
        if organization in avoid_organizations:
          continue

        # if organization is in the noun_phrase, there is a high chance of
        # the noun_phrase containing the employer name
        # eg, Delta Force is added to organizations even if only Delta is
        # recognized as an organization but Delta Force is a noun-phrase
        for noun_phrase in noun_phrases:
          if organization in noun_phrase:
            organizations.add(noun_phrase)

  return organizations
项目:patentdata    作者:benhoyle    | 项目源码 | 文件源码
def label_nounphrases(self):
    """Label noun phrases in the output from pos chunking.

    Reads ``self.pos``, chunks it into ``NP`` subtrees and gives every
    distinct noun phrase (ignoring determiners and possessive pronouns) a
    numeric id.  A phrase that ``ends_with`` an already-known phrase reuses
    that phrase's id.

    Returns:
        A ``(flat_list, mapping_dict)`` tuple: ``flat_list`` holds
        ``(word, tag, np_id)`` triples for every token, with ``""`` as the
        id of tokens outside any noun phrase; ``mapping_dict`` maps each
        noun-phrase string to its id.
    """
    # Raw string so that ``\$`` reaches the chunk parser unchanged.
    grammar = r'''
        NP: {<DT|PRP\$> <VBG> <NN.*>+}
            {<DT|PRP\$> <NN.*> <POS> <JJ>* <NN.*>+}
            {<DT|PRP\$>? <JJ>* <NN.*>+ }
    '''

    cp = nltk.RegexpParser(grammar)
    result = cp.parse(self.pos)
    ptree = nltk.tree.ParentedTree.convert(result)
    subtrees = ptree.subtrees(filter=lambda x: x.label() == 'NP')

    # build up mapping dict - if not in dict add new entry id+1;
    # if in dict label using key
    mapping_dict = {}
    pos_to_np = {}
    for st in subtrees:
        # Membership test: ``!= ("DT" or "PRP$")`` would only compare
        # against "DT" and let possessive pronouns through.
        np_string = " ".join(
            leaf[0] for leaf in st.leaves()
            if leaf[1] not in ("DT", "PRP$")
        )
        np_id = mapping_dict.get(np_string)
        if np_id is None:
            # Reuse the id of the first known phrase this one ends with.
            ends_with_list = [
                known for known in mapping_dict if ends_with(np_string, known)
            ]
            if ends_with_list:
                np_id = mapping_dict[ends_with_list[0]]
            else:
                np_id = len(mapping_dict) + 1
                mapping_dict[np_string] = np_id
        pos_to_np[st.parent_index()] = np_id

    # Label Tree with entities
    flat_list = []
    for i, node in enumerate(ptree):
        np_label = pos_to_np.get(i, "")
        if isinstance(node, nltk.tree.Tree):
            # Unpack each leaf and add the label as a triple
            flat_list.extend(
                (leaf[0], leaf[1], np_label) for leaf in node.leaves()
            )
        else:
            # Plain (word, tag) token outside any chunk
            flat_list.append((node[0], node[1], np_label))
    return (flat_list, mapping_dict)
项目:memex-dossier-open    作者:dossier    | 项目源码 | 文件源码
def noun_phrases_as_tokens(text):
    '''Generate a bag of lists of unnormalized tokens representing noun
    phrases from ``text``.

    This is built around python's nltk library for getting Noun
    Phrases (NPs), following the tokenizing and chunking recipes shown in
    the NLTK Book.

    :param text: the raw text to analyse
    :rtype: list of lists of strings
    '''
    if len(text.strip()) == 0:
        return []

    ## from NLTK Book.  Groups are non-capturing so that the tokenizer
    ## returns whole-match strings rather than tuples of groups.
    ## NOTE(review): ``:-_`` inside the last character class is a range
    ## (':' through '_'), not three literals -- kept unchanged; confirm.
    sentence_re = r'''(?x)      # set flag to allow verbose regexps
          (?:[A-Z])(?:\.[A-Z])+\.?  # abbreviations, e.g. U.S.A.
        | \w+(?:-\w+)*            # words with optional internal hyphens
        | \$?\d+(?:\.\d+)?%?      # currency and percentages, e.g. $12.40, 82%
        | \.\.\.                # ellipsis
        | [][.,;"'?():-_`]      # these are separate tokens
    '''

    ## From Su Nam Kim paper.
    ## NOTE(review): the NBAR / NP stage labels and the bare {<NBAR>} rule
    ## follow the usual form of this grammar -- confirm; NP is the label
    ## that leaves() below filters on.
    grammar = r'''
        NBAR:
            {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns

        NP:
            {<NBAR>}
            {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
    '''

    chunker = nltk.RegexpParser(grammar)

    toks = nltk.regexp_tokenize(text, sentence_re)
    postoks = nltk.tag.pos_tag(toks)

    tree = chunker.parse(postoks)
    # A set gives constant-time stopword checks in acceptable_word().
    stops = set(stopwords.words('english'))
    stops.update(dossier_stopwords())

    ## The helper functions below are standard uses of NLTK.
    def leaves(tree):
        '''Yield the leaf list of every NP (noun phrase) subtree.'''
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
            yield subtree.leaves()

    def acceptable_word(word):
        '''Check conditions for an acceptable word: length, stopword.'''
        return 2 <= len(word) <= 40 and word.lower() not in stops

    def get_terms(tree):
        '''Yield, per noun phrase, the list of its acceptable words.'''
        for leaf in leaves(tree):
            yield [w for w, t in leaf if acceptable_word(w)]

    return list(get_terms(tree))