Source code for geograpy.extraction
import re
import nltk
from newspaper import Article
from geograpy.labels import Labels
class Extractor(object):
    """
    Extract geo context for text or from url
    """
    def __init__(self, text=None, url=None, debug=False):
        """
        Constructor

        Args:
            text(string): the text to analyze
            url(string): the url to read the text to analyze from
            debug(boolean): if True show debug information
        """
        if not text and not url:
            raise Exception("text or url is required")
        self.debug = debug
        self.text = text
        self.url = url
        self.places = []
        nltk_packages = [
            "maxent_ne_chunker",
            "words",
            "treebank",
            "maxent_treebank_pos_tagger",
            "punkt",
            "averaged_perceptron_tagger",
        ]
        # make sure the required NLTK resources are available
        for nltk_package in nltk_packages:
            try:
                nltk.data.find(nltk_package)
            except LookupError:
                nltk.downloader.download(nltk_package, quiet=True)
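    # Illustrative usage sketch (comment only, not part of the module):
    # the constructor is typically called with either a text snippet or a url;
    # the url below is an assumption chosen only for illustration:
    #
    #   e1 = Extractor(text="Berlin is the capital of Germany")
    #   e2 = Extractor(url="https://en.wikipedia.org/wiki/Berlin")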
    def set_text(self):
        """
        set the text to analyze by downloading and parsing it from self.url
        if no text has been provided yet
        """
        if not self.text and self.url:
            a = Article(self.url)
            a.download()
            a.parse()
            self.text = a.text
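    # Note (comment only): once self.text is set, calling set_text() again is a
    # no-op, so the url is downloaded at most once per Extractor instance.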
def split(self, delimiter=r","):
"""
simpler regular expression splitter with not entity check
hat tip: https://stackoverflow.com/a/1059601/1497139
"""
self.set_text()
self.places = re.split(delimiter, self.text)
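    # Illustrative sketch (comment only): split() performs no entity
    # recognition, it only splits the text on the delimiter, e.g.
    #
    #   e = Extractor(text="Paris, London, New York")
    #   e.split()
    #   # e.places is now ['Paris', ' London', ' New York']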
    def find_geoEntities(self):
        """
        Find geographic entities

        Returns:
            list:
                List of places
        """
        self.find_entities(Labels.geo)
        return self.places
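    # Illustrative sketch (comment only): find_geoEntities() is find_entities()
    # restricted to the geographic labels in Labels.geo, e.g.
    #
    #   e = Extractor(text="I flew from Paris to Berlin")
    #   e.find_geoEntities()   # expected to include 'Paris' and 'Berlin'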
    def find_entities(self, labels=Labels.default):
        """
        Find entities with the given labels, set self.places and return it

        Args:
            labels(list): the named-entity labels to filter for

        Returns:
            list:
                List of places
        """
        self.set_text()
        # tokenize, part-of-speech tag and named-entity chunk the text with NLTK
        text = nltk.word_tokenize(self.text)
        nes = nltk.ne_chunk(nltk.pos_tag(text))
        for ne in nes:
            if type(ne) is nltk.tree.Tree:
                nelabel = ne.label()
                if nelabel in labels:
                    # join the tokens of the named entity into a single place name
                    leaves = ne.leaves()
                    if self.debug:
                        print(leaves)
                    self.places.append(" ".join([i[0] for i in leaves]))
        return self.places
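# Illustrative usage sketch (not part of the original module): running the file
# directly would extract named entities from a small sample sentence; the
# sample text below is an assumption chosen only for demonstration.
if __name__ == "__main__":
    sample = "John flew from Houston to Santa Fe with a layover in Atlanta"
    extractor = Extractor(text=sample, debug=True)
    places = extractor.find_entities()
    print(places)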