Source code for geograpy.extraction
import re
import nltk
from newspaper import Article
from geograpy.labels import Labels
class Extractor(object):
    """
    Extract geo context for text or from url
    """
    def __init__(self, text=None, url=None, debug=False):
        """
        Constructor

        Args:
            text(string): the text to analyze
            url(string): the url to read the text to analyze from
            debug(boolean): if True show debug information
        """
        if not text and not url:
            raise Exception("text or url is required")
        self.debug = debug
        self.text = text
        self.url = url
        self.places = []
        nltk_packages = [
            "maxent_ne_chunker",
            "words",
            "treebank",
            "maxent_treebank_pos_tagger",
            "punkt",
            "averaged_perceptron_tagger",
        ]
        # make sure the required NLTK resources are available
        for nltk_package in nltk_packages:
            try:
                nltk.data.find(nltk_package)
            except LookupError:
                nltk.downloader.download(nltk_package, quiet=True)
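    # Illustrative usage sketch (comment only, not part of the module):
    # the constructor is typically called with either a text snippet or a url;
    # the url below is an assumption chosen only for illustration:
    #
    #   e1 = Extractor(text="Berlin is the capital of Germany")
    #   e2 = Extractor(url="https://en.wikipedia.org/wiki/Berlin")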
    def set_text(self):
        """
        set the text to analyze by downloading and parsing it from self.url
        if no text has been provided yet
        """
        if not self.text and self.url:
            a = Article(self.url)
            a.download()
            a.parse()
            self.text = a.text
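    # Note (comment only): once self.text is set, calling set_text() again is a
    # no-op, so the url is downloaded at most once per Extractor instance.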
def split(self, delimiter=r","):
"""
simpler regular expression splitter with not entity check
hat tip: https://stackoverflow.com/a/1059601/1497139
"""
self.set_text()
self.places = re.split(delimiter, self.text)
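    # Illustrative sketch (comment only): split() performs no entity
    # recognition, it only splits the text on the delimiter, e.g.
    #
    #   e = Extractor(text="Paris, London, New York")
    #   e.split()
    #   # e.places is now ['Paris', ' London', ' New York']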
    def find_geoEntities(self):
        """
        Find geographic entities

        Returns:
            list:
                List of places
        """
        self.find_entities(Labels.geo)
        return self.places
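    # Illustrative sketch (comment only): find_geoEntities() is find_entities()
    # restricted to the geographic labels in Labels.geo, e.g.
    #
    #   e = Extractor(text="I flew from Paris to Berlin")
    #   e.find_geoEntities()   # expected to include 'Paris' and 'Berlin'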
    def find_entities(self, labels=Labels.default):
        """
        Find entities with the given labels, set self.places and return it

        Args:
            labels(list): the named-entity labels to filter for

        Returns:
            list:
                List of places
        """
        self.set_text()
        # tokenize, part-of-speech tag and named-entity chunk the text with NLTK
        text = nltk.word_tokenize(self.text)
        nes = nltk.ne_chunk(nltk.pos_tag(text))
        for ne in nes:
            if type(ne) is nltk.tree.Tree:
                nelabel = ne.label()
                if nelabel in labels:
                    # join the tokens of the named entity into a single place name
                    leaves = ne.leaves()
                    if self.debug:
                        print(leaves)
                    self.places.append(" ".join([i[0] for i in leaves]))
        return self.places
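# Illustrative usage sketch (not part of the original module): running the file
# directly would extract named entities from a small sample sentence; the
# sample text below is an assumption chosen only for demonstration.
if __name__ == "__main__":
    sample = "John flew from Houston to Santa Fe with a layover in Atlanta"
    extractor = Extractor(text=sample, debug=True)
    places = extractor.find_entities()
    print(places)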