"""
Created on 2020-09-23
@author: wf
"""
import re
from lodstorage.sparql import SPARQL
from geograpy.utils import Profiler
# (stray "[docs]" link marker from the HTML source rendering removed here)
class Wikidata(object):
    """
    Wikidata access

    Runs SPARQL queries against a Wikidata endpoint and post-processes the
    result rows: Wikidata IRIs are shortened to their ids and WKT ``Point``
    coordinates are split into ``lat`` / ``lon`` entries.
    """

    # string values starting with this prefix are replaced by their P/Q id
    _WIKIDATA_URL_PREFIX = "http://www.wikidata.org/"
    # float with optional sign and "." or "," as decimal separator
    # https://stackoverflow.com/a/18237992/1497139
    _FLOAT_REGEX = r"[-+]?\d+([.,]\d*)?"
    # WKT point as returned by wikidata: longitude first, then latitude
    _POINT_PATTERN = re.compile(
        rf"Point\((?P<lon>{_FLOAT_REGEX})\s+(?P<lat>{_FLOAT_REGEX})\)"
    )
    # regex pattern taken from https://www.wikidata.org/wiki/Q43649390
    # and extended to also support property ids
    _WIKIDATA_ID_PATTERN = re.compile(r"[PQ][1-9]\d*")

    def __init__(
        self, endpoint: str = "https://query.wikidata.org/sparql", profile: bool = True
    ):
        """
        Constructor

        Args:
            endpoint(str): the URL of the SPARQL endpoint to query
            profile(bool): passed on to the Profiler instances created by
                query and store2DB
        """
        self.endpoint = endpoint
        self.profile = profile

    def query(self, msg, queryString: str, limit=None) -> list:
        """
        get the query result

        Args:
            msg(str): the profile message to display
            queryString(str): the query to execute
            limit(int): if not None a " LIMIT <limit>" clause is appended
                to the query string

        Return:
            list: the list of dicts with the result - wikidata URLs are
            replaced by their ids and "...coord" entries by "lat"/"lon"
        """
        profile = Profiler(msg, profile=self.profile)
        wd = SPARQL(self.endpoint)
        limitedQuery = queryString
        if limit is not None:
            limitedQuery = f"{queryString} LIMIT {limit}"
        results = wd.query(limitedQuery)
        lod = wd.asListOfDicts(results)
        for record in lod:
            self._normalizeRecord(record)
        profile.time(f"({len(lod)})")
        return lod

    def _normalizeRecord(self, record: dict) -> None:
        """
        post-process a single result record in place

        String values that start with the wikidata URL prefix are replaced
        by their wikidata id. String values stored under a key ending in
        "coord" (case-insensitive) are removed and replaced by the two
        entries "lat" and "lon".

        Args:
            record(dict): the record to modify
        """
        # iterate over a snapshot of the keys since the record is
        # modified (keys added and removed) inside the loop
        for key in list(record.keys()):
            value = record[key]
            if not isinstance(value, str):
                continue
            if value.startswith(self._WIKIDATA_URL_PREFIX):
                record[key] = self.getWikidataId(value)
            if key.lower().endswith("coord"):
                lat, lon = Wikidata.getCoordinateComponents(value)
                record["lat"] = lat
                record["lon"] = lon
                record.pop(key)

    def store2DB(self, lod, tableName: str, primaryKey: str = None, sqlDB=None):
        """
        store the given list of dicts to the database

        The table is (re)created with withDrop=True before the records
        are stored.

        Args:
            lod(list): the list of dicts
            tableName(str): the table name to use
            primaryKey(str): primary key (if any)
            sqlDB(SQLDB): target SQL database

        Raises:
            ValueError: if no sqlDB is given
        """
        if sqlDB is None:
            # fail with a clear message instead of an AttributeError on None
            raise ValueError(f"no sqlDB given for storing {tableName}")
        msg = f"Storing {tableName}"
        profile = Profiler(msg, profile=self.profile)
        entityInfo = sqlDB.createTable(
            lod,
            entityName=tableName,
            primaryKey=primaryKey,
            withDrop=True,
            sampleRecordCount=-1,
        )
        sqlDB.store(lod, entityInfo, fixNone=True)
        profile.time()

    def getCountries(self, limit=None):
        """
        get a list of countries

        The query selects wikidataid, name (english label), iso (P297),
        pop (maximum of the P1082 values, optional) and coord (P625,
        optional) ordered by iso.

        NOTE: the linked query below selects different variables than the
        query string that is executed - it appears to be an earlier variant.

        `try query <https://query.wikidata.org/#%23%20get%20a%20list%20of%20countries%0A%23%20for%20geograpy3%20library%0A%23%20see%20https%3A%2F%2Fgithub.com%2Fsomnathrakshit%2Fgeograpy3%2Fissues%2F15%0APREFIX%20rdfs%3A%20%3Chttp%3A%2F%2Fwww.w3.org%2F2000%2F01%2Frdf-schema%23%3E%0APREFIX%20wd%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fentity%2F%3E%0APREFIX%20wdt%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fprop%2Fdirect%2F%3E%0APREFIX%20p%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fprop%2F%3E%0APREFIX%20ps%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fprop%2Fstatement%2F%3E%0APREFIX%20pq%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fprop%2Fqualifier%2F%3E%0A%23%20get%20City%20details%20with%20Country%0ASELECT%20DISTINCT%20%3Fcountry%20%3FcountryLabel%20%3FcountryIsoCode%20%3FcountryPopulation%20%3FcountryGDP_perCapita%20%3Fcoord%20%20WHERE%20%7B%0A%20%20%23%20instance%20of%20City%20Country%0A%20%20%3Fcountry%20wdt%3AP31%2Fwdt%3AP279%2a%20wd%3AQ3624078%20.%0A%20%20%23%20label%20for%20the%20country%0A%20%20%3Fcountry%20rdfs%3Alabel%20%3FcountryLabel%20filter%20%28lang%28%3FcountryLabel%29%20%3D%20%22en%22%29.%0A%20%20%23%20get%20the%20coordinates%0A%20%20%3Fcountry%20wdt%3AP625%20%3Fcoord.%0A%20%20%23%20https%3A%2F%2Fwww.wikidata.org%2Fwiki%2FProperty%3AP297%20ISO%203166-1%20alpha-2%20code%0A%20%20%3Fcountry%20wdt%3AP297%20%3FcountryIsoCode.%0A%20%20%23%20population%20of%20country%0A%20%20%3Fcountry%20wdt%3AP1082%20%3FcountryPopulation.%0A%20%20%23%20https%3A%2F%2Fwww.wikidata.org%2Fwiki%2FProperty%3AP2132%0A%20%20%23%20nonminal%20GDP%20per%20capita%0A%20%20%3Fcountry%20wdt%3AP2132%20%3FcountryGDP_perCapita.%0A%7D>`_

        Args:
            limit(int): optional maximum number of result rows

        Return:
            list: the list of dicts as returned by query
        """
        queryString = """# get a list of countries
# for geograpy3 library
# see https://github.com/somnathrakshit/geograpy3/issues/15
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX ps: <http://www.wikidata.org/prop/statement/>
PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
# get City details with Country
SELECT DISTINCT ?wikidataid ?name ?iso ?pop ?coord
WHERE {
BIND (?countryQ AS ?wikidataid)
# instance of Country
# inverse path see https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/query_optimization#Inverse_property_paths
wd:Q6256 ^wdt:P279*/^wdt:P31 ?countryQ .
# VALUES ?country { wd:Q55}.
# label for the country
?countryQ rdfs:label ?name filter (lang(?name) = "en").
# get the continent (s)
#OPTIONAL {
# ?country wdt:P30 ?continent.
# ?continent rdfs:label ?continentLabel filter (lang(?continentLabel) = "en").
#}
# get the coordinates
OPTIONAL {
?countryQ wdt:P625 ?coord.
}
# https://www.wikidata.org/wiki/Property:P297 ISO 3166-1 alpha-2 code
?countryQ wdt:P297 ?iso.
# population of country
OPTIONAL
{
SELECT ?countryQ (max(?countryPopulationValue) as ?pop)
WHERE {
?countryQ wdt:P1082 ?countryPopulationValue
} group by ?countryQ
}
# https://www.wikidata.org/wiki/Property:P2132
# nominal GDP per capita
# OPTIONAL { ?country wdt:P2132 ?countryGDP_perCapitaValue. }
}
ORDER BY ?iso"""
        msg = "Getting countries from wikidata ETA 10s"
        countryList = self.query(msg, queryString, limit=limit)
        return countryList

    def getRegions(self, limit=None):
        """
        get Regions from Wikidata

        The query selects countryId (P17, optional), wikidataid, name
        (english label, optional), iso (P300, mandatory), pop (P1082) and
        coord (P625) where iso, pop and coord are aggregated with max per
        region. The result is ordered by iso.

        NOTE: the linked query below selects different variables than the
        query string that is executed - it appears to be an earlier variant.

        `try query <https://query.wikidata.org/#%23%20get%20a%20list%20of%20regions%0A%23%20for%20geograpy3%20library%0A%23%20see%20https%3A%2F%2Fgithub.com%2Fsomnathrakshit%2Fgeograpy3%2Fissues%2F15%0APREFIX%20rdfs%3A%20%3Chttp%3A%2F%2Fwww.w3.org%2F2000%2F01%2Frdf-schema%23%3E%0APREFIX%20wd%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fentity%2F%3E%0APREFIX%20wdt%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fprop%2Fdirect%2F%3E%0APREFIX%20wikibase%3A%20%3Chttp%3A%2F%2Fwikiba.se%2Fontology%23%3E%0ASELECT%20%3Fcountry%20%3FcountryLabel%20%3FcountryIsoCode%20%3Fregion%20%3FregionIsoCode%20%3FregionLabel%20%3Fpopulation%20%3Flocation%0AWHERE%0A%7B%0A%20%20%23%20administrative%20unit%20of%20first%20order%0A%20%20%3Fregion%20wdt%3AP31%2Fwdt%3AP279%2a%20wd%3AQ10864048.%0A%20%20OPTIONAL%20%7B%0A%20%20%20%20%20%3Fregion%20rdfs%3Alabel%20%3FregionLabel%20filter%20%28lang%28%3FregionLabel%29%20%3D%20%22en%22%29.%0A%20%20%7D%0A%20%20%23%20filter%20historic%20regions%0A%20%20%23%20FILTER%20NOT%20EXISTS%20%7B%3Fregion%20wdt%3AP576%20%3Fend%7D%0A%20%20%23%20get%20the%20population%0A%20%20%23%20https%3A%2F%2Fwww.wikidata.org%2Fwiki%2FProperty%3AP1082%0A%20%20OPTIONAL%20%7B%20%3Fregion%20wdt%3AP1082%20%3Fpopulation.%20%7D%0A%20%20%23%20%23%20https%3A%2F%2Fwww.wikidata.org%2Fwiki%2FProperty%3AP297%0A%20%20OPTIONAL%20%7B%20%0A%20%20%20%20%3Fregion%20wdt%3AP17%20%3Fcountry.%0A%20%20%20%20%23%20label%20for%20the%20country%0A%20%20%20%20%3Fcountry%20rdfs%3Alabel%20%3FcountryLabel%20filter%20%28lang%28%3FcountryLabel%29%20%3D%20%22en%22%29.%0A%20%20%20%20%3Fcountry%20wdt%3AP297%20%3FcountryIsoCode.%20%0A%20%20%7D%0A%20%20%23%20isocode%20state%2Fprovince%0A%20%20%3Fregion%20wdt%3AP300%20%3FregionIsoCode.%0A%20%20%23%20https%3A%2F%2Fwww.wikidata.org%2Fwiki%2FProperty%3AP625%0A%20%20OPTIONAL%20%7B%20%3Fregion%20wdt%3AP625%20%3Flocation.%20%7D%0A%7D>`_

        Args:
            limit(int): optional maximum number of result rows

        Return:
            list: the list of dicts as returned by query
        """
        queryString = """# get a list of regions
# for geograpy3 library
# see https://github.com/somnathrakshit/geograpy3/issues/15
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wikibase: <http://wikiba.se/ontology#>
SELECT DISTINCT ?countryId (?regionQ as ?wikidataid) ?name ?iso ?pop ?coord
WHERE
{
# administrative unit of first order
?regionQ wdt:P31/wdt:P279* wd:Q10864048.
OPTIONAL {
?regionQ rdfs:label ?name filter (lang(?name) = "en").
}
# isocode state/province (mandatory - filters historic regions while at it ...)
# filter historic regions
# FILTER NOT EXISTS {?region wdt:P576 ?end}
{
SELECT ?regionQ (max(?regionAlpha2) as ?iso) (max(?regionPopulationValue) as ?pop) (max(?locationValue) as ?coord)
WHERE {
?regionQ wdt:P300 ?regionAlpha2.
# get the population
# https://www.wikidata.org/wiki/Property:P1082
OPTIONAL {
?regionQ wdt:P1082 ?regionPopulationValue
}
# get the location
# https://www.wikidata.org/wiki/Property:P625
OPTIONAL {
?regionQ wdt:P625 ?locationValue.
}
} GROUP BY ?regionQ
}
# # https://www.wikidata.org/wiki/Property:P297
OPTIONAL {
?regionQ wdt:P17 ?countryId.
}
} ORDER BY ?iso"""
        msg = "Getting regions from wikidata ETA 15s"
        regionList = self.query(msg, queryString, limit=limit)
        return regionList

    def getCities(self, limit=1000000):
        """
        get all human settlements as list of dict with duplicates for label, region, country ...

        Args:
            limit(int): maximum number of result rows (default: 1000000)

        Return:
            list: the list of dicts as returned by query
        """
        queryString = """PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
SELECT DISTINCT (?cityQ as ?wikidataid) ?city ?altLabel ?geoNameId ?gndId ?cityPopulation ?cityCoord ?regionId ?countryId
WHERE {
# instance of human settlement https://www.wikidata.org/wiki/Q486972
wd:Q486972 ^wdt:P279*/^wdt:P31 ?cityQ .
# Values
# VALUES ?cityQ { wd:Q656 }
# label of the City
?cityQ rdfs:label ?city filter (lang(?city) = "en").
OPTIONAL {
?cityQ skos:altLabel ?altLabel .
FILTER (lang(?altLabel) = "en")
}
# geoName Identifier
OPTIONAL {
?cityQ wdt:P1566 ?geoNameId.
}
# GND-ID
OPTIONAL {
?cityQ wdt:P227 ?gndId.
}
# population of city
OPTIONAL {
SELECT ?cityQ (max(?cityPopulationValue) as ?cityPopulation)
WHERE {
?cityQ wdt:P1082 ?cityPopulationValue
} group by ?cityQ
}
OPTIONAL{
?cityQ wdt:P625 ?cityCoord .
}
# region this city belongs to
OPTIONAL {
?cityQ wdt:P131 ?regionId .
}
# country this city belongs to
OPTIONAL {
?cityQ wdt:P17 ?countryId .
}
}
"""
        msg = "Getting cities (human settlements) from wikidata ETA 50 s"
        citiesList = self.query(msg, queryString, limit=limit)
        return citiesList

    def getCitiesForRegion(self, regionId, msg):
        """
        get the cities for the given Region

        Args:
            regionId(str): the wikidata id of the region (e.g. "Q980") -
                it is inserted into the query as "wd:<regionId>"
            msg(str): the profile message to display

        Return:
            list: the list of dicts as returned by query (no limit applied)
        """
        # the regions Q980 and Q21 use a fixed path of three P131 steps
        # instead of the general transitive P131* path
        # NOTE(review): presumably a workaround for query performance - confirm
        regionPath = (
            "?region ^wdt:P131/^wdt:P131/^wdt:P131 ?cityQ."
            if regionId in ["Q980", "Q21"]
            else "?cityQ wdt:P131* ?region."
        )
        queryString = """# get cities by region for geograpy3
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wd: <http://www.wikidata.org/entity/>
SELECT distinct (?cityQ as ?wikidataid) ?name ?geoNameId ?gndId ?regionId ?countryId ?pop ?coord WHERE {
VALUES ?hsType {
wd:Q1549591 wd:Q3957 wd:Q5119 wd:Q15284 wd:Q62049 wd:Q515 wd:Q1637706 wd:Q1093829 wd:Q486972 wd:Q532
}
VALUES ?region {
wd:%s
}
# region the city should be in
%s
# type of human settlement to try
?hsType ^wdt:P279*/^wdt:P31 ?cityQ.
# label of the City
?cityQ rdfs:label ?name filter (lang(?name) = "en").
# geoName Identifier
OPTIONAL {
?cityQ wdt:P1566 ?geoNameId.
}
# GND-ID
OPTIONAL {
?cityQ wdt:P227 ?gndId.
}
OPTIONAL{
?cityQ wdt:P625 ?coord .
}
# region this city belongs to
OPTIONAL {
?cityQ wdt:P131 ?regionId .
}
OPTIONAL {
?cityQ wdt:P1082 ?pop
}
# country this city belongs to
OPTIONAL {
?cityQ wdt:P17 ?countryId .
}
}""" % (
            regionId,
            regionPath,
        )
        regionCities = self.query(msg, queryString)
        return regionCities

    def getCityStates(self, limit=None):
        """
        get city states from Wikidata

        `try query <https://query.wikidata.org/#%23%20get%20a%20list%20of%20city%20states%0A%23%20for%20geograpy3%20library%0APREFIX%20rdfs%3A%20%3Chttp%3A%2F%2Fwww.w3.org%2F2000%2F01%2Frdf-schema%23%3E%0APREFIX%20wd%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fentity%2F%3E%0APREFIX%20wdt%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fprop%2Fdirect%2F%3E%0APREFIX%20wikibase%3A%20%3Chttp%3A%2F%2Fwikiba.se%2Fontology%23%3E%0ASELECT%20DISTINCT%20%3FcountryId%20%28%3FcityStateQ%20as%20%3Fwikidataid%29%20%3Fname%20%3Fiso%20%3Fpop%20%3Fcoord%0AWHERE%0A%7B%0A%20%20%23%20all%20citiy%20states%0A%20%20%3FcityStateQ%20wdt%3AP31%20wd%3AQ133442%20.%0A%20%20%3FcityStateQ%20rdfs%3Alabel%20%3Fname%20filter%20%28lang%28%3Fname%29%20%3D%20%22en%22%29.%0A%20%20%7B%20%0A%20%20%20%20SELECT%20%3FcityStateQ%20%28max%28%3FisoCode%29%20as%20%3Fiso%29%20%28max%28%3FpopulationValue%29%20as%20%3Fpop%29%20%28max%28%3FlocationValue%29%20as%20%3Fcoord%29%0A%20%20%20%20WHERE%20%7B%0A%20%20%20%20%20%20%3FcityStateQ%20wdt%3AP300%7Cwdt%3AP297%20%3FisoCode.%0A%20%20%20%20%20%20%23%20get%20the%20population%0A%20%20%20%20%20%20%23%20https%3A%2F%2Fwww.wikidata.org%2Fwiki%2FProperty%3AP1082%0A%20%20%20%20%20%20OPTIONAL%20%7B%0A%20%20%20%20%20%20%20%20%3FcityStateQ%20wdt%3AP1082%20%3FpopulationValue%0A%20%20%20%20%20%20%7D%20%0A%20%20%20%20%20%20%23%20get%20the%20location%0A%20%20%20%20%20%20%23%20https%3A%2F%2Fwww.wikidata.org%2Fwiki%2FProperty%3AP625%0A%20%20%20%20%20%20OPTIONAL%20%7B%0A%20%20%20%20%20%20%20%20%3FcityStateQ%20wdt%3AP625%20%3FlocationValue.%20%0A%20%20%20%20%20%20%20%7D%0A%20%20%20%20%7D%20GROUP%20BY%20%3FcityStateQ%0A%20%20%7D%0A%20%20OPTIONAL%20%7B%20%0A%20%20%20%20%3FcityStateQ%20wdt%3AP17%20%3FcountryId.%0A%20%20%7D%0A%7D%20ORDER%20BY%20%3Fiso>`_

        Args:
            limit(int): optional maximum number of result rows

        Return:
            list: the list of dicts as returned by query
        """
        queryString = """# get a list of city states
# for geograpy3 library
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wikibase: <http://wikiba.se/ontology#>
SELECT DISTINCT ?countryId (?cityStateQ as ?wikidataid) ?name ?iso ?pop ?coord
WHERE
{
# all citiy states
?cityStateQ wdt:P31 wd:Q133442 .
?cityStateQ rdfs:label ?name filter (lang(?name) = "en").
{
SELECT ?cityStateQ (max(?isoCode) as ?iso) (max(?populationValue) as ?pop) (max(?locationValue) as ?coord)
WHERE {
?cityStateQ wdt:P300|wdt:P297 ?isoCode.
# get the population
# https://www.wikidata.org/wiki/Property:P1082
OPTIONAL {
?cityStateQ wdt:P1082 ?populationValue
}
# get the location
# https://www.wikidata.org/wiki/Property:P625
OPTIONAL {
?cityStateQ wdt:P625 ?locationValue.
}
} GROUP BY ?cityStateQ
}
OPTIONAL {
?cityStateQ wdt:P17 ?countryId.
}
} ORDER BY ?iso"""
        # fixed: the message used to be the copy-pasted "Getting regions ..."
        msg = "Getting city states from wikidata ETA 15s"
        cityStateList = self.query(msg, queryString, limit=limit)
        return cityStateList

    @staticmethod
    def getCoordinateComponents(coordinate: str) -> tuple:
        """
        Converts the wikidata coordinate representation into its subcomponents latitude and longitude

        The wikidata representation lists the longitude first - the result
        has the latitude first. A longitude above 180 is shifted by -360.

        Example: 'Point(-118.25 35.05694444)' results in (35.05694444, -118.25)

        Args:
            coordinate: coordinate value in the format as returned by wikidata queries

        Returns:
            tuple: (lat, lon) as floats or (None, None) if the given
            coordinate is empty, not a string or does not contain a
            Point in the expected format
        """
        # only non-empty strings can be searched - anything else has no
        # coordinate information
        if not coordinate or not isinstance(coordinate, str):
            return None, None
        cMatch = Wikidata._POINT_PATTERN.search(coordinate)
        if cMatch is None:
            # coordinate does not have the expected format
            return None, None
        # "," is accepted as decimal separator and normalized to "."
        lat = float(cMatch.group("lat").replace(",", "."))
        lon = float(cMatch.group("lon").replace(",", "."))
        if lon > 180:
            lon = lon - 360
        return lat, lon

    @staticmethod
    def getWikidataId(wikidataURL: str):
        """
        Extracts the wikidata id from the given wikidata URL

        The first occurrence of "P" or "Q" followed by a number without
        leading zero is used.

        Args:
            wikidataURL: wikidata URL the id should be extracted from

        Returns:
            The wikidata id if present in the given wikidata URL otherwise None
        """
        wikidataidMatch = Wikidata._WIKIDATA_ID_PATTERN.search(wikidataURL)
        if wikidataidMatch is None:
            return None
        return wikidataidMatch.group(0)

    @staticmethod
    def getValuesClause(varName: str, values, wikidataEntities: bool = True):
        """
        generates the SPARQL value clause for the given variable name containing the given values

        Args:
            varName: variable name for the ValuesClause
            values: values for the clause - a list, tuple, set or frozenset
                of values; any other object is treated as a single value
            wikidataEntities(bool): if true the wikidata prefix is added to the values otherwise it is expected that the given values are proper IRIs

        Returns:
            str: the clause in the form "VALUES ?<varName> { <values> }"
        """
        prefix = "wd:" if wikidataEntities else ""
        if isinstance(values, (list, tuple, set, frozenset)):
            items = values
        else:
            items = [values]
        # each value is followed by a single blank (also the last one)
        clauseValues = "".join(f"{prefix}{value} " for value in items)
        clause = "VALUES ?%s { %s }" % (varName, clauseValues)
        return clause