diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 51d849b..99de2e0 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,6 +2,14 @@ qwikidata Change Log ==================== +v0.3.0 +====== + +**Added**: + +* utility module with dump_entities_to_json function +* example directory referenced from README + v0.2.1 ====== diff --git a/README.rst b/README.rst index 94c3c18..627e80b 100644 --- a/README.rst +++ b/README.rst @@ -43,22 +43,27 @@ You can install the most recent version using pip, pip install qwikidata -Quick Start -=========== +Quick Examples +============== -.. code-block:: python - from qwikidata.linked_data_interface import get_entity_dict_from_api - from qwikidata.entity import WikidataItem, WikidataProperty, WikidataLexeme +Linked Data Interface +--------------------- - q42_dict = get_entity_dict_from_api('Q42') - q42 = WikidataItem(q42_dict) +.. literalinclude:: ../examples/basic_linked_data_interface.py - p279_dict = get_entity_dict_from_api('P279') - p279 = WikidataProperty(p279_dict) - l3_dict = get_entity_dict_from_api('L3') - l3 = WikidataLexeme(l3_dict) +SPARQL Query Service +-------------------- + +.. literalinclude:: ../examples/basic_sparql_query_service.py + + +JSON Dump +--------- + +.. 
import time

from qwikidata.entity import WikidataItem
from qwikidata.json_dump import WikidataJsonDump
from qwikidata.utils import dump_entities_to_json

P_OCCUPATION = "P106"
Q_POLITICIAN = "Q82955"


def has_occupation_politician(item: WikidataItem, truthy: bool = True) -> bool:
    """Return True if the Wikidata Item has occupation politician.

    Parameters
    ----------
    item
        The item whose occupation (``P106``) claims are inspected.
    truthy
        If True, read the claims from ``item.get_truthy_claim_group``;
        otherwise read them from ``item.get_claim_group``.
    """
    if truthy:
        claim_group = item.get_truthy_claim_group(P_OCCUPATION)
    else:
        claim_group = item.get_claim_group(P_OCCUPATION)

    # a datavalue is only read from claims whose main snak has snaktype "value"
    occupation_qids = [
        claim.mainsnak.datavalue.value["id"]
        for claim in claim_group
        if claim.mainsnak.snaktype == "value"
    ]
    return Q_POLITICIAN in occupation_qids


# create an instance of WikidataJsonDump
wjd_dump_path = "wikidata-20190401-all.json.bz2"
wjd = WikidataJsonDump(wjd_dump_path)

# create an iterable of WikidataItem representing politicians
politicians = []
# perf_counter is monotonic, so the elapsed time below can never be negative
t1 = time.perf_counter()
for ii, entity_dict in enumerate(wjd):

    if entity_dict["type"] == "item":
        entity = WikidataItem(entity_dict)
        if has_occupation_politician(entity):
            politicians.append(entity)

    if ii % 1000 == 0:
        t2 = time.perf_counter()
        dt = t2 - t1
        # the first report can fire before the clock has advanced; avoid
        # dividing by zero in that case
        rate = ii / dt if dt > 0 else 0.0
        print(
            "found {} politicians among {} entities [entities/s: {:.2f}]".format(
                len(politicians), ii, rate
            )
        )

    # keep the example short: stop after roughly the first 10,000 entities
    if ii > 10_000:
        break

# write the iterable of WikidataItem to disk as JSON
out_fname = "filtered_entities.json"
dump_entities_to_json(politicians, out_fname)
wjd_filtered = WikidataJsonDump(out_fname)

# load filtered entities and create instances of WikidataItem
for entity_dict in wjd_filtered:
    item = WikidataItem(entity_dict)
from qwikidata.entity import WikidataItem, WikidataLexeme, WikidataProperty
from qwikidata.linked_data_interface import get_entity_dict_from_api

# ids of the three entities used in this example
Q_DOUGLAS_ADAMS = "Q42"  # item: "Douglas Adams"
P_SUBCLASS_OF = "P279"  # property: "subclass of"
L_BANK = "L3354"  # lexeme: "bank"

# item: get the entity dict for the id, then wrap it in WikidataItem
q42_dict = get_entity_dict_from_api(Q_DOUGLAS_ADAMS)
q42 = WikidataItem(q42_dict)

# property: same two steps, wrapped in WikidataProperty
p279_dict = get_entity_dict_from_api(P_SUBCLASS_OF)
p279 = WikidataProperty(p279_dict)

# lexeme: same two steps, wrapped in WikidataLexeme
l3354_dict = get_entity_dict_from_api(L_BANK)
l3354 = WikidataLexeme(l3354_dict)
def pairwise(iterable: Iterable) -> Iterator[Tuple]:
    """Return pairwise tuples s -> (s0,s1), (s1,s2), (s2, s3), ...

    An iterable with fewer than two elements yields no tuples at all.
    """
    a, b = itertools.tee(iterable)
    # advance the second copy by one so zip pairs each element with its successor
    next(b, None)
    return zip(a, b)


def dump_entities_to_json(entities: Iterable["WikidataEntity"], out_fname: str) -> None:
    """Write entities to JSON file.

    The file holds a JSON array with one entity per line::

        [
        {...},
        {...}
        ]

    ``entities`` is consumed in a single pass, so a generator works as well
    as a list.  Zero entities produce an empty array and a single entity
    produces a one-element array.

    Parameters
    ----------
    entities
        An iterable of instances of WikidataEntity; the ``_entity_dict`` of
        each one is serialized with ``json.dumps``.
    out_fname
        Output file name
    """
    with open(out_fname, "w") as fp:
        fp.write("[")
        # the first entity follows the opening bracket on a new line; every
        # later entity is preceded by a comma that ends the previous line
        separator = "\n"
        for entity in entities:
            fp.write(separator)
            fp.write(json.dumps(entity._entity_dict))
            separator = ",\n"
        fp.write("\n]")