Skip to content

Commit

Permalink
Output and docs (#22)
Browse files Browse the repository at this point in the history
* added utils module with `dump_entities_to_json` function
* created an examples directory and linked the examples in the README
  • Loading branch information
galtay authored Apr 10, 2019
1 parent faeb8cc commit 3af9dba
Show file tree
Hide file tree
Showing 8 changed files with 156 additions and 12 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@
qwikidata Change Log
====================

v0.3.0
======

**Added**:

* utility module with dump_entities_to_json function
* example directory referenced from README

v0.2.1
======

Expand Down
27 changes: 16 additions & 11 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -43,22 +43,27 @@ You can install the most recent version using pip,
pip install qwikidata
Quick Start
===========
Quick Examples
==============

.. code-block:: python

from qwikidata.linked_data_interface import get_entity_dict_from_api
from qwikidata.entity import WikidataItem, WikidataProperty, WikidataLexeme
Linked Data Interface
---------------------

q42_dict = get_entity_dict_from_api('Q42')
q42 = WikidataItem(q42_dict)
.. literalinclude:: ../examples/basic_linked_data_interface.py

p279_dict = get_entity_dict_from_api('P279')
p279 = WikidataProperty(p279_dict)

l3_dict = get_entity_dict_from_api('L3')
l3 = WikidataLexeme(l3_dict)
SPARQL Query Service
--------------------

.. literalinclude:: ../examples/basic_sparql_query_service.py


JSON Dump
---------

.. literalinclude:: ../examples/basic_json_dump.py



License
Expand Down
59 changes: 59 additions & 0 deletions examples/basic_json_dump.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import time

from qwikidata.entity import WikidataItem
from qwikidata.json_dump import WikidataJsonDump
from qwikidata.utils import dump_entities_to_json

P_OCCUPATION = "P106"
Q_POLITICIAN = "Q82955"


def has_occupation_politician(item: WikidataItem, truthy: bool = True) -> bool:
    """Tell whether politician is among the item's occupation values.

    When ``truthy`` is set, the item's truthy claim group for the occupation
    property is inspected; otherwise its full claim group is used.  Claims
    whose main snak is not of snaktype "value" are skipped.
    """
    fetch_claim_group = item.get_truthy_claim_group if truthy else item.get_claim_group

    # lazily keep only the claims that actually carry a value
    value_claims = (
        claim
        for claim in fetch_claim_group(P_OCCUPATION)
        if claim.mainsnak.snaktype == "value"
    )
    occupation_qids = [claim.mainsnak.datavalue.value["id"] for claim in value_claims]
    return Q_POLITICIAN in occupation_qids


# create an instance of WikidataJsonDump
wjd_dump_path = "wikidata-20190401-all.json.bz2"
wjd = WikidataJsonDump(wjd_dump_path)

# create an iterable of WikidataItem representing politicians
politicians = []
t1 = time.time()
for ii, entity_dict in enumerate(wjd):

    # only items are wrapped and tested; other entity types are skipped
    if entity_dict["type"] == "item":
        entity = WikidataItem(entity_dict)
        if has_occupation_politician(entity):
            politicians.append(entity)

    # report progress every 1000 entities
    if ii % 1000 == 0:
        t2 = time.time()
        dt = t2 - t1
        # the first report fires at ii == 0, where dt can be exactly 0.0 on a
        # coarse clock; guard the division instead of raising ZeroDivisionError
        rate = ii / dt if dt > 0 else 0.0
        print(
            "found {} politicians among {} entities [entities/s: {:.2f}]".format(
                len(politicians), ii, rate
            )
        )

    # stop early so the example finishes quickly
    if ii > 10_000:
        break

# write the iterable of WikidataItem to disk as JSON
out_fname = "filtered_entities.json"
dump_entities_to_json(politicians, out_fname)
wjd_filtered = WikidataJsonDump(out_fname)

# load filtered entities and create instances of WikidataItem
for entity_dict in wjd_filtered:
    item = WikidataItem(entity_dict)
17 changes: 17 additions & 0 deletions examples/basic_linked_data_interface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from qwikidata.entity import WikidataItem, WikidataLexeme, WikidataProperty
from qwikidata.linked_data_interface import get_entity_dict_from_api

# ids of the entities fetched in this example
Q_DOUGLAS_ADAMS = "Q42"  # item "Douglas Adams"
P_SUBCLASS_OF = "P279"  # property "subclass of"
L_BANK = "L3354"  # lexeme "bank"

# item: fetch the entity dict, then wrap it in WikidataItem
q42_dict = get_entity_dict_from_api(Q_DOUGLAS_ADAMS)
q42 = WikidataItem(q42_dict)

# property: fetch the entity dict, then wrap it in WikidataProperty
p279_dict = get_entity_dict_from_api(P_SUBCLASS_OF)
p279 = WikidataProperty(p279_dict)

# lexeme: fetch the entity dict, then wrap it in WikidataLexeme
l3354_dict = get_entity_dict_from_api(L_BANK)
l3354 = WikidataLexeme(l3354_dict)
17 changes: 17 additions & 0 deletions examples/basic_sparql_query_service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from qwikidata.sparql import (get_subclasses_of_item,
return_sparql_query_results)

# item id passed to the convenience function at the bottom
Q_RIVER = "Q4022"

# an example query that counts the number of humans
sparql_query = """
SELECT (COUNT(?item) AS ?count)
WHERE {
?item wdt:P31/wdt:P279* wd:Q5 .
}
"""

# any sparql query can be sent to the wikidata query service this way;
# the full result is returned
res = return_sparql_query_results(sparql_query)

# convenience function: subclasses of an item as a list of item ids
subclasses_of_river = get_subclasses_of_item(Q_RIVER)
2 changes: 1 addition & 1 deletion qwikidata/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
"""Metadata for this package."""

# Name of this package.
__package_name__ = "qwikidata"
# NOTE(review): two assignments are shown here, apparently the removed and the
# added line of a rendered diff; as written, the later value ("0.3.0") is the
# one that takes effect. Confirm only that line exists in the real file.
__version__ = "0.2.1"
__version__ = "0.3.0"
3 changes: 3 additions & 0 deletions qwikidata/entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -575,3 +575,6 @@ def __str__(self) -> str:

# The repr is the same text that __str__ produces for this object.
def __repr__(self) -> str:
return self.__str__()


# Type alias: any one of the three entity classes (item, property or lexeme).
WikidataEntity = Union[WikidataItem, WikidataProperty, WikidataLexeme]
35 changes: 35 additions & 0 deletions qwikidata/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Copyright 2019 Kensho Technologies, LLC.
"""qwikidata utilities."""

import itertools
import json
from typing import Iterable, Iterator, Tuple

from qwikidata.entity import WikidataEntity


def pairwise(iterable: Iterable) -> Iterator[Tuple]:
    """Return an iterator over overlapping neighbours of ``iterable``.

    s -> (s0, s1), (s1, s2), (s2, s3), ...

    An iterable with fewer than two elements produces no pairs.
    """
    leading, trailing = itertools.tee(iterable, 2)
    # move the second copy one step ahead so zip matches each element with
    # its successor; the default keeps an empty iterable from raising
    next(trailing, None)
    return zip(leading, trailing)


def dump_entities_to_json(entities: Iterable[WikidataEntity], out_fname: str) -> None:
    """Write entities to JSON file.

    The output is a JSON array with one entity per line.  The iterable is
    consumed in a single pass and may hold any number of entities: none
    (an empty array is written), exactly one, or many.

    Parameters
    ----------
    entities
      An iterable of instances of WikidataEntity
    out_fname
      Output file name
    """
    with open(out_fname, "w") as fp:
        fp.write("[")
        # the first entity follows a bare newline, every later one a ",\n";
        # this keeps the layout "[\n{...},\n{...}\n]" without needing to know
        # in advance which entity is the last one
        separator = "\n"
        for entity in entities:
            fp.write(separator)
            fp.write(json.dumps(entity._entity_dict))
            separator = ",\n"
        fp.write("\n]")

0 comments on commit 3af9dba

Please sign in to comment.