Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

merged init download function from other repo #3

Merged
merged 1 commit into from
Jan 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions databusclient/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,9 @@ def deploy(


@app.command()
def download(collection: str):
    # Placeholder command: accepts a collection identifier but does nothing
    # with it yet; it only echoes the literal text "TODO".
    typer.echo("TODO")
# CLI entry for downloading: every option/argument is forwarded unchanged to
# client.download (the `databus` option is passed on as `endpoint`).
def download(
    localDir: str = typer.Option(..., help="local databus folder"),
    databus: str = typer.Option(..., help="databus URL"),
    databusURIs: List[str] = typer.Argument(
        ...,
        help="any kind of these: databus identifier, databus collection identifier, query file",
    ),
):
    client.download(
        localDir=localDir,
        endpoint=databus,
        databusURIs=databusURIs,
    )
100 changes: 100 additions & 0 deletions databusclient/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
import hashlib
import json
from hashlib import sha256
from typing import Iterator

import requests
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm import tqdm

# Module-wide debug switch: the code below prints the raw response text when
# either this flag or the per-call `debug` flag is truthy.
__debug = False

Expand Down Expand Up @@ -386,3 +389,100 @@ def deploy(
if debug or __debug:
print("---------")
print(resp.text)


def __download_file__(url, filename):
    """
    Download a file from the internet with a progress bar using tqdm.

    The body is streamed to disk in 1 KiB blocks, so the whole file is never
    held in memory.

    Parameters:
    - url: the URL of the file to download
    - filename: the local file path where the file should be saved

    Raises:
    - requests.HTTPError: if the server answers with a 4xx/5xx status
    """
    print("download "+url)
    block_size = 1024  # 1 Kibibyte

    # Using the response as a context manager releases the connection even
    # when the transfer is interrupted half-way.
    with requests.get(url, stream=True) as response:
        # Fail before the target file is opened, so an error page is never
        # stored as if it were the requested file.
        response.raise_for_status()
        # A missing Content-Length header yields 0, meaning "size unknown".
        total_size_in_bytes = int(response.headers.get('content-length', 0))

        progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
        try:
            with open(filename, 'wb') as file:
                for data in response.iter_content(block_size):
                    progress_bar.update(len(data))
                    file.write(data)
        finally:
            # Always close the bar so a failed transfer does not leave it open.
            progress_bar.close()

    # Only meaningful when the server announced a size.
    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
        print("ERROR, something went wrong")


def __query_sparql__(endpoint_url, query)-> dict:
    """
    Send a SPARQL query to an endpoint via POST and return the converted
    JSON result.

    Parameters:
    - endpoint_url: the URL of the SPARQL endpoint
    - query: the SPARQL query string

    Returns:
    - Dictionary containing the query results
    """
    wrapper = SPARQLWrapper(endpoint_url)
    wrapper.setReturnFormat(JSON)
    wrapper.setQuery(query)
    wrapper.method = 'POST'
    return wrapper.query().convert()


def __handle__databus_file_query__(endpoint_url, query) -> Iterator[str]:
    """
    Run a SPARQL query and lazily yield the single value of each result row.

    This is a generator: the query is only sent to the endpoint once the
    caller starts iterating.

    Parameters:
    - endpoint_url: the URL of the SPARQL endpoint
    - query: the SPARQL query string; each row should bind one variable

    Yields:
    - the 'value' entry of the one variable bound in each result row

    A row that binds more than one variable prints an error and ends the
    iteration; a row that binds no variable is skipped.
    """
    result_dict = __query_sparql__(endpoint_url, query)
    for binding in result_dict['results']['bindings']:
        if len(binding) > 1:
            print("Error multiple bindings in query response")
            break
        if not binding:
            # Nothing is bound in this row. Calling next() on an empty
            # iterator here would raise StopIteration inside a generator,
            # which Python turns into a RuntimeError, so skip the row.
            continue
        (cell,) = binding.values()
        yield cell['value']


def wsha256(raw: str):
    """Return the hexadecimal SHA-256 digest of `raw` encoded as UTF-8."""
    digest = sha256()
    digest.update(raw.encode('utf-8'))
    return digest.hexdigest()


def __handle_databus_collection__(endpoint, uri: str)-> str:
    """
    Fetch the SPARQL query text behind a databus collection URI.

    Parameters:
    - endpoint: not used by this function; kept so existing callers keep working
    - uri: the collection URI, requested with "Accept: text/sparql"

    Returns:
    - the response body as text

    Raises:
    - requests.HTTPError: if the server answers with a 4xx/5xx status
    """
    headers = {"Accept": "text/sparql"}
    response = requests.get(uri, headers=headers)
    # Without this check the body of an error page would be handed back as
    # if it were a SPARQL query.
    response.raise_for_status()
    return response.text


def __download_list__(urls: List[str], localDir: str):
    """
    Download every URL into `localDir`, creating the directory if needed.

    Parameters:
    - urls: iterable of download URLs
    - localDir: target directory; an empty string means the current directory

    Each file is stored under the SHA-256 hex digest of its URL (wsha256).
    """
    from pathlib import Path  # local import keeps this function self-contained

    target_dir = Path(localDir)
    # open(..., 'wb') does not create missing parent directories, so make
    # sure the target exists before the first download starts.
    target_dir.mkdir(parents=True, exist_ok=True)
    for url in urls:
        __download_file__(url=url, filename=str(target_dir / wsha256(url)))


def download(
    localDir: str,
    endpoint: str,
    databusURIs: List[str]
) -> None:
    """
    Download datasets to local storage from databus registry
    ------
    localDir: the local directory
    endpoint: URL of the SPARQL endpoint the file queries are sent to
    databusURIs: identifiers to access databus registered datasets; each
        entry is a collection URI (contains "/collections/") or a SPARQL
        query string. Other http(s) URIs and file:// URIs are reported as
        unsupported and skipped.
    """
    for databusURI in databusURIs:
        # dataID or databus collection
        if databusURI.startswith(("http://", "https://")):
            # databus collection
            if "/collections/" in databusURI:
                query = __handle_databus_collection__(endpoint, databusURI)
                res = __handle__databus_file_query__(endpoint, query)
            else:
                print("dataId not supported yet")
                # Nothing to download: skip, so `res` is never read while
                # unbound or left over from a previous entry.
                continue
        # query in local file
        elif databusURI.startswith("file://"):
            print("query in file not supported yet")
            continue
        # query as argument
        else:
            print("QUERY {}".format(databusURI.replace("\n", " ")))
            res = __handle__databus_file_query__(endpoint, databusURI)
        __download_list__(res, localDir)
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ readme = "README.md"
python = "^3.9"
typer = "^0.6.1"
requests = "^2.28.1"
tqdm = "^2.2.3"
SPARQLWrapper = "^2.0.0"


[tool.poetry.dev-dependencies]
black = "^22.6.0"
Expand Down
20 changes: 20 additions & 0 deletions tests/test_download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
"""Download Tests"""
import pytest
import databusclient.client as cl

# SPARQL endpoint passed as the `endpoint` argument of cl.download in the tests below.
DEFAULT_ENDPOINT="https://databus.dbpedia.org/sparql"
# Query that selects at most 10 dcat:downloadURL values into the single variable ?x.
TEST_QUERY="""
PREFIX dcat: <http://www.w3.org/ns/dcat#>
SELECT ?x WHERE {
?sub dcat:downloadURL ?x .
} LIMIT 10
"""
# A collection URI (its path contains "/collections/").
TEST_COLLECTION="https://databus.dbpedia.org/dbpedia/collections/dbpedia-snapshot-2022-12"

def test_with_query():
    """Smoke test: download the files selected by an inline SPARQL query."""
    cl.download(
        localDir="target",
        endpoint=DEFAULT_ENDPOINT,
        databusURIs=[TEST_QUERY],
    )

def test_with_collection():
    """Smoke test: download the files referenced by a databus collection URI."""
    cl.download(
        localDir="target",
        endpoint=DEFAULT_ENDPOINT,
        databusURIs=[TEST_COLLECTION],
    )