Skip to content

Commit

Permalink
Add pueblo.io.to_io utility function
Browse files Browse the repository at this point in the history
  • Loading branch information
amotl committed Dec 2, 2023
1 parent 01e1247 commit 8980f05
Show file tree
Hide file tree
Showing 6 changed files with 173 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
- ngr: Fix Gradle test runner by only conditionally invoking `gradle wrapper`
- ngr: Add capability to invoke projects using the `poethepoet` task runner
- Dependencies: Update to nbdime 4 and pytest-notebook 0.10
- Add `pueblo.io.to_io` utility function

## 2023-11-06 v0.0.3
- ngr: Fix `contextlib.chdir` only available on Python 3.11 and newer
Expand Down
1 change: 1 addition & 0 deletions pueblo/io/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .universal import to_io
74 changes: 74 additions & 0 deletions pueblo/io/universal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import contextlib
import io
import typing as t
from pathlib import Path

from pathlibfs import Path as PathPlus
from yarl import URL


@contextlib.contextmanager
def to_io(source: t.Union[str, Path, t.IO], mode: t.Literal["r", "rb", "rt"] = "rt") -> t.Generator[t.IO, None, None]:
    """
    Converge filesystem path, remote URL, or file-like object into an IO handle.

    :param source: A path or URL (``str``, ``pathlib.Path``, or ``pathlibfs.Path``),
        or an already open file-like object (any ``io.IOBase`` instance).
    :param mode: Mode used when opening a path or URL. It is ignored when
        ``source`` is already an open handle.
    :return: Context manager yielding the IO handle.
    :raises TypeError: When ``source`` is neither a path, a URL, nor a file-like object.

    The handle is closed when the context exits, including when the body of
    the ``with`` statement raises. This also applies to a handle passed in by
    the caller.
    """
    fp: t.IO
    if isinstance(source, io.IOBase):
        # Accept any file-like object (text, binary, in-memory), not only `TextIOWrapper`.
        fp = t.cast(t.IO, source)
    elif isinstance(source, (str, Path, PathPlus)):
        fp = open_url(str(source)).open(mode=mode)
    else:
        raise TypeError(f"Unable to converge to IO handle. type={type(source)}, value={source}")
    try:
        yield fp
    finally:
        # Without `finally`, an exception raised by the consumer would skip the close.
        fp.close()


def open_url(url: str) -> PathPlus:
    """
    Access URL, with specific handling for GitHub URLs.

    When approached using a GitHub HTTP URL, converge it to a pathlibfs / fsspec URL,
    and open it. Any other URL is handed to pathlibfs unchanged.

    Input URLs
    ----------
    github+https://foobar:ghp_lalala@github.com/acme/sweet-camino/path/to/document.md
    github+https://foobar:ghp_lalala@github.com/acme/sweet-camino/blob/main/path/to/document.md

    Output Path
    -----------
    fs = Path("github://path/to/document.md", username="foobar", token="ghp_lalala", org="acme", repo="sweet-camino")

    :param url: The URL or filesystem path to access.
    :return: A pathlibfs ``Path`` addressing the resource.
    :raises ValueError: When a ``github+https`` URL lacks the organization or repository segment.
    """
    uri = URL(url)

    if not uri.scheme.startswith("github+https"):
        return PathPlus(url)

    # Drop the empty fragment in front of the leading slash.
    path_fragments = uri.path.split("/")[1:]
    if len(path_fragments) < 2 or not all(path_fragments[:2]):
        # Report only the path, to avoid echoing credentials embedded in the URL.
        raise ValueError(f"GitHub URL must include organization and repository: path={uri.path}")

    org, repo, *real_path_fragments = path_fragments
    path_kwargs = {
        "username": uri.user,
        "token": uri.password,
        "org": org,
        "repo": repo,
    }

    # Web UI URLs carry `blob/<ref>/` or `raw/<ref>/` in front of the file path.
    # NOTE(review): the `<ref>` segment is discarded here, so the URL's branch/tag
    # is not forwarded downstream, and refs containing slashes are not handled.
    if real_path_fragments and real_path_fragments[0] in ("blob", "raw"):
        real_path_fragments = real_path_fragments[2:]

    downstream_url = "github://" + "/".join(real_path_fragments)
    return PathPlus(downstream_url, **path_kwargs)


def path_without_scheme(url_like: str) -> PathPlus:
    """
    Build a pathlibfs Path from a URL-like value, with its scheme removed.

    Absolute URLs get their scheme replaced by an empty one; relative
    values are passed through unchanged.
    """
    parsed = URL(str(url_like))
    stripped = parsed.with_scheme("") if parsed.is_absolute() else parsed
    return PathPlus(str(stripped))
7 changes: 6 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ dependencies = [

[project.optional-dependencies]
all = [
"pueblo[cli,nlp,notebook,testing,web]",
"pueblo[cli,fileio,nlp,notebook,testing,web]",
]
cli = [
"click<9",
Expand All @@ -85,6 +85,11 @@ develop = [
"ruff==0.1.6",
"validate-pyproject<0.16",
]
fileio = [
"fsspec[adlfs,dask,gcs,git,github,http,s3,smb]<2023.11",
"pathlibfs<0.6",
"yarl<1.10",
]
nlp = [
"aiohttp<3.10",
"langchain",
Expand Down
32 changes: 32 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from pathlib import Path

import pytest


@pytest.fixture
def readme_file() -> Path:
    """Path to `README.md`, located two directory levels above this file."""
    project_root = Path(__file__).parent.parent
    return project_root / "README.md"


def get_readme_url(infix: str = "", scheme: str = "https:") -> str:
    """
    Compose the URL of the project README on github.com.

    `scheme` includes the trailing colon, and `infix` is a path prefix
    (with trailing slash) placed in front of `README.md`.
    """
    template = "{}//github.com/pyveci/pueblo/{}README.md"
    return template.format(scheme, infix)


@pytest.fixture
def readme_url_https_raw() -> str:
    """README URL using the default `https:` scheme and the `raw/main/` infix."""
    url = get_readme_url(infix="raw/main/")
    return url


@pytest.fixture
def readme_url_github_https_bare() -> str:
    """README URL using the `github+https:` scheme and no infix."""
    url = get_readme_url(scheme="github+https:")
    return url


@pytest.fixture
def readme_url_github_https_raw() -> str:
    """README URL using the `github+https:` scheme and the `raw/main/` infix."""
    url = get_readme_url(infix="raw/main/", scheme="github+https:")
    return url


@pytest.fixture
def readme_url_github_https_blob() -> str:
    """README URL using the `github+https:` scheme and the `blob/main/` infix."""
    url = get_readme_url(infix="blob/main/", scheme="github+https:")
    return url
59 changes: 59 additions & 0 deletions tests/test_io.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import pytest
from pathlibfs import Path as PathPlus

from pueblo.io import to_io
from pueblo.io.universal import path_without_scheme

README_NEEDLE = "A Python toolbox library"


def test_to_io_failure():
    """Passing an unsupported source type (`None`) raises a `TypeError`."""
    with pytest.raises(TypeError) as ex, to_io(None):
        pass
    assert ex.match("Unable to converge to IO handle. type=<class 'NoneType'>, value=None")


def test_to_io_file(readme_file):
    """A local filesystem path is opened and its content can be read."""
    with to_io(readme_file) as handle:
        text = handle.read()
        assert README_NEEDLE in text


def test_to_io_memory(readme_file):
    """An already open file handle is accepted as source."""
    opened = open(readme_file, mode="r")
    with to_io(opened) as handle:
        text = handle.read()
        assert README_NEEDLE in text


def test_to_io_url(readme_url_https_raw):
    """An `https:` URL with the `raw/main/` infix is opened and read."""
    with to_io(readme_url_https_raw) as handle:
        text = handle.read()
        assert README_NEEDLE in text


def test_to_io_github_url_bare(readme_url_github_https_bare):
    """A `github+https:` URL without `blob`/`raw` infix is opened and read."""
    with to_io(readme_url_github_https_bare) as handle:
        text = handle.read()
        assert README_NEEDLE in text


def test_to_io_github_url_raw(readme_url_github_https_raw):
    """A `github+https:` URL with the `raw/main/` infix is opened and read."""
    with to_io(readme_url_github_https_raw) as handle:
        text = handle.read()
        assert README_NEEDLE in text


def test_to_io_github_url_blob(readme_url_github_https_blob):
    """A `github+https:` URL with the `blob/main/` infix is opened and read."""
    with to_io(readme_url_github_https_blob) as handle:
        text = handle.read()
        assert README_NEEDLE in text


def test_path_without_scheme_absolute():
    """An absolute URL is compared against the expected `file:` path."""
    actual = path_without_scheme("foo://localhost/bar/baz")
    expected = PathPlus("file:////localhost/bar/baz")
    assert actual == expected


def test_path_without_scheme_relative():
    """A scheme-less path is compared against the expected `file:` path."""
    actual = path_without_scheme("/bar/baz")
    expected = PathPlus("file:///bar/baz")
    assert actual == expected

0 comments on commit 8980f05

Please sign in to comment.