-
Notifications
You must be signed in to change notification settings - Fork 6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Port to v3 #44
base: master
Are you sure you want to change the base?
Port to v3 #44
Conversation
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Perfect!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
now needs update
…o port-to-v3 # Conflicts: # ocrd_kraken/binarize.py
(after MacOS fails with `torch ... not supported on this platform` 🙄 )
# Processing configurations each dependent test is parametrized over; the
# fixture below tests every entry for the substrings 'metscache' and
# 'pageparallel'.
CONFIGS = ['', 'pageparallel', 'metscache', 'pageparallel+metscache']


@pytest.fixture(params=CONFIGS)
def workspace(tmpdir, pytestconfig, request):
    """Return a generator factory that sets up a workspace in ``tmpdir``.

    The returned ``_make_workspace(workspace_path)`` is a generator intended
    to be consumed with ``yield from`` by a dependent fixture. It yields a
    dict of keyword arguments: always ``workspace`` and, when the current
    parameter contains ``'pageparallel'``, also ``mets_server_url``.

    Teardown (terminating the METS server process, resetting ``config``) is
    done in ``finally`` clauses so that it also happens when setup fails
    midway or the generator is closed early.
    """
    def _make_workspace(workspace_path):
        initLogging()
        if pytestconfig.getoption('verbose') > 0:
            setOverrideLogLevel('DEBUG')
        try:
            with pushd_popd(tmpdir):
                directory = str(tmpdir)
                resolver = Resolver()
                workspace = resolver.workspace_from_url(workspace_path, dst_dir=directory, download=True)
                config.OCRD_MISSING_OUTPUT = "ABORT"
                if 'metscache' in request.param:
                    config.OCRD_METS_CACHING = True
                    print("enabled METS caching")
                if 'pageparallel' in request.param:
                    config.OCRD_MAX_PARALLEL_PAGES = 4
                    print("enabled page-parallel processing")

                    def _start_mets_server(*args, **kwargs):
                        print("running with METS server")
                        server = OcrdMetsServer(*args, **kwargs)
                        server.startup()

                    process = Process(target=_start_mets_server,
                                      kwargs={'workspace': workspace, 'url': 'mets.sock'})
                    process.start()
                    try:
                        # NOTE(review): presumably gives the server time to
                        # come up before the client connects - confirm
                        sleep(1)
                        workspace = Workspace(resolver, directory, mets_server_url='mets.sock')
                        yield {'workspace': workspace, 'mets_server_url': 'mets.sock'}
                    finally:
                        # never leave the server process behind, even on errors
                        process.terminate()
                else:
                    yield {'workspace': workspace}
        finally:
            # undo the config overrides above regardless of the outcome
            config.reset_defaults()
    return _make_workspace
|
||
|
||
@pytest.fixture
def workspace_manifesto(workspace):
    """Delegate to the ``workspace`` factory for the communist_manifesto METS."""
    mets_path = assets.path_to('communist_manifesto/data/mets.xml')
    yield from workspace(mets_path)
|
||
@pytest.fixture
def workspace_aufklaerung(workspace):
    """Delegate to the ``workspace`` factory for the kant_aufklaerung_1784 METS."""
    mets_path = assets.path_to('kant_aufklaerung_1784/data/mets.xml')
    yield from workspace(mets_path)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
BTW, this could be a template for all processor tests. Testing both with and without a METS Server is important IMO.
We can easily add more configuration scenarios there.
def test_recognize(workspace_aufklaerung):
    """Binarize, then recognize, and check that glyph-level text was written.

    ``workspace_aufklaerung`` is the keyword dict produced by the fixture of
    the same name; it is splatted into both ``run_processor`` calls.
    """
    # some models (like default en) require binarized images
    run_processor(
        KrakenBinarize,
        input_file_grp="OCR-D-GT-PAGE",
        output_file_grp="OCR-D-GT-PAGE-BIN",
        **workspace_aufklaerung,
    )
    # re-use layout, overwrite text:
    run_processor(
        KrakenRecognize,
        input_file_grp="OCR-D-GT-PAGE-BIN",
        output_file_grp="OCR-D-OCR-KRAKEN",
        parameter={'overwrite_text': True},
        **workspace_aufklaerung,
    )

    ws = workspace_aufklaerung['workspace']
    ws.save_mets()

    output_dir = os.path.join(ws.directory, 'OCR-D-OCR-KRAKEN')
    assert os.path.isdir(output_dir)

    page_files = ws.find_files(file_grp='OCR-D-OCR-KRAKEN', mimetype=MIMETYPE_PAGE)
    first_file = next(page_files, False)
    assert first_file, "found no output PAGE file"

    pcgts = page_from_file(first_file)
    glyph_texts = pcgts.etree.xpath(
        '//page:Glyph/page:TextEquiv/page:Unicode',
        namespaces=NAMESPACES,
    )
    assert len(glyph_texts) > 0, "found no glyph text in output PAGE file"
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
And here the consumer part.
@@ -68,7 +68,7 @@ docker: | |||
|
|||
# Run test | |||
test: tests/assets | |||
$(PYTHON) -m pytest tests $(PYTEST_ARGS) | |||
$(PYTHON) -m pytest tests --durations=0 $(PYTEST_ARGS) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
And with this we get to see what difference in performance these settings make:
93.35s call tests/test_recognize.py::test_recognize[pageparallel+metscache]
92.28s call tests/test_recognize.py::test_recognize[pageparallel]
76.19s call tests/test_recognize.py::test_recognize[]
74.83s call tests/test_recognize.py::test_recognize[metscache]
55.92s call tests/test_segment.py::test_run_blla[metscache]
55.11s call tests/test_segment.py::test_run_blla[]
48.43s call tests/test_segment.py::test_run_blla[pageparallel+metscache]
41.80s call tests/test_segment.py::test_run_blla[pageparallel]
(In this case, it was only 2 pages – the scaling factor is not so great.)
No description provided.