diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..684a68a --- /dev/null +++ b/.coveragerc @@ -0,0 +1,6 @@ +[run] +omit = + app.py + *setup.py + docs/* + *test* \ No newline at end of file diff --git a/.gitignore b/.gitignore index 6a0812a..bdcd632 100644 --- a/.gitignore +++ b/.gitignore @@ -104,3 +104,7 @@ venv.bak/ .mypy_cache/ .DS_Store +.pytest_cache/ +.vscode/ +.history/ +.backfill/ \ No newline at end of file diff --git a/.pylintrc b/.pylintrc index 542888e..ae58db2 100644 --- a/.pylintrc +++ b/.pylintrc @@ -50,7 +50,7 @@ confidence= # --enable=similarities". If you want to run only the classes checker, but have # no Warning level messages displayed, use"--disable=all --enable=classes # --disable=W" -disable=blacklisted-name,invalid-name,import-error,print-statement,parameter-unpacking,unpacking-in-except,old-raise-syntax,backtick,long-suffix,old-ne-operator,old-octal-literal,import-star-module-level,parse-error,raw-checker-failed,bad-inline-option,locally-disabled,locally-enabled,file-ignored,suppressed-message,useless-suppression,deprecated-pragma,too-many-return-statements,too-many-arguments,too-many-locals,arguments-differ,signature-differs,unused-import,redefined-builtin,broad-except,apply-builtin,basestring-builtin,buffer-builtin,cmp-builtin,coerce-builtin,execfile-builtin,file-builtin,long-builtin,raw_input-builtin,reduce-builtin,standarderror-builtin,unicode-builtin,xrange-builtin,coerce-method,delslice-method,getslice-method,setslice-method,no-absolute-import,old-division,dict-iter-method,dict-view-method,next-method-called,metaclass-assignment,indexing-exception,raising-string,reload-builtin,oct-method,hex-method,nonzero-method,cmp-method,input-builtin,round-builtin,intern-builtin,unichr-builtin,map-builtin-not-iterating,zip-builtin-not-iterating,range-builtin-not-iterating,filter-builtin-not-iterating,using-cmp-argument,eq-without-hash,div-method,idiv-method,rdiv-method,exception-message-attribute,invalid-str-codec,sys-max-i
nt,bad-python3-import,deprecated-string-function,deprecated-str-translate-call,too-few-public-methods +disable=blacklisted-name,invalid-name,import-error,print-statement,parameter-unpacking,unpacking-in-except,old-raise-syntax,backtick,long-suffix,old-ne-operator,old-octal-literal,import-star-module-level,parse-error,raw-checker-failed,bad-inline-option,locally-disabled,locally-enabled,file-ignored,suppressed-message,useless-suppression,deprecated-pragma,too-many-return-statements,too-many-arguments,too-many-locals,arguments-differ,signature-differs,unused-import,redefined-builtin,broad-except,apply-builtin,basestring-builtin,buffer-builtin,cmp-builtin,coerce-builtin,execfile-builtin,file-builtin,long-builtin,raw_input-builtin,reduce-builtin,standarderror-builtin,unicode-builtin,xrange-builtin,coerce-method,delslice-method,getslice-method,setslice-method,no-absolute-import,old-division,dict-iter-method,dict-view-method,next-method-called,metaclass-assignment,indexing-exception,raising-string,reload-builtin,oct-method,hex-method,nonzero-method,cmp-method,input-builtin,round-builtin,intern-builtin,unichr-builtin,map-builtin-not-iterating,zip-builtin-not-iterating,range-builtin-not-iterating,filter-builtin-not-iterating,using-cmp-argument,eq-without-hash,div-method,idiv-method,rdiv-method,exception-message-attribute,invalid-str-codec,sys-max-int,bad-python3-import,deprecated-string-function,deprecated-str-translate-call,too-few-public-methods,pointless-string-statement # Enable the message, report, category or checker with the given id(s). 
You can # either give multiple identifier separated by comma (,) or put this option diff --git a/.travis.yml b/.travis.yml index da619f7..b4428d1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,6 +5,8 @@ services: - docker os: - linux +env: + - BOTO_CONFIG=/dev/null python: - "3.6" script: @@ -12,7 +14,7 @@ script: - pipenv install --dev - pipenv run pytest --cov=arxiv --cov=announcement/announcement --cov=repository/repository --cov-report=term-missing arxiv announcement/announcement repository/repository after_success: - - coveralls + - pipenv run -m coveralls - "./tests/lint.sh arxiv" - "./tests/lint.sh announcement/announcement" - "./tests/lint.sh repository/repository" diff --git a/Pipfile b/Pipfile index 96da84c..01d089f 100644 --- a/Pipfile +++ b/Pipfile @@ -13,6 +13,9 @@ pydocstyle = "*" mypy = "*" pytest-cov = "*" arxiv-canonical = {path = "."} +moto = "*" +sphinx = "*" +sphinx-autodoc-typehints = "*" [packages] backports-datetime-fromisoformat = "*" @@ -20,9 +23,12 @@ jsonschema = "*" python-dateutil = "*" pytz = "*" typing-extensions = "*" -arxiv-base = "==0.15.8.post1" +arxiv-base = "==0.16.2" arxiv-auth = "*" arxiv-canonical = {path = "."} +mypy = "==0.720" +moto = "==1.3.13" +retry = "*" [requires] python_version = "3.6" diff --git a/Pipfile.lock b/Pipfile.lock index 1b519b3..81ddf4b 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "4a0aa7c36e0337d70769d80ee9a048372b69eacfa0f59185f837d78a62efebc1" + "sha256": "ffc75cc107f4dacb99bd44d05935c69141aa0b238e308eafb94b1729430b8169" }, "pipfile-spec": 6, "requires": { @@ -18,27 +18,40 @@ "default": { "arxiv-auth": { "hashes": [ - "sha256:c65b73aadae3c2a7267838fadf0cc0c07a83a8f53b87b57f5d401625586e212b" + "sha256:cc0140b5135e7c364174141a8d8ca459afcb2c2de4ed278c2a1a9634cd5eef16" ], "index": "pypi", - "version": "==0.3.1" + "version": "==0.4.1" }, "arxiv-base": { "hashes": [ - "sha256:91e582be5f2f6a29e36af9ed3666f2892287c27dd9b0acfa89bef9cdd3d9bf5b" + 
"sha256:dea1755acb38d07ad07059937c7e9ad29c19a89499a097d86e82f2d5ddc15c1f" ], "index": "pypi", - "version": "==0.15.8.post1" + "version": "==0.16.2" }, "arxiv-canonical": { "path": "." }, "attrs": { "hashes": [ - "sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79", - "sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399" + "sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c", + "sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72" + ], + "version": "==19.3.0" + }, + "aws-sam-translator": { + "hashes": [ + "sha256:11c62c00f37b57c39a55d7a29d93f4704a88549c29a6448ebc953147173fbe85" + ], + "version": "==1.15.1" + }, + "aws-xray-sdk": { + "hashes": [ + "sha256:75cbce8c777b7d8055719ee1a0db6043e53c44e8f1a62a956bd84db87c4a4c7c", + "sha256:ce4adb60fe67ebe91f2fc57d5067b4e44df6e233652987be4fb2e549688cf9fe" ], - "version": "==19.1.0" + "version": "==2.4.2" }, "backports-datetime-fromisoformat": { "hashes": [ @@ -54,19 +67,84 @@ ], "version": "==3.1.0" }, + "boto": { + "hashes": [ + "sha256:147758d41ae7240dc989f0039f27da8ca0d53734be0eb869ef16e3adcfa462e8", + "sha256:ea0d3b40a2d852767be77ca343b58a9e3a4b00d9db440efb8da74b4e58025e5a" + ], + "version": "==2.49.0" + }, "boto3": { "hashes": [ - "sha256:29cc84e5a12f6476f909710373ebc294f37217baf33b50b6acd7f67aee3bb384", - "sha256:43ac443a5f11153a372ff54e8ed9e3a5a06059f047df533a86324a4ce2c1beff" + "sha256:7fc97cb2c9cdff905e950750c8e8b23b872a84696158a28852355dc4b712ba3a", + "sha256:818c56a317c176142dbf1dca3f5b4366c80460c6cc3c4efe22f0bde736571283" ], - "version": "==1.9.178" + "version": "==1.10.2" }, "botocore": { "hashes": [ - "sha256:6b5a42cd6267467bf6a7d6ca9452385dd984b7b402b4a1e2a9c42e2a3d311edd", - "sha256:a9515ee8511b73d7caeaa33e27c50ea1b85d722007aa870d8d7a424cc3f903da" + "sha256:8223485841ef4731a5d4943a733295ba69d0005c4ae64c468308cc07f6960d39", + "sha256:f8e12dc6e536ea512f0ad25b74e7eecdf5d9e09ae92b5de236b535bee7804d5b" + ], + "version": "==1.13.2" + }, 
+ "certifi": { + "hashes": [ + "sha256:e4f3620cfea4f83eedc95b24abd9cd56f3c4b146dd0177e83a21b4eb49e21e50", + "sha256:fd7c7c74727ddcf00e9acd26bba8da604ffec95bf1c2144e67aff7a8b50e6cef" + ], + "version": "==2019.9.11" + }, + "cffi": { + "hashes": [ + "sha256:00d890313797d9fe4420506613384b43099ad7d2b905c0752dbcc3a6f14d80fa", + "sha256:0cf9e550ac6c5e57b713437e2f4ac2d7fd0cd10336525a27224f5fc1ec2ee59a", + "sha256:0ea23c9c0cdd6778146a50d867d6405693ac3b80a68829966c98dd5e1bbae400", + "sha256:193697c2918ecdb3865acf6557cddf5076bb39f1f654975e087b67efdff83365", + "sha256:1ae14b542bf3b35e5229439c35653d2ef7d8316c1fffb980f9b7647e544baa98", + "sha256:1e389e069450609c6ffa37f21f40cce36f9be7643bbe5051ab1de99d5a779526", + "sha256:263242b6ace7f9cd4ea401428d2d45066b49a700852334fd55311bde36dcda14", + "sha256:33142ae9807665fa6511cfa9857132b2c3ee6ddffb012b3f0933fc11e1e830d5", + "sha256:364f8404034ae1b232335d8c7f7b57deac566f148f7222cef78cf8ae28ef764e", + "sha256:47368f69fe6529f8f49a5d146ddee713fc9057e31d61e8b6dc86a6a5e38cecc1", + "sha256:4895640844f17bec32943995dc8c96989226974dfeb9dd121cc45d36e0d0c434", + "sha256:558b3afef987cf4b17abd849e7bedf64ee12b28175d564d05b628a0f9355599b", + "sha256:5ba86e1d80d458b338bda676fd9f9d68cb4e7a03819632969cf6d46b01a26730", + "sha256:63424daa6955e6b4c70dc2755897f5be1d719eabe71b2625948b222775ed5c43", + "sha256:6381a7d8b1ebd0bc27c3bc85bc1bfadbb6e6f756b4d4db0aa1425c3719ba26b4", + "sha256:6381ab708158c4e1639da1f2a7679a9bbe3e5a776fc6d1fd808076f0e3145331", + "sha256:6fd58366747debfa5e6163ada468a90788411f10c92597d3b0a912d07e580c36", + "sha256:728ec653964655d65408949b07f9b2219df78badd601d6c49e28d604efe40599", + "sha256:7cfcfda59ef1f95b9f729c56fe8a4041899f96b72685d36ef16a3440a0f85da8", + "sha256:819f8d5197c2684524637f940445c06e003c4a541f9983fd30d6deaa2a5487d8", + "sha256:825ecffd9574557590e3225560a8a9d751f6ffe4a49e3c40918c9969b93395fa", + "sha256:8a2bcae2258d00fcfc96a9bde4a6177bc4274fe033f79311c5dd3d3148c26518", + 
"sha256:9009e917d8f5ef780c2626e29b6bc126f4cb2a4d43ca67aa2b40f2a5d6385e78", + "sha256:9c77564a51d4d914ed5af096cd9843d90c45b784b511723bd46a8a9d09cf16fc", + "sha256:a19089fa74ed19c4fe96502a291cfdb89223a9705b1d73b3005df4256976142e", + "sha256:a40ed527bffa2b7ebe07acc5a3f782da072e262ca994b4f2085100b5a444bbb2", + "sha256:b8f09f21544b9899defb09afbdaeb200e6a87a2b8e604892940044cf94444644", + "sha256:bb75ba21d5716abc41af16eac1145ab2e471deedde1f22c6f99bd9f995504df0", + "sha256:e22a00c0c81ffcecaf07c2bfb3672fa372c50e2bd1024ffee0da191c1b27fc71", + "sha256:e55b5a746fb77f10c83e8af081979351722f6ea48facea79d470b3731c7b2891", + "sha256:ec2fa3ee81707a5232bf2dfbd6623fdb278e070d596effc7e2d788f2ada71a05", + "sha256:fd82eb4694be712fcae03c717ca2e0fc720657ac226b80bbb597e971fc6928c2" + ], + "version": "==1.13.1" + }, + "cfn-lint": { + "hashes": [ + "sha256:6d32de719943f2cf1f2e8edf6ab1893764bb66dcbb78718540711bd9a0711bff", + "sha256:9ff6aaee29faae063006be2740a34c141f75512c0f237cd842eb3c76890cd66c" + ], + "version": "==0.24.6" + }, + "chardet": { + "hashes": [ + "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", + "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" ], - "version": "==1.12.178" + "version": "==3.0.4" }, "click": { "hashes": [ @@ -75,20 +153,101 @@ ], "version": "==7.0" }, + "cryptography": { + "hashes": [ + "sha256:02079a6addc7b5140ba0825f542c0869ff4df9a69c360e339ecead5baefa843c", + "sha256:1df22371fbf2004c6f64e927668734070a8953362cd8370ddd336774d6743595", + "sha256:369d2346db5934345787451504853ad9d342d7f721ae82d098083e1f49a582ad", + "sha256:3cda1f0ed8747339bbdf71b9f38ca74c7b592f24f65cdb3ab3765e4b02871651", + "sha256:44ff04138935882fef7c686878e1c8fd80a723161ad6a98da31e14b7553170c2", + "sha256:4b1030728872c59687badcca1e225a9103440e467c17d6d1730ab3d2d64bfeff", + "sha256:58363dbd966afb4f89b3b11dfb8ff200058fbc3b947507675c19ceb46104b48d", + "sha256:6ec280fb24d27e3d97aa731e16207d58bd8ae94ef6eab97249a2afe4ba643d42", + 
"sha256:7270a6c29199adc1297776937a05b59720e8a782531f1f122f2eb8467f9aab4d", + "sha256:73fd30c57fa2d0a1d7a49c561c40c2f79c7d6c374cc7750e9ac7c99176f6428e", + "sha256:7f09806ed4fbea8f51585231ba742b58cbcfbfe823ea197d8c89a5e433c7e912", + "sha256:90df0cc93e1f8d2fba8365fb59a858f51a11a394d64dbf3ef844f783844cc793", + "sha256:971221ed40f058f5662a604bd1ae6e4521d84e6cad0b7b170564cc34169c8f13", + "sha256:a518c153a2b5ed6b8cc03f7ae79d5ffad7315ad4569b2d5333a13c38d64bd8d7", + "sha256:b0de590a8b0979649ebeef8bb9f54394d3a41f66c5584fff4220901739b6b2f0", + "sha256:b43f53f29816ba1db8525f006fa6f49292e9b029554b3eb56a189a70f2a40879", + "sha256:d31402aad60ed889c7e57934a03477b572a03af7794fa8fb1780f21ea8f6551f", + "sha256:de96157ec73458a7f14e3d26f17f8128c959084931e8997b9e655a39c8fde9f9", + "sha256:df6b4dca2e11865e6cfbfb708e800efb18370f5a46fd601d3755bc7f85b3a8a2", + "sha256:ecadccc7ba52193963c0475ac9f6fa28ac01e01349a2ca48509667ef41ffd2cf", + "sha256:fb81c17e0ebe3358486cd8cc3ad78adbae58af12fc2bf2bc0bb84e8090fa5ce8" + ], + "version": "==2.8" + }, + "datetime": { + "hashes": [ + "sha256:371dba07417b929a4fa685c2f7a3eaa6a62d60c02947831f97d4df9a9e70dfd0", + "sha256:5cef605bab8259ff61281762cdf3290e459fbf0b4719951d5fab967d5f2ea0ea" + ], + "version": "==4.3" + }, + "decorator": { + "hashes": [ + "sha256:54c38050039232e1db4ad7375cfce6748d7b41c29e95a081c8a6d2c30364a2ce", + "sha256:5d19b92a3c8f7f101c8dd86afd86b0f061a8ce4540ab8cd401fa2542756bce6d" + ], + "version": "==4.4.1" + }, + "docker": { + "hashes": [ + "sha256:6e06c5e70ba4fad73e35f00c55a895a448398f3ada7faae072e2bb01348bafc1", + "sha256:8f93775b8bdae3a2df6bc9a5312cce564cade58d6555f2c2570165a1270cd8a7" + ], + "version": "==4.1.0" + }, "docutils": { "hashes": [ - "sha256:02aec4bd92ab067f6ff27a38a38a41173bf01bed8f89157768c1573f53e474a6", - "sha256:51e64ef2ebfb29cae1faa133b3710143496eca21c530f3f71424d77687764274", - "sha256:7a4bd47eaf6596e1295ecb11361139febe29b084a87bf005bf899f9a42edc3c6" + 
"sha256:6c4f696463b79f1fb8ba0c594b63840ebd41f059e92b31957c46b74a4599b6d0", + "sha256:9e4d7ecfc600058e07ba661411a2b7de2fd0fafa17d1a7f7361cd47b1175c827", + "sha256:a2aeea129088da402665e92e0b25b04b073c04b2dce4ab65caaa38b7ce2e1a99" + ], + "version": "==0.15.2" + }, + "ecdsa": { + "hashes": [ + "sha256:163c80b064a763ea733870feb96f9dd9b92216cfcacd374837af18e4e8ec3d4d", + "sha256:9814e700890991abeceeb2242586024d4758c8fc18445b194a49bd62d85861db" ], - "version": "==0.14" + "version": "==0.13.3" }, "flask": { "hashes": [ - "sha256:ad7c6d841e64296b962296c2c2dabc6543752985727af86a975072dea984b6f3", - "sha256:e7d32475d1de5facaa55e3958bc4ec66d3762076b074296aa50ef8fdc5b9df61" + "sha256:13f9f196f330c7c2c5d7a5cf91af894110ca0215ac051b5844701f2bfd934d52", + "sha256:45eb5a6fd193d6cf7e0cf5d8a5b31f83d5faae0293695626f539a823e93b13f6" + ], + "version": "==1.1.1" + }, + "flask-sqlalchemy": { + "hashes": [ + "sha256:0078d8663330dc05a74bc72b3b6ddc441b9a744e2f56fe60af1a5bfc81334327", + "sha256:6974785d913666587949f7c2946f7001e4fa2cb2d19f4e69ead02e4b8f50b33d" + ], + "version": "==2.4.1" + }, + "future": { + "hashes": [ + "sha256:858e38522e8fd0d3ce8f0c1feaf0603358e366d5403209674c7b617fa0c24093" + ], + "version": "==0.18.1" + }, + "idna": { + "hashes": [ + "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", + "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c" + ], + "version": "==2.8" + }, + "importlib-metadata": { + "hashes": [ + "sha256:aa18d7378b00b40847790e7c27e11673d7fed219354109d0e7b9e5b25dc3ad26", + "sha256:d5f18a79777f3aa179c145737780282e27b508fc8fd688cb17c7a813e8bd39af" ], - "version": "==1.0.3" + "version": "==0.23" }, "itsdangerous": { "hashes": [ @@ -99,10 +258,10 @@ }, "jinja2": { "hashes": [ - "sha256:065c4f02ebe7f7cf559e49ee5a95fb800a9e4528727aec6f24402a5374c65013", - "sha256:14dd6caf1527abb21f08f86c784eac40853ba93edb79552aa1e4b8aef1b61c7b" + "sha256:74320bb91f31270f9551d46522e33af46a80c3d619f4a4bf42b3164d30b5911f", + 
"sha256:9fe95f19286cfefaa917656583d020be14e7859c6b0252588391e47db34527de" ], - "version": "==2.10.1" + "version": "==2.10.3" }, "jmespath": { "hashes": [ @@ -111,13 +270,40 @@ ], "version": "==0.9.4" }, + "jsondiff": { + "hashes": [ + "sha256:7e18138aecaa4a8f3b7ac7525b8466234e6378dd6cae702b982c9ed851d2ae21" + ], + "version": "==1.1.2" + }, + "jsonpatch": { + "hashes": [ + "sha256:83f29a2978c13da29bfdf89da9d65542d62576479caf215df19632d7dc04c6e6", + "sha256:cbb72f8bf35260628aea6b508a107245f757d1ec839a19c34349985e2c05645a" + ], + "version": "==1.24" + }, + "jsonpickle": { + "hashes": [ + "sha256:d0c5a4e6cb4e58f6d5406bdded44365c2bcf9c836c4f52910cc9ba7245a59dc2", + "sha256:d3e922d781b1d0096df2dad89a2e1f47177d7969b596aea806a9d91b4626b29b" + ], + "version": "==1.2" + }, + "jsonpointer": { + "hashes": [ + "sha256:c192ba86648e05fdae4f08a17ec25180a9aef5008d973407b581798a83975362", + "sha256:ff379fa021d1b81ab539f5ec467c7745beb1a5671463f9dcc2b2d458bd361c1e" + ], + "version": "==2.0" + }, "jsonschema": { "hashes": [ - "sha256:0c0a81564f181de3212efa2d17de1910f8732fa1b71c42266d983cd74304e20d", - "sha256:a5f6559964a3851f59040d3b961de5e68e70971afb88ba519d27e6a039efff1a" + "sha256:2fa0684276b6333ff3c0b1b27081f4b2305f0a36cf702a23db50edb141893c3f", + "sha256:94c0a13b4a0616458b42529091624e66700a17f847453e52279e35509a5b7631" ], "index": "pypi", - "version": "==3.0.1" + "version": "==3.1.1" }, "markupsafe": { "hashes": [ @@ -152,20 +338,91 @@ ], "version": "==1.1.1" }, + "mimesis": { + "hashes": [ + "sha256:4b8fc414bd101109615fa8b6ad49f1811199e2745a4e9ef527193a4ab69637fc", + "sha256:faf4eed0abea190e77257f1c8d9fc8278784bb0de889e365e3af462aa53f0416" + ], + "version": "==3.3.0" + }, + "mock": { + "hashes": [ + "sha256:83657d894c90d5681d62155c82bda9c1187827525880eda8ff5df4ec813437c3", + "sha256:d157e52d4e5b938c550f39eb2fd15610db062441a9c2747d3dbfa9298211d0f8" + ], + "version": "==3.0.5" + }, + "more-itertools": { + "hashes": [ + 
"sha256:409cd48d4db7052af495b09dec721011634af3753ae1ef92d2b32f73a745f832", + "sha256:92b8c4b06dac4f0611c0729b2f2ede52b2e1bac1ab48f089c7ddc12e26bb60c4" + ], + "version": "==7.2.0" + }, + "moto": { + "hashes": [ + "sha256:95d48d8ebaad47fb5bb4233854cf1cf8523ec5307d50eb1e4017ce10f1960b66" + ], + "index": "pypi", + "version": "==1.3.13" + }, + "mypy": { + "hashes": [ + "sha256:0107bff4f46a289f0e4081d59b77cef1c48ea43da5a0dbf0005d54748b26df2a", + "sha256:07957f5471b3bb768c61f08690c96d8a09be0912185a27a68700f3ede99184e4", + "sha256:10af62f87b6921eac50271e667cc234162a194e742d8e02fc4ddc121e129a5b0", + "sha256:11fd60d2f69f0cefbe53ce551acf5b1cec1a89e7ce2d47b4e95a84eefb2899ae", + "sha256:15e43d3b1546813669bd1a6ec7e6a11d2888db938e0607f7b5eef6b976671339", + "sha256:352c24ba054a89bb9a35dd064ee95ab9b12903b56c72a8d3863d882e2632dc76", + "sha256:437020a39417e85e22ea8edcb709612903a9924209e10b3ec6d8c9f05b79f498", + "sha256:49925f9da7cee47eebf3420d7c0e00ec662ec6abb2780eb0a16260a7ba25f9c4", + "sha256:6724fcd5777aa6cebfa7e644c526888c9d639bd22edd26b2a8038c674a7c34bd", + "sha256:7a17613f7ea374ab64f39f03257f22b5755335b73251d0d253687a69029701ba", + "sha256:cdc1151ced496ca1496272da7fc356580e95f2682be1d32377c22ddebdf73c91" + ], + "index": "pypi", + "version": "==0.720" + }, + "mypy-extensions": { + "hashes": [ + "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d", + "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8" + ], + "version": "==0.4.3" + }, "mysqlclient": { "hashes": [ - "sha256:425e733b05e359a714d6007c0fc44582be66b63e5a3df0a50949274ae16f4bc6", - "sha256:62e4770b6a797b9416bcf70488365b7d6b9c9066878108499c559293bb464380", - "sha256:f257d250f2675d0ef99bd318906f3cfc05cef4a2f385ea695ff32a3f04b9f9a7" + "sha256:79a498ddda955e488f80c82a6392bf6e07c323d48db236033f33825665d8ba5c", + "sha256:8c3b61d89f7daaeab6aad6bf4c4bc3ef30bec1a8169f94dc59aea87ba2fabf80", + "sha256:9c737cc55a5dc8dd3583a942d5a9b21be58d16f00f5fefca4e575e7d9682e98c" ], - "version": 
"==1.4.2.post1" + "version": "==1.4.4" + }, + "py": { + "hashes": [ + "sha256:64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa", + "sha256:dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53" + ], + "version": "==1.8.0" + }, + "pyasn1": { + "hashes": [ + "sha256:62cdade8b5530f0b185e09855dd422bc05c0bbff6b72ff61381c09dac7befd8c", + "sha256:a9495356ca1d66ed197a0f72b41eb1823cf7ea8b5bd07191673e8147aecf8604" + ], + "version": "==0.4.7" }, "pycountry": { "hashes": [ - "sha256:104a8ca94c700898c42a0172da2eab5a5675c49637b729a11db9e1dac2d983cd", - "sha256:8ec4020b2b15cd410893d573820d42ee12fe50365332e58c0975c953b60a16de" + "sha256:3c57aa40adcf293d59bebaffbe60d8c39976fba78d846a018dc0c2ec9c6cb3cb" ], - "version": "==18.12.8" + "version": "==19.8.18" + }, + "pycparser": { + "hashes": [ + "sha256:a988718abfad80b6b157acce7bf130a30876d27603738ac39f140993246b25b3" + ], + "version": "==2.19" }, "pyjwt": { "hashes": [ @@ -176,9 +433,9 @@ }, "pyrsistent": { "hashes": [ - "sha256:16692ee739d42cf5e39cef8d27649a8c1fdb7aa99887098f1460057c5eb75c3a" + "sha256:eb6545dbeb1aa69ab1fb4809bfbf5a8705e44d92ef8fc7c2361682a47c46c778" ], - "version": "==0.15.2" + "version": "==0.15.5" }, "python-dateutil": { "hashes": [ @@ -188,13 +445,38 @@ "index": "pypi", "version": "==2.8.0" }, + "python-jose": { + "hashes": [ + "sha256:29701d998fe560e52f17246c3213a882a4a39da7e42c7015bcc1f7823ceaff1c", + "sha256:ed7387f0f9af2ea0ddc441d83a6eb47a5909bd0c8a72ac3250e75afec2cc1371" + ], + "version": "==3.0.1" + }, "pytz": { "hashes": [ - "sha256:303879e36b721603cc54604edcac9d20401bdbe31e1e4fdee5b9f98d5d31dfda", - "sha256:d747dd3d23d77ef44c6a3526e274af6efeb0a6f1afd5a69ba4d5be4098c8e141" + "sha256:1c557d7d0e871de1f5ccd5833f60fb2550652da6be2693c1e02300743d21500d", + "sha256:b02c06db6cf09c12dd25137e563b31700d3b80fcc4ad23abb7a315f2789819be" ], "index": "pypi", - "version": "==2019.1" + "version": "==2019.3" + }, + "pyyaml": { + "hashes": [ + 
"sha256:0113bc0ec2ad727182326b61326afa3d1d8280ae1122493553fd6f4397f33df9", + "sha256:01adf0b6c6f61bd11af6e10ca52b7d4057dd0be0343eb9283c878cf3af56aee4", + "sha256:5124373960b0b3f4aa7df1707e63e9f109b5263eca5976c66e08b1c552d4eaf8", + "sha256:5ca4f10adbddae56d824b2c09668e91219bb178a1eee1faa56af6f99f11bf696", + "sha256:7907be34ffa3c5a32b60b95f4d95ea25361c951383a894fec31be7252b2b6f34", + "sha256:7ec9b2a4ed5cad025c2278a1e6a19c011c80a3caaac804fd2d329e9cc2c287c9", + "sha256:87ae4c829bb25b9fe99cf71fbb2140c448f534e24c998cc60f39ae4f94396a73", + "sha256:9de9919becc9cc2ff03637872a440195ac4241c80536632fffeb6a1e25a74299", + "sha256:a5a85b10e450c66b49f98846937e8cfca1db3127a9d5d1e31ca45c3d0bef4c5b", + "sha256:b0997827b4f6a7c286c01c5f60384d218dca4ed7d9efa945c3e1aa623d5709ae", + "sha256:b631ef96d3222e62861443cc89d6563ba3eeb816eeb96b2629345ab795e53681", + "sha256:bf47c0607522fdbca6c9e817a6e81b08491de50f3766a7a0e6a5be7905961b41", + "sha256:f81025eddd0327c7d4cfe9b62cf33190e1e736cc6e97502b3ec425f574b3e7a8" + ], + "version": "==5.1.2" }, "redis": { "hashes": [ @@ -209,6 +491,35 @@ ], "version": "==1.3.6" }, + "requests": { + "hashes": [ + "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4", + "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31" + ], + "version": "==2.22.0" + }, + "responses": { + "hashes": [ + "sha256:502d9c0c8008439cfcdef7e251f507fcfdd503b56e8c0c87c3c3e3393953f790", + "sha256:97193c0183d63fba8cd3a041c75464e4b09ea0aff6328800d1546598567dde0b" + ], + "version": "==0.10.6" + }, + "retry": { + "hashes": [ + "sha256:ccddf89761fa2c726ab29391837d4327f819ea14d244c232a1d24c67a2f98606", + "sha256:f8bfa8b99b69c4506d6f5bd3b0aabf77f98cdb17f3c9fc3f5ca820033336fba4" + ], + "index": "pypi", + "version": "==0.9.2" + }, + "rsa": { + "hashes": [ + "sha256:14ba45700ff1ec9eeb206a2ce76b32814958a98e372006c8fb76ba820211be66", + "sha256:1a836406405730121ae9823e19c6e806c62bbad73f890574fff50efa4122c487" + ], + "version": "==4.0" + }, "s3transfer": { 
"hashes": [ "sha256:6efc926738a3cd576c2a79725fed9afde92378aa5c6a957e3af010cb019fac9d", @@ -225,26 +536,58 @@ }, "sqlalchemy": { "hashes": [ - "sha256:c30925d60af95443458ebd7525daf791f55762b106049ae71e18f8dd58084c2f" + "sha256:0f0768b5db594517e1f5e1572c73d14cf295140756431270d89496dc13d5e46c" + ], + "version": "==1.3.10" + }, + "sshpubkeys": { + "hashes": [ + "sha256:9f73d51c2ef1e68cd7bde0825df29b3c6ec89f4ce24ebca3bf9eaa4a23a284db", + "sha256:b388399caeeccdc145f06fd0d2665eeecc545385c60b55c282a15a022215af80" ], - "version": "==1.3.5" + "version": "==3.1.0" + }, + "typed-ast": { + "hashes": [ + "sha256:1170afa46a3799e18b4c977777ce137bb53c7485379d9706af8a59f2ea1aa161", + "sha256:18511a0b3e7922276346bcb47e2ef9f38fb90fd31cb9223eed42c85d1312344e", + "sha256:262c247a82d005e43b5b7f69aff746370538e176131c32dda9cb0f324d27141e", + "sha256:2b907eb046d049bcd9892e3076c7a6456c93a25bebfe554e931620c90e6a25b0", + "sha256:354c16e5babd09f5cb0ee000d54cfa38401d8b8891eefa878ac772f827181a3c", + "sha256:48e5b1e71f25cfdef98b013263a88d7145879fbb2d5185f2a0c79fa7ebbeae47", + "sha256:4e0b70c6fc4d010f8107726af5fd37921b666f5b31d9331f0bd24ad9a088e631", + "sha256:630968c5cdee51a11c05a30453f8cd65e0cc1d2ad0d9192819df9978984529f4", + "sha256:66480f95b8167c9c5c5c87f32cf437d585937970f3fc24386f313a4c97b44e34", + "sha256:71211d26ffd12d63a83e079ff258ac9d56a1376a25bc80b1cdcdf601b855b90b", + "sha256:7954560051331d003b4e2b3eb822d9dd2e376fa4f6d98fee32f452f52dd6ebb2", + "sha256:838997f4310012cf2e1ad3803bce2f3402e9ffb71ded61b5ee22617b3a7f6b6e", + "sha256:95bd11af7eafc16e829af2d3df510cecfd4387f6453355188342c3e79a2ec87a", + "sha256:bc6c7d3fa1325a0c6613512a093bc2a2a15aeec350451cbdf9e1d4bffe3e3233", + "sha256:cc34a6f5b426748a507dd5d1de4c1978f2eb5626d51326e43280941206c209e1", + "sha256:d755f03c1e4a51e9b24d899561fec4ccaf51f210d52abdf8c07ee2849b212a36", + "sha256:d7c45933b1bdfaf9f36c579671fec15d25b06c8398f113dab64c18ed1adda01d", + "sha256:d896919306dd0aa22d0132f62a1b78d11aaf4c9fc5b3410d3c666b818191630a", + 
"sha256:fdc1c9bbf79510b76408840e009ed65958feba92a88833cdceecff93ae8fff66", + "sha256:ffde2fbfad571af120fcbfbbc61c72469e72f550d676c3342492a9dfdefb8f12" + ], + "version": "==1.4.0" }, "typing-extensions": { "hashes": [ - "sha256:2ed632b30bb54fc3941c382decfd0ee4148f5c591651c9272473fea2c6397d95", - "sha256:b1edbbf0652660e32ae780ac9433f4231e7339c7f9a8057d0f042fcbcea49b87", - "sha256:d8179012ec2c620d3791ca6fe2bf7979d979acdbef1fca0bc56b37411db682ed" + "sha256:091ecc894d5e908ac75209f10d5b4f118fbdb2eb1ede6a63544054bb1edb41f2", + "sha256:910f4656f54de5993ad9304959ce9bb903f90aadc7c67a0bef07e678014e892d", + "sha256:cf8b63fedea4d89bab840ecbb93e75578af28f76f66c35889bd7065f5af88575" ], "index": "pypi", - "version": "==3.7.4" + "version": "==3.7.4.1" }, "urllib3": { "hashes": [ - "sha256:b246607a25ac80bedac05c6f282e3cdaf3afb65420fd024ac94435cabe6e18d1", - "sha256:dbe59173209418ae49d485b87d1681aefa36252ee85884c31346debd19463232" + "sha256:3de946ffbed6e6746608990594d08faac602528ac7015ac28d33cee6a45b7398", + "sha256:9a107b99a5393caf59c7aa3c1249c16e6879447533d0887f4336dde834c7be86" ], "markers": "python_version >= '3.4'", - "version": "==1.25.3" + "version": "==1.25.6" }, "uwsgi": { "hashes": [ @@ -259,24 +602,92 @@ ], "version": "==0.5.1" }, + "websocket-client": { + "hashes": [ + "sha256:1151d5fb3a62dc129164292e1227655e4bbc5dd5340a5165dfae61128ec50aa9", + "sha256:1fd5520878b68b84b5748bb30e592b10d0a91529d5383f74f4964e72b297fd3a" + ], + "version": "==0.56.0" + }, "werkzeug": { "hashes": [ - "sha256:865856ebb55c4dcd0630cdd8f3331a1847a819dda7e8c750d3db6f2aa6c0209c", - "sha256:a0b915f0815982fb2a09161cb8f31708052d0951c3ba433ccc5e1aa276507ca6" + "sha256:7280924747b5733b246fe23972186c6b348f9ae29724135a6dfc1e53cea433e7", + "sha256:e5f4a1f98b52b18a93da705a7458e55afb26f32bff83ff5d19189f92462d65c4" ], - "version": "==0.15.4" + "version": "==0.16.0" + }, + "wrapt": { + "hashes": [ + "sha256:565a021fd19419476b9362b05eeaa094178de64f8361e44468f9e9d7843901e1" + ], + "version": "==1.11.2" + }, + 
"xmltodict": { + "hashes": [ + "sha256:50d8c638ed7ecb88d90561beedbf720c9b4e851a9fa6c47ebd64e99d166d8a21", + "sha256:8bbcb45cc982f48b2ca8fe7e7827c5d792f217ecf1792626f808bf41c3b86051" + ], + "version": "==0.12.0" + }, + "zipp": { + "hashes": [ + "sha256:3718b1cbcd963c7d4c5511a8240812904164b7f381b647143a89d3b98f9bcd8e", + "sha256:f06903e9f1f43b12d371004b4ac7b06ab39a44adc747266928ae6debfa7b3335" + ], + "version": "==0.6.0" + }, + "zope.interface": { + "hashes": [ + "sha256:086707e0f413ff8800d9c4bc26e174f7ee4c9c8b0302fbad68d083071822316c", + "sha256:1157b1ec2a1f5bf45668421e3955c60c610e31913cc695b407a574efdbae1f7b", + "sha256:11ebddf765bff3bbe8dbce10c86884d87f90ed66ee410a7e6c392086e2c63d02", + "sha256:14b242d53f6f35c2d07aa2c0e13ccb710392bcd203e1b82a1828d216f6f6b11f", + "sha256:1b3d0dcabc7c90b470e59e38a9acaa361be43b3a6ea644c0063951964717f0e5", + "sha256:20a12ab46a7e72b89ce0671e7d7a6c3c1ca2c2766ac98112f78c5bddaa6e4375", + "sha256:298f82c0ab1b182bd1f34f347ea97dde0fffb9ecf850ecf7f8904b8442a07487", + "sha256:2f6175722da6f23dbfc76c26c241b67b020e1e83ec7fe93c9e5d3dd18667ada2", + "sha256:3b877de633a0f6d81b600624ff9137312d8b1d0f517064dfc39999352ab659f0", + "sha256:4265681e77f5ac5bac0905812b828c9fe1ce80c6f3e3f8574acfb5643aeabc5b", + "sha256:550695c4e7313555549aa1cdb978dc9413d61307531f123558e438871a883d63", + "sha256:5f4d42baed3a14c290a078e2696c5f565501abde1b2f3f1a1c0a94fbf6fbcc39", + "sha256:62dd71dbed8cc6a18379700701d959307823b3b2451bdc018594c48956ace745", + "sha256:7040547e5b882349c0a2cc9b50674b1745db551f330746af434aad4f09fba2cc", + "sha256:7e099fde2cce8b29434684f82977db4e24f0efa8b0508179fce1602d103296a2", + "sha256:7e5c9a5012b2b33e87980cee7d1c82412b2ebabcb5862d53413ba1a2cfde23aa", + "sha256:81295629128f929e73be4ccfdd943a0906e5fe3cdb0d43ff1e5144d16fbb52b1", + "sha256:95cc574b0b83b85be9917d37cd2fad0ce5a0d21b024e1a5804d044aabea636fc", + "sha256:968d5c5702da15c5bf8e4a6e4b67a4d92164e334e9c0b6acf080106678230b98", + 
"sha256:9e998ba87df77a85c7bed53240a7257afe51a07ee6bc3445a0bf841886da0b97", + "sha256:a0c39e2535a7e9c195af956610dba5a1073071d2d85e9d2e5d789463f63e52ab", + "sha256:a15e75d284178afe529a536b0e8b28b7e107ef39626a7809b4ee64ff3abc9127", + "sha256:a6a6ff82f5f9b9702478035d8f6fb6903885653bff7ec3a1e011edc9b1a7168d", + "sha256:b639f72b95389620c1f881d94739c614d385406ab1d6926a9ffe1c8abbea23fe", + "sha256:bad44274b151d46619a7567010f7cde23a908c6faa84b97598fd2f474a0c6891", + "sha256:bbcef00d09a30948756c5968863316c949d9cedbc7aabac5e8f0ffbdb632e5f1", + "sha256:d788a3999014ddf416f2dc454efa4a5dbeda657c6aba031cf363741273804c6b", + "sha256:eed88ae03e1ef3a75a0e96a55a99d7937ed03e53d0cffc2451c208db445a2966", + "sha256:f99451f3a579e73b5dd58b1b08d1179791d49084371d9a47baad3b22417f0317" + ], + "version": "==4.6.0" } }, "develop": { + "alabaster": { + "hashes": [ + "sha256:446438bdcca0e05bd45ea2de1668c1d9b032e1a9154c2c259092d77031ddd359", + "sha256:a661d72d58e6ea8a57f7a86e37d86716863ee5e92788398526d58b26a4e4dc02" + ], + "version": "==0.7.12" + }, "arxiv-canonical": { "path": "." 
}, "astroid": { "hashes": [ - "sha256:6560e1e1749f68c64a4b5dee4e091fce798d2f0d84ebe638cf0e0585a343acf4", - "sha256:b65db1bbaac9f9f4d190199bb8680af6f6f84fd3769a5ea883df8a91fe68b4c4" + "sha256:09a3fba616519311f1af8a461f804b68f0370e100c9264a035aa7846d7852e33", + "sha256:5a79c9b4bd6c4be777424593f957c996e20beb5f74e0bc332f47713c6f675efe" ], - "version": "==2.2.5" + "version": "==2.3.2" }, "atomicwrites": { "hashes": [ @@ -287,17 +698,102 @@ }, "attrs": { "hashes": [ - "sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79", - "sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399" + "sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c", + "sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72" ], - "version": "==19.1.0" + "version": "==19.3.0" }, - "certifi": { + "aws-sam-translator": { + "hashes": [ + "sha256:11c62c00f37b57c39a55d7a29d93f4704a88549c29a6448ebc953147173fbe85" + ], + "version": "==1.15.1" + }, + "aws-xray-sdk": { + "hashes": [ + "sha256:75cbce8c777b7d8055719ee1a0db6043e53c44e8f1a62a956bd84db87c4a4c7c", + "sha256:ce4adb60fe67ebe91f2fc57d5067b4e44df6e233652987be4fb2e549688cf9fe" + ], + "version": "==2.4.2" + }, + "babel": { "hashes": [ - "sha256:046832c04d4e752f37383b628bc601a7ea7211496b4638f6514d0e5b9acc4939", - "sha256:945e3ba63a0b9f577b1395204e13c3a231f9bc0223888be653286534e5873695" + "sha256:af92e6106cb7c55286b25b38ad7695f8b4efb36a90ba483d7f7a6628c46158ab", + "sha256:e86135ae101e31e2c8ec20a4e0c5220f4eed12487d5cf3f78be7e98d3a57fc28" ], - "version": "==2019.6.16" + "version": "==2.7.0" + }, + "boto": { + "hashes": [ + "sha256:147758d41ae7240dc989f0039f27da8ca0d53734be0eb869ef16e3adcfa462e8", + "sha256:ea0d3b40a2d852767be77ca343b58a9e3a4b00d9db440efb8da74b4e58025e5a" + ], + "version": "==2.49.0" + }, + "boto3": { + "hashes": [ + "sha256:7fc97cb2c9cdff905e950750c8e8b23b872a84696158a28852355dc4b712ba3a", + "sha256:818c56a317c176142dbf1dca3f5b4366c80460c6cc3c4efe22f0bde736571283" + ], + 
"version": "==1.10.2" + }, + "botocore": { + "hashes": [ + "sha256:8223485841ef4731a5d4943a733295ba69d0005c4ae64c468308cc07f6960d39", + "sha256:f8e12dc6e536ea512f0ad25b74e7eecdf5d9e09ae92b5de236b535bee7804d5b" + ], + "version": "==1.13.2" + }, + "certifi": { + "hashes": [ + "sha256:e4f3620cfea4f83eedc95b24abd9cd56f3c4b146dd0177e83a21b4eb49e21e50", + "sha256:fd7c7c74727ddcf00e9acd26bba8da604ffec95bf1c2144e67aff7a8b50e6cef" + ], + "version": "==2019.9.11" + }, + "cffi": { + "hashes": [ + "sha256:00d890313797d9fe4420506613384b43099ad7d2b905c0752dbcc3a6f14d80fa", + "sha256:0cf9e550ac6c5e57b713437e2f4ac2d7fd0cd10336525a27224f5fc1ec2ee59a", + "sha256:0ea23c9c0cdd6778146a50d867d6405693ac3b80a68829966c98dd5e1bbae400", + "sha256:193697c2918ecdb3865acf6557cddf5076bb39f1f654975e087b67efdff83365", + "sha256:1ae14b542bf3b35e5229439c35653d2ef7d8316c1fffb980f9b7647e544baa98", + "sha256:1e389e069450609c6ffa37f21f40cce36f9be7643bbe5051ab1de99d5a779526", + "sha256:263242b6ace7f9cd4ea401428d2d45066b49a700852334fd55311bde36dcda14", + "sha256:33142ae9807665fa6511cfa9857132b2c3ee6ddffb012b3f0933fc11e1e830d5", + "sha256:364f8404034ae1b232335d8c7f7b57deac566f148f7222cef78cf8ae28ef764e", + "sha256:47368f69fe6529f8f49a5d146ddee713fc9057e31d61e8b6dc86a6a5e38cecc1", + "sha256:4895640844f17bec32943995dc8c96989226974dfeb9dd121cc45d36e0d0c434", + "sha256:558b3afef987cf4b17abd849e7bedf64ee12b28175d564d05b628a0f9355599b", + "sha256:5ba86e1d80d458b338bda676fd9f9d68cb4e7a03819632969cf6d46b01a26730", + "sha256:63424daa6955e6b4c70dc2755897f5be1d719eabe71b2625948b222775ed5c43", + "sha256:6381a7d8b1ebd0bc27c3bc85bc1bfadbb6e6f756b4d4db0aa1425c3719ba26b4", + "sha256:6381ab708158c4e1639da1f2a7679a9bbe3e5a776fc6d1fd808076f0e3145331", + "sha256:6fd58366747debfa5e6163ada468a90788411f10c92597d3b0a912d07e580c36", + "sha256:728ec653964655d65408949b07f9b2219df78badd601d6c49e28d604efe40599", + "sha256:7cfcfda59ef1f95b9f729c56fe8a4041899f96b72685d36ef16a3440a0f85da8", + 
"sha256:819f8d5197c2684524637f940445c06e003c4a541f9983fd30d6deaa2a5487d8", + "sha256:825ecffd9574557590e3225560a8a9d751f6ffe4a49e3c40918c9969b93395fa", + "sha256:8a2bcae2258d00fcfc96a9bde4a6177bc4274fe033f79311c5dd3d3148c26518", + "sha256:9009e917d8f5ef780c2626e29b6bc126f4cb2a4d43ca67aa2b40f2a5d6385e78", + "sha256:9c77564a51d4d914ed5af096cd9843d90c45b784b511723bd46a8a9d09cf16fc", + "sha256:a19089fa74ed19c4fe96502a291cfdb89223a9705b1d73b3005df4256976142e", + "sha256:a40ed527bffa2b7ebe07acc5a3f782da072e262ca994b4f2085100b5a444bbb2", + "sha256:b8f09f21544b9899defb09afbdaeb200e6a87a2b8e604892940044cf94444644", + "sha256:bb75ba21d5716abc41af16eac1145ab2e471deedde1f22c6f99bd9f995504df0", + "sha256:e22a00c0c81ffcecaf07c2bfb3672fa372c50e2bd1024ffee0da191c1b27fc71", + "sha256:e55b5a746fb77f10c83e8af081979351722f6ea48facea79d470b3731c7b2891", + "sha256:ec2fa3ee81707a5232bf2dfbd6623fdb278e070d596effc7e2d788f2ada71a05", + "sha256:fd82eb4694be712fcae03c717ca2e0fc720657ac226b80bbb597e971fc6928c2" + ], + "version": "==1.13.1" + }, + "cfn-lint": { + "hashes": [ + "sha256:6d32de719943f2cf1f2e8edf6ab1893764bb66dcbb78718540711bd9a0711bff", + "sha256:9ff6aaee29faae063006be2740a34c141f75512c0f237cd842eb3c76890cd66c" + ], + "version": "==0.24.6" }, "chardet": { "hashes": [ @@ -308,48 +804,89 @@ }, "coverage": { "hashes": [ - "sha256:3684fabf6b87a369017756b551cef29e505cb155ddb892a7a29277b978da88b9", - "sha256:39e088da9b284f1bd17c750ac672103779f7954ce6125fd4382134ac8d152d74", - "sha256:3c205bc11cc4fcc57b761c2da73b9b72a59f8d5ca89979afb0c1c6f9e53c7390", - "sha256:465ce53a8c0f3a7950dfb836438442f833cf6663d407f37d8c52fe7b6e56d7e8", - "sha256:48020e343fc40f72a442c8a1334284620f81295256a6b6ca6d8aa1350c763bbe", - "sha256:5296fc86ab612ec12394565c500b412a43b328b3907c0d14358950d06fd83baf", - "sha256:5f61bed2f7d9b6a9ab935150a6b23d7f84b8055524e7be7715b6513f3328138e", - "sha256:68a43a9f9f83693ce0414d17e019daee7ab3f7113a70c79a3dd4c2f704e4d741", - 
"sha256:6b8033d47fe22506856fe450470ccb1d8ba1ffb8463494a15cfc96392a288c09", - "sha256:7ad7536066b28863e5835e8cfeaa794b7fe352d99a8cded9f43d1161be8e9fbd", - "sha256:7bacb89ccf4bedb30b277e96e4cc68cd1369ca6841bde7b005191b54d3dd1034", - "sha256:839dc7c36501254e14331bcb98b27002aa415e4af7ea039d9009409b9d2d5420", - "sha256:8f9a95b66969cdea53ec992ecea5406c5bd99c9221f539bca1e8406b200ae98c", - "sha256:932c03d2d565f75961ba1d3cec41ddde00e162c5b46d03f7423edcb807734eab", - "sha256:988529edadc49039d205e0aa6ce049c5ccda4acb2d6c3c5c550c17e8c02c05ba", - "sha256:998d7e73548fe395eeb294495a04d38942edb66d1fa61eb70418871bc621227e", - "sha256:9de60893fb447d1e797f6bf08fdf0dbcda0c1e34c1b06c92bd3a363c0ea8c609", - "sha256:9e80d45d0c7fcee54e22771db7f1b0b126fb4a6c0a2e5afa72f66827207ff2f2", - "sha256:a545a3dfe5082dc8e8c3eb7f8a2cf4f2870902ff1860bd99b6198cfd1f9d1f49", - "sha256:a5d8f29e5ec661143621a8f4de51adfb300d7a476224156a39a392254f70687b", - "sha256:aca06bfba4759bbdb09bf52ebb15ae20268ee1f6747417837926fae990ebc41d", - "sha256:bb23b7a6fd666e551a3094ab896a57809e010059540ad20acbeec03a154224ce", - "sha256:bfd1d0ae7e292105f29d7deaa9d8f2916ed8553ab9d5f39ec65bcf5deadff3f9", - "sha256:c62ca0a38958f541a73cf86acdab020c2091631c137bd359c4f5bddde7b75fd4", - "sha256:c709d8bda72cf4cd348ccec2a4881f2c5848fd72903c185f363d361b2737f773", - "sha256:c968a6aa7e0b56ecbd28531ddf439c2ec103610d3e2bf3b75b813304f8cb7723", - "sha256:df785d8cb80539d0b55fd47183264b7002077859028dfe3070cf6359bf8b2d9c", - "sha256:f406628ca51e0ae90ae76ea8398677a921b36f0bd71aab2099dfed08abd0322f", - "sha256:f46087bbd95ebae244a0eda01a618aff11ec7a069b15a3ef8f6b520db523dcf1", - "sha256:f8019c5279eb32360ca03e9fac40a12667715546eed5c5eb59eb381f2f501260", - "sha256:fc5f4d209733750afd2714e9109816a29500718b32dd9a5db01c0cb3a019b96a" + "sha256:08907593569fe59baca0bf152c43f3863201efb6113ecb38ce7e97ce339805a6", + "sha256:0be0f1ed45fc0c185cfd4ecc19a1d6532d72f86a2bac9de7e24541febad72650", + "sha256:141f08ed3c4b1847015e2cd62ec06d35e67a3ac185c26f7635f4406b90afa9c5", 
+ "sha256:19e4df788a0581238e9390c85a7a09af39c7b539b29f25c89209e6c3e371270d", + "sha256:23cc09ed395b03424d1ae30dcc292615c1372bfba7141eb85e11e50efaa6b351", + "sha256:245388cda02af78276b479f299bbf3783ef0a6a6273037d7c60dc73b8d8d7755", + "sha256:331cb5115673a20fb131dadd22f5bcaf7677ef758741312bee4937d71a14b2ef", + "sha256:386e2e4090f0bc5df274e720105c342263423e77ee8826002dcffe0c9533dbca", + "sha256:3a794ce50daee01c74a494919d5ebdc23d58873747fa0e288318728533a3e1ca", + "sha256:60851187677b24c6085248f0a0b9b98d49cba7ecc7ec60ba6b9d2e5574ac1ee9", + "sha256:63a9a5fc43b58735f65ed63d2cf43508f462dc49857da70b8980ad78d41d52fc", + "sha256:6b62544bb68106e3f00b21c8930e83e584fdca005d4fffd29bb39fb3ffa03cb5", + "sha256:6ba744056423ef8d450cf627289166da65903885272055fb4b5e113137cfa14f", + "sha256:7494b0b0274c5072bddbfd5b4a6c6f18fbbe1ab1d22a41e99cd2d00c8f96ecfe", + "sha256:826f32b9547c8091679ff292a82aca9c7b9650f9fda3e2ca6bf2ac905b7ce888", + "sha256:93715dffbcd0678057f947f496484e906bf9509f5c1c38fc9ba3922893cda5f5", + "sha256:9a334d6c83dfeadae576b4d633a71620d40d1c379129d587faa42ee3e2a85cce", + "sha256:af7ed8a8aa6957aac47b4268631fa1df984643f07ef00acd374e456364b373f5", + "sha256:bf0a7aed7f5521c7ca67febd57db473af4762b9622254291fbcbb8cd0ba5e33e", + "sha256:bf1ef9eb901113a9805287e090452c05547578eaab1b62e4ad456fcc049a9b7e", + "sha256:c0afd27bc0e307a1ffc04ca5ec010a290e49e3afbe841c5cafc5c5a80ecd81c9", + "sha256:dd579709a87092c6dbee09d1b7cfa81831040705ffa12a1b248935274aee0437", + "sha256:df6712284b2e44a065097846488f66840445eb987eb81b3cc6e4149e7b6982e1", + "sha256:e07d9f1a23e9e93ab5c62902833bf3e4b1f65502927379148b6622686223125c", + "sha256:e2ede7c1d45e65e209d6093b762e98e8318ddeff95317d07a27a2140b80cfd24", + "sha256:e4ef9c164eb55123c62411f5936b5c2e521b12356037b6e1c2617cef45523d47", + "sha256:eca2b7343524e7ba246cab8ff00cab47a2d6d54ada3b02772e908a45675722e2", + "sha256:eee64c616adeff7db37cc37da4180a3a5b6177f5c46b187894e633f088fb5b28", + 
"sha256:ef824cad1f980d27f26166f86856efe11eff9912c4fed97d3804820d43fa550c", + "sha256:efc89291bd5a08855829a3c522df16d856455297cf35ae827a37edac45f466a7", + "sha256:fa964bae817babece5aa2e8c1af841bebb6d0b9add8e637548809d040443fee0", + "sha256:ff37757e068ae606659c28c3bd0d923f9d29a85de79bf25b2b34b148473b5025" ], "index": "pypi", - "version": "==4.5.3" + "version": "==4.5.4" }, "coveralls": { "hashes": [ - "sha256:d3d49234bffd41e91b241a69f0ebb9f64d7f0515711a76134d53d4647e7eb509", - "sha256:dafabcff87425fa2ab3122dee21229afbb4d6692cfdacc6bb895f7dfa8b2c849" + "sha256:9bc5a1f92682eef59f688a8f280207190d9a6afb84cef8f567fa47631a784060", + "sha256:fb51cddef4bc458de347274116df15d641a735d3f0a580a9472174e2e62f408c" ], "index": "pypi", - "version": "==1.8.1" + "version": "==1.8.2" + }, + "cryptography": { + "hashes": [ + "sha256:02079a6addc7b5140ba0825f542c0869ff4df9a69c360e339ecead5baefa843c", + "sha256:1df22371fbf2004c6f64e927668734070a8953362cd8370ddd336774d6743595", + "sha256:369d2346db5934345787451504853ad9d342d7f721ae82d098083e1f49a582ad", + "sha256:3cda1f0ed8747339bbdf71b9f38ca74c7b592f24f65cdb3ab3765e4b02871651", + "sha256:44ff04138935882fef7c686878e1c8fd80a723161ad6a98da31e14b7553170c2", + "sha256:4b1030728872c59687badcca1e225a9103440e467c17d6d1730ab3d2d64bfeff", + "sha256:58363dbd966afb4f89b3b11dfb8ff200058fbc3b947507675c19ceb46104b48d", + "sha256:6ec280fb24d27e3d97aa731e16207d58bd8ae94ef6eab97249a2afe4ba643d42", + "sha256:7270a6c29199adc1297776937a05b59720e8a782531f1f122f2eb8467f9aab4d", + "sha256:73fd30c57fa2d0a1d7a49c561c40c2f79c7d6c374cc7750e9ac7c99176f6428e", + "sha256:7f09806ed4fbea8f51585231ba742b58cbcfbfe823ea197d8c89a5e433c7e912", + "sha256:90df0cc93e1f8d2fba8365fb59a858f51a11a394d64dbf3ef844f783844cc793", + "sha256:971221ed40f058f5662a604bd1ae6e4521d84e6cad0b7b170564cc34169c8f13", + "sha256:a518c153a2b5ed6b8cc03f7ae79d5ffad7315ad4569b2d5333a13c38d64bd8d7", + "sha256:b0de590a8b0979649ebeef8bb9f54394d3a41f66c5584fff4220901739b6b2f0", + 
"sha256:b43f53f29816ba1db8525f006fa6f49292e9b029554b3eb56a189a70f2a40879", + "sha256:d31402aad60ed889c7e57934a03477b572a03af7794fa8fb1780f21ea8f6551f", + "sha256:de96157ec73458a7f14e3d26f17f8128c959084931e8997b9e655a39c8fde9f9", + "sha256:df6b4dca2e11865e6cfbfb708e800efb18370f5a46fd601d3755bc7f85b3a8a2", + "sha256:ecadccc7ba52193963c0475ac9f6fa28ac01e01349a2ca48509667ef41ffd2cf", + "sha256:fb81c17e0ebe3358486cd8cc3ad78adbae58af12fc2bf2bc0bb84e8090fa5ce8" + ], + "version": "==2.8" + }, + "datetime": { + "hashes": [ + "sha256:371dba07417b929a4fa685c2f7a3eaa6a62d60c02947831f97d4df9a9e70dfd0", + "sha256:5cef605bab8259ff61281762cdf3290e459fbf0b4719951d5fab967d5f2ea0ea" + ], + "version": "==4.3" + }, + "docker": { + "hashes": [ + "sha256:6e06c5e70ba4fad73e35f00c55a895a448398f3ada7faae072e2bb01348bafc1", + "sha256:8f93775b8bdae3a2df6bc9a5312cce564cade58d6555f2c2570165a1270cd8a7" + ], + "version": "==4.1.0" }, "docopt": { "hashes": [ @@ -357,6 +894,27 @@ ], "version": "==0.6.2" }, + "docutils": { + "hashes": [ + "sha256:6c4f696463b79f1fb8ba0c594b63840ebd41f059e92b31957c46b74a4599b6d0", + "sha256:9e4d7ecfc600058e07ba661411a2b7de2fd0fafa17d1a7f7361cd47b1175c827", + "sha256:a2aeea129088da402665e92e0b25b04b073c04b2dce4ab65caaa38b7ce2e1a99" + ], + "version": "==0.15.2" + }, + "ecdsa": { + "hashes": [ + "sha256:163c80b064a763ea733870feb96f9dd9b92216cfcacd374837af18e4e8ec3d4d", + "sha256:9814e700890991abeceeb2242586024d4758c8fc18445b194a49bd62d85861db" + ], + "version": "==0.13.3" + }, + "future": { + "hashes": [ + "sha256:858e38522e8fd0d3ce8f0c1feaf0603358e366d5403209674c7b617fa0c24093" + ], + "version": "==0.18.1" + }, "idna": { "hashes": [ "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", @@ -364,12 +922,19 @@ ], "version": "==2.8" }, + "imagesize": { + "hashes": [ + "sha256:3f349de3eb99145973fefb7dbe38554414e5c30abd0c8e4b970a7c9d09f3a1d8", + "sha256:f3832918bc3c66617f92e35f5d70729187676313caa60c187eb0f28b8fe5e3b5" + ], + "version": "==1.1.0" + }, 
"importlib-metadata": { "hashes": [ - "sha256:6dfd58dfe281e8d240937776065dd3624ad5469c835248219bd16cf2e12dbeb7", - "sha256:cb6ee23b46173539939964df59d3d72c3e0c1b5d54b84f1d8a7e912fe43612db" + "sha256:aa18d7378b00b40847790e7c27e11673d7fed219354109d0e7b9e5b25dc3ad26", + "sha256:d5f18a79777f3aa179c145737780282e27b508fc8fd688cb17c7a813e8bd39af" ], - "version": "==0.18" + "version": "==0.23" }, "isort": { "hashes": [ @@ -378,28 +943,113 @@ ], "version": "==4.3.21" }, + "jinja2": { + "hashes": [ + "sha256:74320bb91f31270f9551d46522e33af46a80c3d619f4a4bf42b3164d30b5911f", + "sha256:9fe95f19286cfefaa917656583d020be14e7859c6b0252588391e47db34527de" + ], + "version": "==2.10.3" + }, + "jmespath": { + "hashes": [ + "sha256:3720a4b1bd659dd2eecad0666459b9788813e032b83e7ba58578e48254e0a0e6", + "sha256:bde2aef6f44302dfb30320115b17d030798de8c4110e28d5cf6cf91a7a31074c" + ], + "version": "==0.9.4" + }, + "jsondiff": { + "hashes": [ + "sha256:7e18138aecaa4a8f3b7ac7525b8466234e6378dd6cae702b982c9ed851d2ae21" + ], + "version": "==1.1.2" + }, + "jsonpatch": { + "hashes": [ + "sha256:83f29a2978c13da29bfdf89da9d65542d62576479caf215df19632d7dc04c6e6", + "sha256:cbb72f8bf35260628aea6b508a107245f757d1ec839a19c34349985e2c05645a" + ], + "version": "==1.24" + }, + "jsonpickle": { + "hashes": [ + "sha256:d0c5a4e6cb4e58f6d5406bdded44365c2bcf9c836c4f52910cc9ba7245a59dc2", + "sha256:d3e922d781b1d0096df2dad89a2e1f47177d7969b596aea806a9d91b4626b29b" + ], + "version": "==1.2" + }, + "jsonpointer": { + "hashes": [ + "sha256:c192ba86648e05fdae4f08a17ec25180a9aef5008d973407b581798a83975362", + "sha256:ff379fa021d1b81ab539f5ec467c7745beb1a5671463f9dcc2b2d458bd361c1e" + ], + "version": "==2.0" + }, + "jsonschema": { + "hashes": [ + "sha256:2fa0684276b6333ff3c0b1b27081f4b2305f0a36cf702a23db50edb141893c3f", + "sha256:94c0a13b4a0616458b42529091624e66700a17f847453e52279e35509a5b7631" + ], + "index": "pypi", + "version": "==3.1.1" + }, "lazy-object-proxy": { "hashes": [ - 
"sha256:159a745e61422217881c4de71f9eafd9d703b93af95618635849fe469a283661", - "sha256:23f63c0821cc96a23332e45dfaa83266feff8adc72b9bcaef86c202af765244f", - "sha256:3b11be575475db2e8a6e11215f5aa95b9ec14de658628776e10d96fa0b4dac13", - "sha256:3f447aff8bc61ca8b42b73304f6a44fa0d915487de144652816f950a3f1ab821", - "sha256:4ba73f6089cd9b9478bc0a4fa807b47dbdb8fad1d8f31a0f0a5dbf26a4527a71", - "sha256:4f53eadd9932055eac465bd3ca1bd610e4d7141e1278012bd1f28646aebc1d0e", - "sha256:64483bd7154580158ea90de5b8e5e6fc29a16a9b4db24f10193f0c1ae3f9d1ea", - "sha256:6f72d42b0d04bfee2397aa1862262654b56922c20a9bb66bb76b6f0e5e4f9229", - "sha256:7c7f1ec07b227bdc561299fa2328e85000f90179a2f44ea30579d38e037cb3d4", - "sha256:7c8b1ba1e15c10b13cad4171cfa77f5bb5ec2580abc5a353907780805ebe158e", - "sha256:8559b94b823f85342e10d3d9ca4ba5478168e1ac5658a8a2f18c991ba9c52c20", - "sha256:a262c7dfb046f00e12a2bdd1bafaed2408114a89ac414b0af8755c696eb3fc16", - "sha256:acce4e3267610c4fdb6632b3886fe3f2f7dd641158a843cf6b6a68e4ce81477b", - "sha256:be089bb6b83fac7f29d357b2dc4cf2b8eb8d98fe9d9ff89f9ea6012970a853c7", - "sha256:bfab710d859c779f273cc48fb86af38d6e9210f38287df0069a63e40b45a2f5c", - "sha256:c10d29019927301d524a22ced72706380de7cfc50f767217485a912b4c8bd82a", - "sha256:dd6e2b598849b3d7aee2295ac765a578879830fb8966f70be8cd472e6069932e", - "sha256:e408f1eacc0a68fed0c08da45f31d0ebb38079f043328dce69ff133b95c29dc1" - ], - "version": "==1.4.1" + "sha256:0c4b206227a8097f05c4dbdd323c50edf81f15db3b8dc064d08c62d37e1a504d", + "sha256:194d092e6f246b906e8f70884e620e459fc54db3259e60cf69a4d66c3fda3449", + "sha256:1be7e4c9f96948003609aa6c974ae59830a6baecc5376c25c92d7d697e684c08", + "sha256:4677f594e474c91da97f489fea5b7daa17b5517190899cf213697e48d3902f5a", + "sha256:48dab84ebd4831077b150572aec802f303117c8cc5c871e182447281ebf3ac50", + "sha256:5541cada25cd173702dbd99f8e22434105456314462326f06dba3e180f203dfd", + "sha256:59f79fef100b09564bc2df42ea2d8d21a64fdcda64979c0fa3db7bdaabaf6239", + 
"sha256:8d859b89baf8ef7f8bc6b00aa20316483d67f0b1cbf422f5b4dc56701c8f2ffb", + "sha256:9254f4358b9b541e3441b007a0ea0764b9d056afdeafc1a5569eee1cc6c1b9ea", + "sha256:9651375199045a358eb6741df3e02a651e0330be090b3bc79f6d0de31a80ec3e", + "sha256:97bb5884f6f1cdce0099f86b907aa41c970c3c672ac8b9c8352789e103cf3156", + "sha256:9b15f3f4c0f35727d3a0fba4b770b3c4ebbb1fa907dbcc046a1d2799f3edd142", + "sha256:a2238e9d1bb71a56cd710611a1614d1194dc10a175c1e08d75e1a7bcc250d442", + "sha256:a6ae12d08c0bf9909ce12385803a543bfe99b95fe01e752536a60af2b7797c62", + "sha256:ca0a928a3ddbc5725be2dd1cf895ec0a254798915fb3a36af0964a0a4149e3db", + "sha256:cb2c7c57005a6804ab66f106ceb8482da55f5314b7fcb06551db1edae4ad1531", + "sha256:d74bb8693bf9cf75ac3b47a54d716bbb1a92648d5f781fc799347cfc95952383", + "sha256:d945239a5639b3ff35b70a88c5f2f491913eb94871780ebfabb2568bd58afc5a", + "sha256:eba7011090323c1dadf18b3b689845fd96a61ba0a1dfbd7f24b921398affc357", + "sha256:efa1909120ce98bbb3777e8b6f92237f5d5c8ea6758efea36a473e1d38f7d3e4", + "sha256:f3900e8a5de27447acbf900b4750b0ddfd7ec1ea7fbaf11dfa911141bc522af0" + ], + "version": "==1.4.3" + }, + "markupsafe": { + "hashes": [ + "sha256:00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473", + "sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161", + "sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235", + "sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5", + "sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff", + "sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b", + "sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1", + "sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e", + "sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183", + "sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66", + "sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1", + 
"sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1", + "sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e", + "sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b", + "sha256:7c1699dfe0cf8ff607dbdcc1e9b9af1755371f92a68f706051cc8c37d447c905", + "sha256:88e5fcfb52ee7b911e8bb6d6aa2fd21fbecc674eadd44118a9cc3863f938e735", + "sha256:8defac2f2ccd6805ebf65f5eeb132adcf2ab57aa11fdf4c0dd5169a004710e7d", + "sha256:98c7086708b163d425c67c7a91bad6e466bb99d797aa64f965e9d25c12111a5e", + "sha256:9add70b36c5666a2ed02b43b335fe19002ee5235efd4b8a89bfcf9005bebac0d", + "sha256:9bf40443012702a1d2070043cb6291650a0841ece432556f784f004937f0f32c", + "sha256:ade5e387d2ad0d7ebf59146cc00c8044acbd863725f887353a10df825fc8ae21", + "sha256:b00c1de48212e4cc9603895652c5c410df699856a2853135b3967591e4beebc2", + "sha256:b1282f8c00509d99fef04d8ba936b156d419be841854fe901d8ae224c59f0be5", + "sha256:b2051432115498d3562c084a49bba65d97cf251f5a331c64a12ee7e04dacc51b", + "sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6", + "sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f", + "sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f", + "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7" + ], + "version": "==1.1.1" }, "mccabe": { "hashes": [ @@ -408,37 +1058,50 @@ ], "version": "==0.6.1" }, + "mock": { + "hashes": [ + "sha256:83657d894c90d5681d62155c82bda9c1187827525880eda8ff5df4ec813437c3", + "sha256:d157e52d4e5b938c550f39eb2fd15610db062441a9c2747d3dbfa9298211d0f8" + ], + "version": "==3.0.5" + }, "more-itertools": { "hashes": [ - "sha256:3ad685ff8512bf6dc5a8b82ebf73543999b657eded8c11803d9ba6b648986f4d", - "sha256:8bb43d1f51ecef60d81854af61a3a880555a14643691cc4b64a6ee269c78f09a" + "sha256:409cd48d4db7052af495b09dec721011634af3753ae1ef92d2b32f73a745f832", + "sha256:92b8c4b06dac4f0611c0729b2f2ede52b2e1bac1ab48f089c7ddc12e26bb60c4" ], - "markers": "python_version > 
'2.7'", - "version": "==7.1.0" + "version": "==7.2.0" + }, + "moto": { + "hashes": [ + "sha256:95d48d8ebaad47fb5bb4233854cf1cf8523ec5307d50eb1e4017ce10f1960b66" + ], + "index": "pypi", + "version": "==1.3.13" }, "mypy": { "hashes": [ - "sha256:12d18bd7fc642c5d54b1bb62dde813a7e2ab79b32ee11ff206ac387c68fc2ad4", - "sha256:23e24bc1683a36f39dee67d8ac74ea414654642eee26d420bada95b8ee8c9095", - "sha256:2b38e64c52a8968df4ebcae0ddba4a54eb94d184695dd4e54e14509a9389b78c", - "sha256:3d4f551466a76e278187ec3a5b26cfb50f72f6760b749aa00ac69a6f9c99898d", - "sha256:53d5dacb8d844e50be698830509aa592b093547e7ab90aee63eb23db61109007", - "sha256:56f981d246010ba21cac6b2455eaecfaf68fc8a5663d865b26c8e579c36f751d", - "sha256:8c57f6f59f1e8479d9fc6e1bf034353e54626ed64e32394c613afc493a441dc1", - "sha256:bbed4a593d87476b592d52867ef86da2155ccd0becf0c4c02e6567d842e43368", - "sha256:d6ff850e2ba18b2db7704897c8f2f1384478e3b75ad292ec06196bf7794f3a40", - "sha256:e13b1bb8785d7f785e0b88873f1c21cda58ceba9ce1153b58cbfa24b09a111d5", - "sha256:e2b9ee6f648ce72d6741925a47c88c2391168ef973b6f74f17969450c5b1ffdd" + "sha256:0107bff4f46a289f0e4081d59b77cef1c48ea43da5a0dbf0005d54748b26df2a", + "sha256:07957f5471b3bb768c61f08690c96d8a09be0912185a27a68700f3ede99184e4", + "sha256:10af62f87b6921eac50271e667cc234162a194e742d8e02fc4ddc121e129a5b0", + "sha256:11fd60d2f69f0cefbe53ce551acf5b1cec1a89e7ce2d47b4e95a84eefb2899ae", + "sha256:15e43d3b1546813669bd1a6ec7e6a11d2888db938e0607f7b5eef6b976671339", + "sha256:352c24ba054a89bb9a35dd064ee95ab9b12903b56c72a8d3863d882e2632dc76", + "sha256:437020a39417e85e22ea8edcb709612903a9924209e10b3ec6d8c9f05b79f498", + "sha256:49925f9da7cee47eebf3420d7c0e00ec662ec6abb2780eb0a16260a7ba25f9c4", + "sha256:6724fcd5777aa6cebfa7e644c526888c9d639bd22edd26b2a8038c674a7c34bd", + "sha256:7a17613f7ea374ab64f39f03257f22b5755335b73251d0d253687a69029701ba", + "sha256:cdc1151ced496ca1496272da7fc356580e95f2682be1d32377c22ddebdf73c91" ], "index": "pypi", - "version": "==0.711" + "version": "==0.720" }, 
"mypy-extensions": { "hashes": [ - "sha256:37e0e956f41369209a3d5f34580150bcacfabaa57b33a15c0b25f4b5725e0812", - "sha256:b16cabe759f55e3409a7d231ebd2841378fb0c27a5d1994719e340e4f429ac3e" + "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d", + "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8" ], - "version": "==0.4.1" + "version": "==0.4.3" }, "nose2": { "hashes": [ @@ -450,17 +1113,17 @@ }, "packaging": { "hashes": [ - "sha256:0c98a5d0be38ed775798ece1b9727178c4469d9c3b4ada66e8e6b7849f8732af", - "sha256:9e1cbf8c12b1f1ce0bb5344b8d7ecf66a6f8a6e91bcb0c84593ed6d3ab5c4ab3" + "sha256:28b924174df7a2fa32c1953825ff29c61e2f5e082343165438812f00d3a7fc47", + "sha256:d9551545c6d761f3def1677baf08ab2a3ca17c56879e70fecba2fc4dde4ed108" ], - "version": "==19.0" + "version": "==19.2" }, "pluggy": { "hashes": [ - "sha256:0825a152ac059776623854c1543d65a4ad408eb3d33ee114dff91e57ec6ae6fc", - "sha256:b9817417e95936bf75d85d3f8767f7df6cdde751fc40aed3bb3074cbcb77757c" + "sha256:0db4b7601aae1d35b4a033282da476845aa19185c1e6964b25cf324b5e4ec3e6", + "sha256:fa5fa1622fa6dd5c030e9cad086fa19ef6a0cf6d7a2d12318e10cb49d6d68f34" ], - "version": "==0.12.0" + "version": "==0.13.0" }, "py": { "hashes": [ @@ -469,45 +1132,111 @@ ], "version": "==1.8.0" }, + "pyasn1": { + "hashes": [ + "sha256:62cdade8b5530f0b185e09855dd422bc05c0bbff6b72ff61381c09dac7befd8c", + "sha256:a9495356ca1d66ed197a0f72b41eb1823cf7ea8b5bd07191673e8147aecf8604" + ], + "version": "==0.4.7" + }, + "pycparser": { + "hashes": [ + "sha256:a988718abfad80b6b157acce7bf130a30876d27603738ac39f140993246b25b3" + ], + "version": "==2.19" + }, "pydocstyle": { "hashes": [ - "sha256:2258f9b0df68b97bf3a6c29003edc5238ff8879f1efb6f1999988d934e432bd8", - "sha256:5741c85e408f9e0ddf873611085e819b809fca90b619f5fd7f34bd4959da3dd4", - "sha256:ed79d4ec5e92655eccc21eb0c6cf512e69512b4a97d215ace46d17e4990f2039" + "sha256:04c84e034ebb56eb6396c820442b8c4499ac5eb94a3bda88951ac3dc519b6058", + 
"sha256:66aff87ffe34b1e49bff2dd03a88ce6843be2f3346b0c9814410d34987fbab59" ], "index": "pypi", - "version": "==3.0.0" + "version": "==4.0.1" + }, + "pygments": { + "hashes": [ + "sha256:71e430bc85c88a430f000ac1d9b331d2407f681d6f6aec95e8bcfbc3df5b0127", + "sha256:881c4c157e45f30af185c1ffe8d549d48ac9127433f2c380c24b84572ad66297" + ], + "version": "==2.4.2" }, "pylint": { "hashes": [ - "sha256:5d77031694a5fb97ea95e828c8d10fc770a1df6eb3906067aaed42201a8a6a09", - "sha256:723e3db49555abaf9bf79dc474c6b9e2935ad82230b10c1138a71ea41ac0fff1" + "sha256:7b76045426c650d2b0f02fc47c14d7934d17898779da95288a74c2a7ec440702", + "sha256:856476331f3e26598017290fd65bebe81c960e806776f324093a46b76fb2d1c0" ], "index": "pypi", - "version": "==2.3.1" + "version": "==2.4.3" }, "pyparsing": { "hashes": [ - "sha256:1873c03321fc118f4e9746baf201ff990ceb915f433f23b395f5580d1840cb2a", - "sha256:9b6323ef4ab914af344ba97510e966d64ba91055d6b9afa6b30799340e89cc03" + "sha256:6f98a7b9397e206d78cc01df10131398f1c8b8510a2f4d97d9abd82e1aacdd80", + "sha256:d9338df12903bbf5d65a0e4e87c2161968b10d2e489652bb47001d82a9b028b4" + ], + "version": "==2.4.2" + }, + "pyrsistent": { + "hashes": [ + "sha256:eb6545dbeb1aa69ab1fb4809bfbf5a8705e44d92ef8fc7c2361682a47c46c778" ], - "version": "==2.4.0" + "version": "==0.15.5" }, "pytest": { "hashes": [ - "sha256:4a784f1d4f2ef198fe9b7aef793e9fa1a3b2f84e822d9b3a64a181293a572d45", - "sha256:926855726d8ae8371803f7b2e6ec0a69953d9c6311fa7c3b6c1b929ff92d27da" + "sha256:27abc3fef618a01bebb1f0d6d303d2816a99aa87a5968ebc32fe971be91eb1e6", + "sha256:58cee9e09242937e136dbb3dab466116ba20d6b7828c7620f23947f37eb4dae4" ], "index": "pypi", - "version": "==4.6.3" + "version": "==5.2.2" }, "pytest-cov": { "hashes": [ - "sha256:2b097cde81a302e1047331b48cadacf23577e431b61e9c6f49a1170bbe3d3da6", - "sha256:e00ea4fdde970725482f1f35630d12f074e121a23801aabf2ae154ec6bdd343a" + "sha256:cc6742d8bac45070217169f5f72ceee1e0e55b0221f54bcf24845972d3a47f2b", + 
"sha256:cdbdef4f870408ebdbfeb44e63e07eb18bb4619fae852f6e760645fa36172626" + ], + "index": "pypi", + "version": "==2.8.1" + }, + "python-dateutil": { + "hashes": [ + "sha256:7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb", + "sha256:c89805f6f4d64db21ed966fda138f8a5ed7a4fdbc1a8ee329ce1b74e3c74da9e" ], "index": "pypi", - "version": "==2.7.1" + "version": "==2.8.0" + }, + "python-jose": { + "hashes": [ + "sha256:29701d998fe560e52f17246c3213a882a4a39da7e42c7015bcc1f7823ceaff1c", + "sha256:ed7387f0f9af2ea0ddc441d83a6eb47a5909bd0c8a72ac3250e75afec2cc1371" + ], + "version": "==3.0.1" + }, + "pytz": { + "hashes": [ + "sha256:1c557d7d0e871de1f5ccd5833f60fb2550652da6be2693c1e02300743d21500d", + "sha256:b02c06db6cf09c12dd25137e563b31700d3b80fcc4ad23abb7a315f2789819be" + ], + "index": "pypi", + "version": "==2019.3" + }, + "pyyaml": { + "hashes": [ + "sha256:0113bc0ec2ad727182326b61326afa3d1d8280ae1122493553fd6f4397f33df9", + "sha256:01adf0b6c6f61bd11af6e10ca52b7d4057dd0be0343eb9283c878cf3af56aee4", + "sha256:5124373960b0b3f4aa7df1707e63e9f109b5263eca5976c66e08b1c552d4eaf8", + "sha256:5ca4f10adbddae56d824b2c09668e91219bb178a1eee1faa56af6f99f11bf696", + "sha256:7907be34ffa3c5a32b60b95f4d95ea25361c951383a894fec31be7252b2b6f34", + "sha256:7ec9b2a4ed5cad025c2278a1e6a19c011c80a3caaac804fd2d329e9cc2c287c9", + "sha256:87ae4c829bb25b9fe99cf71fbb2140c448f534e24c998cc60f39ae4f94396a73", + "sha256:9de9919becc9cc2ff03637872a440195ac4241c80536632fffeb6a1e25a74299", + "sha256:a5a85b10e450c66b49f98846937e8cfca1db3127a9d5d1e31ca45c3d0bef4c5b", + "sha256:b0997827b4f6a7c286c01c5f60384d218dca4ed7d9efa945c3e1aa623d5709ae", + "sha256:b631ef96d3222e62861443cc89d6563ba3eeb816eeb96b2629345ab795e53681", + "sha256:bf47c0607522fdbca6c9e817a6e81b08491de50f3766a7a0e6a5be7905961b41", + "sha256:f81025eddd0327c7d4cfe9b62cf33190e1e736cc6e97502b3ec425f574b3e7a8" + ], + "version": "==5.1.2" }, "requests": { "hashes": [ @@ -516,6 +1245,27 @@ ], "version": "==2.22.0" }, + "responses": { + 
"hashes": [ + "sha256:502d9c0c8008439cfcdef7e251f507fcfdd503b56e8c0c87c3c3e3393953f790", + "sha256:97193c0183d63fba8cd3a041c75464e4b09ea0aff6328800d1546598567dde0b" + ], + "version": "==0.10.6" + }, + "rsa": { + "hashes": [ + "sha256:14ba45700ff1ec9eeb206a2ce76b32814958a98e372006c8fb76ba820211be66", + "sha256:1a836406405730121ae9823e19c6e806c62bbad73f890574fff50efa4122c487" + ], + "version": "==4.0" + }, + "s3transfer": { + "hashes": [ + "sha256:6efc926738a3cd576c2a79725fed9afde92378aa5c6a957e3af010cb019fac9d", + "sha256:b780f2411b824cb541dbcd2c713d0cb61c7d1bcadae204cdddda2b35cef493ba" + ], + "version": "==0.2.1" + }, "six": { "hashes": [ "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", @@ -525,38 +1275,117 @@ }, "snowballstemmer": { "hashes": [ - "sha256:9f3b9ffe0809d174f7047e121431acf99c89a7040f0ca84f94ba53a498e6d0c9" + "sha256:209f257d7533fdb3cb73bdbd24f436239ca3b2fa67d56f6ff88e86be08cc5ef0", + "sha256:df3bac3df4c2c01363f3dd2cfa78cce2840a79b9f1c2d2de9ce8d31683992f52" ], + "version": "==2.0.0" + }, + "sphinx": { + "hashes": [ + "sha256:31088dfb95359384b1005619827eaee3056243798c62724fd3fa4b84ee4d71bd", + "sha256:52286a0b9d7caa31efee301ec4300dbdab23c3b05da1c9024b4e84896fb73d79" + ], + "index": "pypi", + "version": "==2.2.1" + }, + "sphinx-autodoc-typehints": { + "hashes": [ + "sha256:0dfd26be5b81049ce81b644913a06a12795ded2791adc0a4e13420eb9a832852", + "sha256:81bfdf4fbb9e248f10a6bc3f931bf80e85e95a185530e1a862db9269efd72cf9" + ], + "index": "pypi", "version": "==1.9.0" }, + "sphinxcontrib-applehelp": { + "hashes": [ + "sha256:edaa0ab2b2bc74403149cb0209d6775c96de797dfd5b5e2a71981309efab3897", + "sha256:fb8dee85af95e5c30c91f10e7eb3c8967308518e0f7488a2828ef7bc191d0d5d" + ], + "version": "==1.0.1" + }, + "sphinxcontrib-devhelp": { + "hashes": [ + "sha256:6c64b077937330a9128a4da74586e8c2130262f014689b4b89e2d08ee7294a34", + "sha256:9512ecb00a2b0821a146736b39f7aeb90759834b07e81e8cc23a9c70bacb9981" + ], + "version": "==1.0.1" + }, + 
"sphinxcontrib-htmlhelp": { + "hashes": [ + "sha256:4670f99f8951bd78cd4ad2ab962f798f5618b17675c35c5ac3b2132a14ea8422", + "sha256:d4fd39a65a625c9df86d7fa8a2d9f3cd8299a3a4b15db63b50aac9e161d8eff7" + ], + "version": "==1.0.2" + }, + "sphinxcontrib-jsmath": { + "hashes": [ + "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", + "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8" + ], + "version": "==1.0.1" + }, + "sphinxcontrib-qthelp": { + "hashes": [ + "sha256:513049b93031beb1f57d4daea74068a4feb77aa5630f856fcff2e50de14e9a20", + "sha256:79465ce11ae5694ff165becda529a600c754f4bc459778778c7017374d4d406f" + ], + "version": "==1.0.2" + }, + "sphinxcontrib-serializinghtml": { + "hashes": [ + "sha256:c0efb33f8052c04fd7a26c0a07f1678e8512e0faec19f4aa8f2473a8b81d5227", + "sha256:db6615af393650bf1151a6cd39120c29abaf93cc60db8c48eb2dddbfdc3a9768" + ], + "version": "==1.1.3" + }, + "sshpubkeys": { + "hashes": [ + "sha256:9f73d51c2ef1e68cd7bde0825df29b3c6ec89f4ce24ebca3bf9eaa4a23a284db", + "sha256:b388399caeeccdc145f06fd0d2665eeecc545385c60b55c282a15a022215af80" + ], + "version": "==3.1.0" + }, "typed-ast": { "hashes": [ + "sha256:1170afa46a3799e18b4c977777ce137bb53c7485379d9706af8a59f2ea1aa161", "sha256:18511a0b3e7922276346bcb47e2ef9f38fb90fd31cb9223eed42c85d1312344e", "sha256:262c247a82d005e43b5b7f69aff746370538e176131c32dda9cb0f324d27141e", "sha256:2b907eb046d049bcd9892e3076c7a6456c93a25bebfe554e931620c90e6a25b0", "sha256:354c16e5babd09f5cb0ee000d54cfa38401d8b8891eefa878ac772f827181a3c", + "sha256:48e5b1e71f25cfdef98b013263a88d7145879fbb2d5185f2a0c79fa7ebbeae47", "sha256:4e0b70c6fc4d010f8107726af5fd37921b666f5b31d9331f0bd24ad9a088e631", "sha256:630968c5cdee51a11c05a30453f8cd65e0cc1d2ad0d9192819df9978984529f4", "sha256:66480f95b8167c9c5c5c87f32cf437d585937970f3fc24386f313a4c97b44e34", "sha256:71211d26ffd12d63a83e079ff258ac9d56a1376a25bc80b1cdcdf601b855b90b", + "sha256:7954560051331d003b4e2b3eb822d9dd2e376fa4f6d98fee32f452f52dd6ebb2", + 
"sha256:838997f4310012cf2e1ad3803bce2f3402e9ffb71ded61b5ee22617b3a7f6b6e", "sha256:95bd11af7eafc16e829af2d3df510cecfd4387f6453355188342c3e79a2ec87a", "sha256:bc6c7d3fa1325a0c6613512a093bc2a2a15aeec350451cbdf9e1d4bffe3e3233", "sha256:cc34a6f5b426748a507dd5d1de4c1978f2eb5626d51326e43280941206c209e1", "sha256:d755f03c1e4a51e9b24d899561fec4ccaf51f210d52abdf8c07ee2849b212a36", "sha256:d7c45933b1bdfaf9f36c579671fec15d25b06c8398f113dab64c18ed1adda01d", "sha256:d896919306dd0aa22d0132f62a1b78d11aaf4c9fc5b3410d3c666b818191630a", + "sha256:fdc1c9bbf79510b76408840e009ed65958feba92a88833cdceecff93ae8fff66", "sha256:ffde2fbfad571af120fcbfbbc61c72469e72f550d676c3342492a9dfdefb8f12" ], - "markers": "implementation_name == 'cpython'", "version": "==1.4.0" }, + "typing-extensions": { + "hashes": [ + "sha256:091ecc894d5e908ac75209f10d5b4f118fbdb2eb1ede6a63544054bb1edb41f2", + "sha256:910f4656f54de5993ad9304959ce9bb903f90aadc7c67a0bef07e678014e892d", + "sha256:cf8b63fedea4d89bab840ecbb93e75578af28f76f66c35889bd7065f5af88575" + ], + "index": "pypi", + "version": "==3.7.4.1" + }, "urllib3": { "hashes": [ - "sha256:b246607a25ac80bedac05c6f282e3cdaf3afb65420fd024ac94435cabe6e18d1", - "sha256:dbe59173209418ae49d485b87d1681aefa36252ee85884c31346debd19463232" + "sha256:3de946ffbed6e6746608990594d08faac602528ac7015ac28d33cee6a45b7398", + "sha256:9a107b99a5393caf59c7aa3c1249c16e6879447533d0887f4336dde834c7be86" ], "markers": "python_version >= '3.4'", - "version": "==1.25.3" + "version": "==1.25.6" }, "wcwidth": { "hashes": [ @@ -565,18 +1394,73 @@ ], "version": "==0.1.7" }, + "websocket-client": { + "hashes": [ + "sha256:1151d5fb3a62dc129164292e1227655e4bbc5dd5340a5165dfae61128ec50aa9", + "sha256:1fd5520878b68b84b5748bb30e592b10d0a91529d5383f74f4964e72b297fd3a" + ], + "version": "==0.56.0" + }, + "werkzeug": { + "hashes": [ + "sha256:7280924747b5733b246fe23972186c6b348f9ae29724135a6dfc1e53cea433e7", + "sha256:e5f4a1f98b52b18a93da705a7458e55afb26f32bff83ff5d19189f92462d65c4" + ], + "version": 
"==0.16.0" + }, "wrapt": { "hashes": [ "sha256:565a021fd19419476b9362b05eeaa094178de64f8361e44468f9e9d7843901e1" ], "version": "==1.11.2" }, - "zipp": { + "xmltodict": { "hashes": [ - "sha256:8c1019c6aad13642199fbe458275ad6a84907634cc9f0989877ccc4a2840139d", - "sha256:ca943a7e809cc12257001ccfb99e3563da9af99d52f261725e96dfe0f9275bc3" + "sha256:50d8c638ed7ecb88d90561beedbf720c9b4e851a9fa6c47ebd64e99d166d8a21", + "sha256:8bbcb45cc982f48b2ca8fe7e7827c5d792f217ecf1792626f808bf41c3b86051" ], - "version": "==0.5.1" + "version": "==0.12.0" + }, + "zipp": { + "hashes": [ + "sha256:3718b1cbcd963c7d4c5511a8240812904164b7f381b647143a89d3b98f9bcd8e", + "sha256:f06903e9f1f43b12d371004b4ac7b06ab39a44adc747266928ae6debfa7b3335" + ], + "version": "==0.6.0" + }, + "zope.interface": { + "hashes": [ + "sha256:086707e0f413ff8800d9c4bc26e174f7ee4c9c8b0302fbad68d083071822316c", + "sha256:1157b1ec2a1f5bf45668421e3955c60c610e31913cc695b407a574efdbae1f7b", + "sha256:11ebddf765bff3bbe8dbce10c86884d87f90ed66ee410a7e6c392086e2c63d02", + "sha256:14b242d53f6f35c2d07aa2c0e13ccb710392bcd203e1b82a1828d216f6f6b11f", + "sha256:1b3d0dcabc7c90b470e59e38a9acaa361be43b3a6ea644c0063951964717f0e5", + "sha256:20a12ab46a7e72b89ce0671e7d7a6c3c1ca2c2766ac98112f78c5bddaa6e4375", + "sha256:298f82c0ab1b182bd1f34f347ea97dde0fffb9ecf850ecf7f8904b8442a07487", + "sha256:2f6175722da6f23dbfc76c26c241b67b020e1e83ec7fe93c9e5d3dd18667ada2", + "sha256:3b877de633a0f6d81b600624ff9137312d8b1d0f517064dfc39999352ab659f0", + "sha256:4265681e77f5ac5bac0905812b828c9fe1ce80c6f3e3f8574acfb5643aeabc5b", + "sha256:550695c4e7313555549aa1cdb978dc9413d61307531f123558e438871a883d63", + "sha256:5f4d42baed3a14c290a078e2696c5f565501abde1b2f3f1a1c0a94fbf6fbcc39", + "sha256:62dd71dbed8cc6a18379700701d959307823b3b2451bdc018594c48956ace745", + "sha256:7040547e5b882349c0a2cc9b50674b1745db551f330746af434aad4f09fba2cc", + "sha256:7e099fde2cce8b29434684f82977db4e24f0efa8b0508179fce1602d103296a2", + 
"sha256:7e5c9a5012b2b33e87980cee7d1c82412b2ebabcb5862d53413ba1a2cfde23aa", + "sha256:81295629128f929e73be4ccfdd943a0906e5fe3cdb0d43ff1e5144d16fbb52b1", + "sha256:95cc574b0b83b85be9917d37cd2fad0ce5a0d21b024e1a5804d044aabea636fc", + "sha256:968d5c5702da15c5bf8e4a6e4b67a4d92164e334e9c0b6acf080106678230b98", + "sha256:9e998ba87df77a85c7bed53240a7257afe51a07ee6bc3445a0bf841886da0b97", + "sha256:a0c39e2535a7e9c195af956610dba5a1073071d2d85e9d2e5d789463f63e52ab", + "sha256:a15e75d284178afe529a536b0e8b28b7e107ef39626a7809b4ee64ff3abc9127", + "sha256:a6a6ff82f5f9b9702478035d8f6fb6903885653bff7ec3a1e011edc9b1a7168d", + "sha256:b639f72b95389620c1f881d94739c614d385406ab1d6926a9ffe1c8abbea23fe", + "sha256:bad44274b151d46619a7567010f7cde23a908c6faa84b97598fd2f474a0c6891", + "sha256:bbcef00d09a30948756c5968863316c949d9cedbc7aabac5e8f0ffbdb632e5f1", + "sha256:d788a3999014ddf416f2dc454efa4a5dbeda657c6aba031cf363741273804c6b", + "sha256:eed88ae03e1ef3a75a0e96a55a99d7937ed03e53d0cffc2451c208db445a2966", + "sha256:f99451f3a579e73b5dd58b1b08d1179791d49084371d9a47baad3b22417f0317" + ], + "version": "==4.6.0" } } } diff --git a/README.md b/README.md index b9dd5ed..77242c0 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,22 @@ # arXiv NG Canonical Record This repository contains a library and applications for working with the core -arXiv canonical record. The canonical record is the authoritative history and +arXiv canonical record. The canonical record is the authoritative history and state for announced e-prints on the arXiv platform. 
-Work on this project will proceed in two phases, each corresponding to a major +Work on this project will proceed in two phases, each corresponding to a major version: ## Version 0: Replication of the Legacy Record to the Canonical Record -The first major objective of this project is to replicate all of the core -announcement events that occur in the legacy system to the cloud-native +The first major objective of this project is to replicate all of the core +announcement events that occur in the legacy system to the cloud-native canonical record. -- The legacy system emits event notifications via a Kinesis stream for new +- The legacy system emits event notifications via a Kinesis stream for new e-prints, replacements, cross-listing, withdrawals, and updates. - An announcement agent (``announcement/`` in this repo)... - + - consumes legacy events, - retrieves metadata, source package, and first-compiled PDF from legacy, - formats and stores content as part of the canonical record. The canonical @@ -27,17 +27,57 @@ canonical record. content, and events available via a RESTful JSON API. This is a Flask application that will be deployed as a Docker container. -Both the ``announcement/`` and ``repository/`` applications use the +Both the ``announcement/`` and ``repository/`` applications use the ``arxiv.canonical`` package (``arxiv/canonical/`` in this repo) to interact with the canonical record. +### Implementation notes + +- ``arxiv.canonical.classic`` provides a CLI with ``backfill`` and + ``backfill_today`` commands. + + - ``backfill`` can be used to backfill the NG canonical record from the legacy + record. + - ``backfill_today`` can be used to update the NG canonical record from the + legacy record on a daily basis. This should be run after the announcement + process has completed. + - These commands should be extended with an option to also propagate events + once they are successfully backfilled. See that module's docstring for + details. 
+ +- ``repository/`` has a minimal integration with the updated + ``arxiv.canonical`` package. This could be a guide for a similar service + module in the ``browse`` application. + +### TODO + +- [ ] Consider removing entirely the ``render`` property throughout + ``arxiv.canonical``. In the initial implementation, only a source package + and a single rendered output (e.g. PDF) were stored. In the current + implementation, the source file plus all classic dissemination formats are + preserved. Thus ``render`` is more or less obviated. +- [ ] Some attention to the semantics of exceptions throughout + ``arxiv.canonical``. In many places we are still using native Python + exceptions that may not provide the most meaningful information or be used + consistently. +- [ ] Implementation of the daily preservation package. This will involve + implementing supporting structs in ``domain``, ``record``, and ``integrity`` + modules, and probably something analogous to the ``register`` API for + constructing and storing the daily preservation package. +- [ ] Integration in arxiv-browse. The recommended approach is to treat browse + as a flavor of the canonical repository (see + https://arxiv.github.io/arxiv-arxitecture/subsystems/announcement.html#primary-repository) + with an HTML interface. See ``repository/`` in this repo for how this + integration could look. The ``RegisterAPI`` may need to be extended to + support some of browse's requirements, such as listing events by week. + ## Version 1: Orchestration of the Announcement Process Once several other dependencies are resolved in the legacy system, this project -will assume primary responsibility for announcing submitted e-prints on a +will assume primary responsibility for announcing submitted e-prints on a daily basis. This is a bit further down the road. -# Contributing +# Contributing For a list of things that need doing, please see the issues tracker for this repository. 
@@ -62,7 +102,7 @@ and the corresponding ``wsgi_[xxx].py`` entrypoints. ## AWS services, mocking It's helpful to use a live API when developing components against AWS services. -We use [Localstack](https://github.com/localstack/localstack) for this +We use [Localstack](https://github.com/localstack/localstack) for this purpose. ## Contributor guidelines @@ -101,16 +141,16 @@ The canonical record can be stored on any system that supports a key-binary data structure, such as a filesystem or an object store. The two core data structures in the record are: -1. E-prints, comprised of... +1. E-prints, comprised of... - - metadata, - - submitted content, + - metadata, + - submitted content, - and the first rendering of the PDF. 2. Announcement records, representing a single announcement-related event, such as a new version, a withdrawal, or a cross-list; these records are: - - organized into daily announcement listing files and + - organized into daily announcement listing files and - emitted via a notification broker in real time, to trigger updates to downstream services and data stores. @@ -119,7 +159,7 @@ An e-print is comprised of (1) a metadata record, (2) a source package, containing the original content provided by the submitter, and (3) a canonical rendering of the e-print in PDF format. A manifest is also stored for each e-print, containing the keys for the resources above and a base-64 encoded MD5 -hash of their binary content. +hash of their binary content. The key prefix structure for an e-print record is: @@ -128,12 +168,12 @@ e-prints////v/ ``` Where ``YYYY`` is the year and ``MM`` the month during which the first version -of the e-print was announced. +of the e-print was announced. Sub-keys are: - Metadata record: ``v.json`` -- Source package: ``v.tar.gz`` +- Source package: ``v.tar`` - PDF: ``v.pdf`` - Manifest: ``v.manifest.json`` @@ -144,7 +184,7 @@ versions of an e-print. 
## Announcement listings The announcement listings commemorate the announcement-related events that occur on a given day. This includes new e-prints/versions, withdrawals, -cross-lists, etc. +cross-lists, etc. The key prefix structure for an announcement listing file is: @@ -155,11 +195,11 @@ announcement///
/ Each daily key prefix may contain one or more sub-keys. Each sub-key ending in .json is treated as a listing file. This allows for the possibility of sharded/multi-threaded announcement processes that write separate listing -files, e.g. for specific classification domains. +files, e.g. for specific classification domains. ``YYYY`` is the year, ``MM`` the month, and ``DD`` the day on which the announcement events encoded therein occurred and on which the subordinate -listing files were generated. +listing files were generated. ## Preservation record The preservation record is a daily digest containing e-print content, @@ -170,10 +210,10 @@ corresponding tombstones). ``` announcement/.json e-prints/v/ - v.json # Metadata record -v.tar.gz # Source package -v.pdf # First PDF -v.manifest.json # Manifest. + v.json # Metadata record + v.tar # Source package + v.pdf # First PDF + v.manifest.json # Manifest. suppress/v/tombstone preservation.manifest.json ``` diff --git a/announcement/README.md b/announcement/README.md index 9107bca..488bfc1 100644 --- a/announcement/README.md +++ b/announcement/README.md @@ -3,55 +3,7 @@ The announcement agent is responsible for adding new e-prints to the canonical record. -## Version 0 : Clone legacy announcement record to canonical format - -In v0 of the announcement agent, the e-print event consumer... - -- Processes events from a Kinesis stream (see below). -- Retrieves metadata, content for e-prints from the legacy system. -- Uses ``arxiv.canonical`` to update the canonical record in the cloud. - -## Events - -The legacy system produces e-print events on a Kinesis stream called -``Announce``. Each message has the structure: - -```json -{ - "event_type": "...", - "identifier": "...", - "version": "...", - "timestamp": "..." -} -``` - -``event_type`` may be one of: - -| Event type | Description | -|------------|----------------------------------------------------------------| -| ``new`` | An e-print is announced for the first time. 
| -| ``updated`` | An e-print is updated without producing a new version. | -| ``replaced`` | A new version of an e-print is announced. | -| ``cross-list`` | Cross-list classifications are added for an e-print. | -| ``withdrawn`` | An e-print is withdrawn. This generates a new version. | - -``identifier`` is an arXiv identifier without a version affix. - -``version`` is a positive integer. - -``timestamp`` is an ISO-8601 datetime, localized to UTC. - -## Legacy integration - -Metadata, PDFs, and source are retrieved from the legacy system via HTTP -request. - -- Metadata: ``arxiv.org/docmeta/{IDENTIFIER}v{VERSION}`` -- PDF: ``arxiv.org/pdf/{IDENTIFIER}v{VERSION}`` -- Source: ``arxiv.org/src/{IDENTIFIER}v{VERSION}`` - - -# Contributing +# Contributing For a list of things that need doing, please see the issues tracker for this repository. diff --git a/announcement/announcement/agent/consumer.py b/announcement/announcement/agent/consumer.py index 325d7c5..470d4eb 100644 --- a/announcement/announcement/agent/consumer.py +++ b/announcement/announcement/agent/consumer.py @@ -1,7 +1,7 @@ """ E-print event consumer. -In v0 of the announcement agent, the e-print event consumer processes +In v0 of the announcement agent, the e-print event consumer processes notifications about announcement events generated by the legacy system, and updates its version of the canonical record. @@ -15,7 +15,7 @@ "event_type": "...", "identifier": "...", "version": "...", - "timestamp": "..." + "timestamp": "..." } ``` @@ -29,7 +29,7 @@ | cross-list | Cross-list classifications are added for an e-print. | | withdrawn | An e-print is withdrawn. This generates a new version. | -``identifier`` is an arXiv identifier; see :class:`.Identifier`. +``identifier`` is an arXiv identifier; see :class:`.Identifier`. ``version`` is a positive integer. @@ -42,6 +42,7 @@ (https://github.com/arXiv/arxiv-base/blob/master/arxiv/integration/kinesis/consumer/__init__.py). 
""" +from typing import Any from arxiv.integration.kinesis.consumer import BaseConsumer @@ -54,7 +55,7 @@ class AnnouncementConsumer(BaseConsumer): """Consumes announcement events, and updates the canonical record.""" - def __init__(self, *args, **kwargs) -> None: + def __init__(self, *args: Any, **kwargs: Any) -> None: super(AnnouncementConsumer, self).__init__(*args, **kwargs) self._metadata_service = LegacyMetadataService.current_session() self._pdf_service = LegacyPDFService.current_session() diff --git a/announcement/announcement/config.py b/announcement/announcement/config.py index 6eaac11..7425fc9 100644 --- a/announcement/announcement/config.py +++ b/announcement/announcement/config.py @@ -4,10 +4,21 @@ Docstrings are from the `Flask configuration documentation `_. """ -from typing import Optional +from typing import Any, Optional, Type import warnings from os import environ + +def _showwarning(message: str, + *args: Any, + category: Type[Exception] = UserWarning, + filename: str = '', + lineno: int = -1, + **kwargs: Any) -> None: + print(message) + +warnings.showwarning = _showwarning + NAMESPACE = environ.get('NAMESPACE') """Namespace in which this service is deployed; to qualify keys for secrets.""" @@ -245,7 +256,7 @@ BASE_SERVER = environ.get('BASE_SERVER', 'arxiv.org') URLS = [ - + ] """ URLs for external services, for use with :func:`flask.url_for`. diff --git a/arxiv/canonical/__init__.py b/arxiv/canonical/__init__.py index f419752..50c9c1f 100644 --- a/arxiv/canonical/__init__.py +++ b/arxiv/canonical/__init__.py @@ -52,7 +52,7 @@ of the e-print was announced. Sub-keys are: - Metadata record: ``v.json`` -- Source package: ``v.tar.gz`` +- Source package: ``v.tar`` - PDF: ``v.pdf`` - Manifest: ``v.manifest.json`` @@ -92,7 +92,7 @@ announcement/.json e-prints/v/ v.json # Metadata record - v.tar.gz # Source package + v.tar # Source package v.pdf # PDF Manifest: v.manifest.json suppress/v/tombstone @@ -136,3 +136,5 @@ """ from . 
import serialize, domain +from .register import NoSuchResource +from .role import Primary, Repository, Replicant, Observer diff --git a/arxiv/canonical/classic/__init__.py b/arxiv/canonical/classic/__init__.py new file mode 100644 index 0000000..1ec079a --- /dev/null +++ b/arxiv/canonical/classic/__init__.py @@ -0,0 +1,10 @@ +""" +(De)Serialization of the classic announcement record. + +There are two sources of information that can be used to piece together the +announcement history of the classic record: + +1. The daily.log file contains a daily record of new, replacement, and + cross-list announcements. +2. The classic abs file contains version metadata. +""" \ No newline at end of file diff --git a/arxiv/canonical/classic/__main__.py b/arxiv/canonical/classic/__main__.py new file mode 100644 index 0000000..37a2705 --- /dev/null +++ b/arxiv/canonical/classic/__main__.py @@ -0,0 +1,5 @@ +"""Main CLI endpoint for :mod:`arxiv.canonical.classic`.""" + +from .cli import cli + +cli.main() diff --git a/arxiv/canonical/classic/abs.py b/arxiv/canonical/classic/abs.py new file mode 100644 index 0000000..b019cf7 --- /dev/null +++ b/arxiv/canonical/classic/abs.py @@ -0,0 +1,414 @@ +"""Parse fields from a single arXiv abstract (.abs) file.""" + +import os +import re +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union, \ + NamedTuple +from functools import wraps +from dateutil import parser +from pytz import timezone +from datetime import datetime, date +from dateutil.tz import tzutc, gettz + +from .. 
import domain as D + +AnyIdentifier = Union[D.VersionedIdentifier, D.Identifier] + +EASTERN = gettz('US/Eastern') + +RE_ABS_COMPONENTS = re.compile(r'^\\\\\n', re.MULTILINE) +RE_FROM_FIELD = re.compile( + r'(?PFrom:\s*)(?P[^<]+)?\s+(<(?P.*)>)?') +RE_DATE_COMPONENTS = re.compile( + r'^Date\s*(?::|\(revised\s*(?P.*?)\):)\s*(?P.*?)' + r'(?:\s+\((?P\d+)kb,?(?P.*)\))?$') +RE_FIELD_COMPONENTS = re.compile( + r'^(?P[-a-z\)\(]+\s*):\s*(?P.*)', re.IGNORECASE) +RE_ARXIV_ID_FROM_PREHISTORY = re.compile( + r'(Paper:\s+|arXiv:)(?P\S+)') + +NAMED_FIELDS = ['Title', 'Authors', 'Categories', 'Comments', 'Proxy', + 'Report-no', 'ACM-class', 'MSC-class', 'Journal-ref', + 'DOI', 'License'] +""" +Fields that may be parsed from the key-value pairs in second +major component of .abs string. Field names are not normalized. +""" + +REQUIRED_FIELDS = ['title', 'authors', 'abstract'] +""" +Required parsed fields with normalized field names. + +Note the absense of 'categories' as a required field. A subset of version- +affixed .abs files with the old identifiers predate the introduction of +categories and therefore do not have a "Categories:" line; only the (higher- +level) archive and group can be be inferred, and this must be done via the +identifier itself. + +The latest versions of these papers should always have the "Categories:" line. 
class AbsData(NamedTuple):
    """Everything that ``_parse`` extracts from a single classic abs file."""

    # Versioned identifier of the e-print version this abs file describes;
    # ``_parse`` takes it from the last entry of the file's version history.
    identifier: D.VersionedIdentifier
    # Built from the name on the ``From:`` line; ``None`` when no name is found.
    submitter: Optional[D.Person]
    # Parsed from the ``Date`` line of the last version entry.
    submitted_date: datetime
    # Derived from the YYMM part of the arXiv identifier by
    # ``_parse_announced``, not from any date recorded in the abs file.
    announced_month: str
    # Modification time of the abs file itself, converted to UTC; ``_parse``
    # uses it as the best available estimate of the last update.
    updated_date: datetime
    # From the ``License`` field when present, otherwise ``ASSUMED_LICENSE``.
    license: D.License
    # First entry of the ``Categories`` field, or (when that field is missing)
    # the archive part of an old-style identifier.
    # NOTE(review): ``_parse`` passes the raw string here rather than a
    # ``D.Category`` instance -- confirm which one callers expect.
    primary_classification: D.Category
    title: str
    abstract: str
    authors: str
    # Size reported on the ``Date`` line of the last version entry. ``_parse``
    # treats a size of 0 on any version after the first as a withdrawal.
    size_kilobytes: int
    # NEW for version 1, WITHDRAWN for a later version of size 0, otherwise
    # REPLACED (see ``_parse``).
    submission_type: D.EventType
    # Remaining entries of the ``Categories`` field after the primary; empty
    # when the field is missing.
    secondary_classification: List[D.Category]
    # Source type parsed from the ``Date`` line of the last version entry.
    source_type: Optional[D.SourceType] = None
    # The fields below come from the optional key-value lines of the abs file
    # and are ``None`` when the corresponding line is absent.
    journal_ref: Optional[str] = None
    report_num: Optional[str] = None
    doi: Optional[str] = None
    msc_class: Optional[str] = None
    acm_class: Optional[str] = None
    proxy: Optional[str] = None
    comments: str = ''
    # References to all earlier versions listed in the version history, i.e.
    # every entry except the last one.
    previous_versions: Optional[List[AbsRef]] = None
def original_path_month(data_path: str, identifier: AnyIdentifier) -> str:
    """
    Get the month directory holding the "original" abs files for an e-print.

    Every version of an e-print except the most recent one is kept here.
    Old-style identifiers are filed under their category part; all other
    identifiers are filed under ``arxiv``.
    """
    base = original_base_path(data_path)
    if identifier.is_old_style:
        section = identifier.category_part
    else:
        section = 'arxiv'
    return os.path.join(base, section, 'papers', identifier.yymm)
def list_versions(data_path: str, identifier: D.Identifier) \
        -> List[D.VersionedIdentifier]:
    """
    List all of the versions for an identifier from abs files.

    This works by looking at the presence of abs files in both the "latest"
    and "original" locations. The "original" location holds an explicitly
    versioned abs file for every version except the most recent one, so the
    most recent version is inferred as one more than the highest version
    found there. If no versioned abs file exists but an abs file is present
    in the "latest" location, the e-print has exactly one version.

    Parameters
    ----------
    data_path : str
        Directory that contains the ``ftp`` and ``orig`` subdirectories.
    identifier : :class:`D.Identifier`
        Identifier (without version) of the e-print.

    Returns
    -------
    list
        :class:`D.VersionedIdentifier` for each version, in ascending
        numeric order. Empty if no abs file can be found.

    """
    # Match only ``<numeric part>v<N>.abs``. A bare prefix test would also
    # accept abs files that belong to a different e-print whose numeric part
    # merely starts with the same characters.
    version_file = re.compile(
        rf'^{re.escape(identifier.numeric_part)}v([1-9]\d*)\.abs$'
    )

    found: List[int] = []
    orig_month_root = original_path_month(data_path, identifier)
    for _, _, fnames in os.walk(orig_month_root):
        for filename in fnames:
            matched = version_file.match(filename)
            if matched:
                found.append(int(matched.group(1)))

    # Versions must be ordered numerically: sorting the file names as strings
    # would put v10 before v2 and lead to the wrong "latest" version.
    versions = sorted(set(found))
    if versions:
        # We are looking only at past versions above; the most recent version
        # lives in the "latest" location. We can infer its existence.
        versions.append(versions[-1] + 1)
    elif os.path.exists(latest_path(data_path, identifier)):
        # There is only one version, the first version, and it is the
        # latest version.
        versions.append(1)
    return [D.VersionedIdentifier.from_parts(identifier, v) for v in versions]
+ prehistory, misc_fields = re.split(r'\n\n', components[1]) + + fields: Dict[str, Any] = _parse_metadata(key_value_block=misc_fields) + # Abstract is the first main component. + fields['abstract'] = components[2] + + id_match = RE_ARXIV_ID_FROM_PREHISTORY.match(prehistory) + if not id_match: + raise IOError('Could not extract arXiv ID from prehistory component.') + + arxiv_id = id_match.group('arxiv_id') + prehistory = re.sub(r'^.*\n', '', prehistory) + parsed_version_entries = re.split(r'\n', prehistory) + + # Submitter data. + from_match = RE_FROM_FIELD.match(parsed_version_entries.pop(0)) + if not from_match: + raise IOError('Could not extract submitter data.') + + name = from_match.group('name') + if name is not None: + name = name.rstrip() + + # Get the version history for this particular version of the document. + if not len(parsed_version_entries) >= 1: + raise IOError('At least one version entry expected.') + + versions = _parse_versions(arxiv_id=arxiv_id, + version_entry_list=parsed_version_entries) + + secondary_classification: List[str] = [] + if 'categories' in fields and fields['categories']: + classifications = fields['categories'].split() + primary_classification = classifications[0] + secondary_classification = classifications[1:] + else: + match = RE_ARXIV_OLD_ID.match(arxiv_id) + if not match: + raise IOError('Could not determine primary classification') + primary_classification = match.group('archive') + + if 'license' in fields: + license = D.License(fields['license']) + else: + license = ASSUMED_LICENSE + + if versions[-1].identifier.version == 1: + submission_type = D.EventType.NEW + elif versions[-1].size_kilobytes == 0: + submission_type = D.EventType.WITHDRAWN + else: + submission_type = D.EventType.REPLACED + + return AbsData( + identifier=versions[-1].identifier, + submitter=D.Person(full_name=name) if name else None, + submitted_date=versions[-1].submitted_date, + announced_month=versions[-1].announced_month, + updated_date=modified, + 
def _parse_announced(arxiv_id: str) -> str:
    """
    Infer the year and month of announcement from an arXiv identifier.

    Both the old-style and the new-style identifier embed a two-digit year
    and a two-digit month. Two-digit years above 90 are placed in the 1900s,
    all others in the 2000s.

    Parameters
    ----------
    arxiv_id : str
        An old-style or new-style arXiv identifier, with or without a
        version affix.

    Returns
    -------
    str
        Year and month in the form ``YYYY-MM``, zero-padded (for example
        ``2007-04``).

    Raises
    ------
    ValueError
        If ``arxiv_id`` matches neither identifier format.

    """
    match = RE_ARXIV_OLD_ID.match(arxiv_id)
    if not match:
        match = RE_ARXIV_NEW_ID.match(arxiv_id)
    if not match:
        raise ValueError('Not a valid arXiv ID')
    yy = int(match.group('yy'))
    mm = int(match.group('mm'))
    century = 19 if yy > 90 else 20
    # Pad both parts to two digits; formatting the bare ints would turn
    # yy=7, mm=4 into '207-4' instead of '2007-04'.
    return f'{century}{yy:02d}-{mm:02d}'
RE_DATE_COMPONENTS.match(parsed_version_entry) + if not date_match: + raise IOError('Could not extract date components from date line.') + try: + sd = date_match.group('date') + submitted_date = parser.parse(date_match.group('date')) + except (ValueError, TypeError): + raise IOError(f'Could not parse submitted date {sd} as datetime') + + source_type = D.SourceType(date_match.group('source_type')) + size_kilobytes = int(date_match.group('size_kilobytes')) + V = len(version_entries) + 1 + identifier = \ + D.VersionedIdentifier(f'{D.Identifier(arxiv_id)}v{V}') + version_entries.append( + AbsRef( + identifier=identifier, + submitted_date=submitted_date, + announced_month=_parse_announced(arxiv_id), + source_type=source_type, + size_kilobytes=size_kilobytes + ) + ) + + return version_entries diff --git a/arxiv/canonical/classic/backfill.py b/arxiv/canonical/classic/backfill.py new file mode 100644 index 0000000..433304f --- /dev/null +++ b/arxiv/canonical/classic/backfill.py @@ -0,0 +1,723 @@ +""" +Functions for backfilling the NG record from classic. + +In order to ensure a smooth transition from classic to the NG announcement +process, we need to be able to initially operate both the classic and NG +canonical records in parallel. This means that we need to be able to: + +1. Backfill the canonical record from the classic record, starting at the + beginning of time and running up to the present. See :func:`backfill`. +2. Continuously update the canonical record from data in the classic system. + See :func:`backfill_today`. + +This module is implemented on the assumption that its functions will be +executed on a machine with access to the classic filesystem, specifically to +the abs/source files and daily.log file. It is agnostic, however, about the +target storage medium for the canonical record. So these functions can be +used to backfill the canonical record both on local filesystems and in (for +example) an S3 bucket. + +What version is this? 
+===================== +The lacuna of the classic record is an unambiguous mapping between announcement +events and specific versions of an e-print. For example, if we encounter a +replacement event in the daily.log file, there is no explicit indication of +whether the resulting version is 2, 3, or some higher value. The abs file +does not provide this information either, as only the submission date of each +version is preserved (although this could at least be used as a lower bound). +So, we need to get creative. + +Start at the beginning of time. Initialize a counter that keeps track of the +last version number seen for each e-print identifier. + +Prior to the start of the daily.log (mid-1998): Read the abs file for each +e-print, and generate a ``new`` and subsequent ``replace`` event(s) **using the +submission date(s) as the announcement date(s)**. + +Read daily.log in order. Rely on the version number mapping to keep track of +where we are with each e-print. + +""" +import logging +import os + +from collections import Counter +from datetime import date, datetime +from functools import partial +from itertools import chain +from operator import attrgetter + +from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, \ + Set, Tuple + +from pytz import timezone +from pprint import pprint +from typing_extensions import Protocol + +from ..domain import CanonicalFile, Category, ContentType, Event, EventSummary, EventType, \ + Identifier, Metadata, Version, VersionedIdentifier +from ..log import Log, WRITE +from ..register import IRegisterAPI +from . import abs, daily, content +from .util import PersistentIndex, PersistentList + +logger = logging.getLogger(__name__) +logger.setLevel(int(os.environ.get('LOGLEVEL', '40'))) + +ET = timezone('US/Eastern') + + +class _ILoader(Protocol): + """ + Interface for functions that load events from the classic record. + + This is here mostly because the semantics for ``typing.Callable`` are + pretty limited. 
class _CF_PersistentIndex(PersistentIndex):
    """
    Persistent index of :class:`CanonicalFile` entries for classic content.

    Callers address entries with a ``(VersionedIdentifier, ContentType)``
    tuple. Each entry is kept in the parent :class:`PersistentIndex` under the
    flat string key ``'<identifier>::<content type value>'``.
    """

    @staticmethod
    def _skey(key: _CF_Key) -> str:
        """Flatten an ``(identifier, content type)`` tuple to a string key."""
        return f'{key[0]}::{key[1].value}'

    def __getitem__(self, key: _CF_Key) -> CanonicalFile:
        """Return the file stored for ``key``; lookup is done by the parent."""
        cf: CanonicalFile = super().__getitem__(self._skey(key))
        return cf

    def __setitem__(self, key: _CF_Key, value: CanonicalFile) -> None:
        """Store ``value`` for ``key`` in the parent index."""
        # Must go through the parent's *item* protocol (not ``__setattr__``):
        # ``__getitem__`` and ``__contains__`` read items, so anything written
        # as an instance attribute would never be found again.
        super().__setitem__(self._skey(key), value)

    def __contains__(self, key: Any) -> bool:
        """Return True if ``key`` is a tuple with an entry in the index."""
        if not isinstance(key, tuple):
            # Only tuple keys can be flattened; anything else is not ours.
            return False
        return bool(super().__contains__(self._skey(key)))
+ state_path : str + Absolute path of a writeable directory where backfill state can be + stored. This allows us to persist the backfill state, in case we need + to restart after a failure. + limit_to : set + A set of :class:`Identifier`s indicating a subset of e-prints to + backfill. If ``None`` (default) all events for all e-prints are + backfilled. + cache_path : str + If provided, a writable directory where a cache of events can be + maintained. This cuts down on spin-up time considerably. + + Returns + ------- + iterator + Yields :class:`Event`s that have been successfully backfilled. + + """ + loader = partial(_load_all, daily_path, abs_path, ps_cache_path, + cache_path=cache_path) + return _backfill(register, loader, state_path, limit_to=limit_to, + until=until) + + +def backfill_today(register: IRegisterAPI, + daily_path: str, + abs_path: str, + ps_cache_path: str, + state_path: str, + cache_path: Optional[str] = None) -> Iterable[Event]: + """ + Lazily backfill the canonical record from today's events in classic record. + + This is intended to be used to keep the canonical record up to date from + the classic record on a daily basis, after the initial backfill. + + Note: you **must** consume this iterator in order for backfilling to occur. + This was implemented lazily because there is considerable I/O (including + possibly some over the network), and being able to control processing rate + at a high level was foreseen as important. + + Parameters + ---------- + register : :class:`IRegisterAPI` + A canonical register instance that will handle events derived from the + classic record. + daily_path : str + Absolute path to the daily.log file. + abs_path : str + Absolute path of the directory containing abs files and source + packages. Specifically, this is the directory that contains the ``ftp`` + and ``orig`` subdirectories. + state_path : str + Absolute path of a writeable directory where backfill state can be + stored. 
def _backfill(register: IRegisterAPI,
              loader: _ILoader,
              state_path: str,
              limit_to: Optional[Set[Identifier]] = None,
              until: Optional[date] = None) -> Iterable[Event]:
    """
    Feed events from ``loader`` into ``register``, yielding each one handled.

    Parameters
    ----------
    register : :class:`IRegisterAPI`
        Receives every event that is not skipped, via ``add_events``.
    loader : callable
        Called with ``current``, ``first``, ``limit_to`` and ``cf_cache``;
        must return an iterable of :class:`Event`.
    state_path : str
        Directory holding ``first.json``, ``current.json``, ``content.json``
        and the progress log. Created if it does not exist.
    limit_to : set
        Passed through to ``loader`` unchanged.
    until : date
        If given, iteration stops at the first event whose date is on or
        after this date.

    Returns
    -------
    iterator
        Yields each :class:`Event` after it has been added to the register
        and logged as a success.

    """
    # State is kept on disk so that a later run can pick up where this one
    # stopped; the directory is created on first use.
    if not os.path.exists(state_path):
        os.makedirs(state_path)

    def _state_file(filename: str) -> str:
        return os.path.join(state_path, filename)

    first = PersistentIndex()
    first.load(_state_file('first.json'))
    current = PersistentIndex()
    current.load(_state_file('current.json'))
    cf_cache = _CF_PersistentIndex()
    cf_cache.load(_state_file('content.json'))
    progress = Log(state_path)   # Records per-event success/failure.

    # If an earlier run got somewhere, everything up to and including its
    # last successful event is skipped.
    resume_after = progress.read_last_succeeded()
    skipping = bool(resume_after)
    event: Optional[Event] = None
    n_handled = 0

    logger.info('Starting backfill')    # The logger is for humans only.
    if skipping:
        logger.info(f'Skip until {resume_after}')

    try:
        # Every event is loaded and parsed, even those we skip: the loader
        # has no cheap way to jump ahead. Filtering by ``limit_to`` is the
        # loader's job, not ours.
        for event in loader(current=current, first=first, limit_to=limit_to,
                            cf_cache=cf_cache):
            logger.debug('Got event %s for %s (%s)', event.event_id,
                         event.identifier, event.event_date)

            if skipping:
                reached_resume_point = (
                    resume_after
                    and event.event_id == resume_after.event_id
                )
                if reached_resume_point:
                    skipping = False    # Real work starts with the next one.
                continue

            if until and event.event_date.date() >= until:
                break   # Caller asked us to stop at this date.

            logger.info(f'Handling: {event.event_date}: {event.identifier}')
            register.add_events(event)
            progress.log_success(event.event_id, WRITE)
            n_handled += 1
            logger.debug('Successfully handled %i events', n_handled)
            cf_cache.save()     # Persisted only after a successful write.
            yield event
    except Exception as exc:
        logger.error('Encountered unhandled exception: %s (%s)',
                     exc, type(exc))
        if event:
            # The log is JSON-lines, so a multi-line message is harmless.
            progress.log_failure(event.event_id, WRITE,
                                 message=f'Encountered error: {exc}')
        raise
    finally:
        # Whatever happened, flush the version/first-announcement mappings.
        first.save()
        current.save()
def _load_all(daily_path: str,
              abs_path: str,
              ps_cache_path: str,
              current: Optional[MutableMapping[Identifier, int]] = None,
              first: Optional[MutableMapping[Identifier, date]] = None,
              cf_cache: Optional[_CF_PersistentIndex] = None,
              limit_to: Optional[Set[Identifier]] = None,
              cache_path: Optional[str] = None) -> Iterable[Event]:
    """
    Load all classic events, using both abs files and the daily.log.

    Pre-daily.log events are inferred from the abs files. Events derived from
    daily.log are supplemented by the abs files to infer things like version
    number.

    Parameters
    ----------
    daily_path : str
        Absolute path to the daily.log file.
    abs_path : str
        Absolute path of the directory containing abs files and source
        packages. Specifically, this is the directory that contains the ``ftp``
        and ``orig`` subdirectories.
    ps_cache_path : str
        Path handed on to the content-format lookups performed while building
        each event.
    current : mapping
        A ``dict`` or other mutable mapping of :class:`Identifier`s onto the
        most recent loaded numeric version of the corresponding e-print.
    first : mapping
        A ``dict`` or other mutable mapping of :class:`Identifier`s onto the
        announcement date of the first version of the corresponding e-print.
    cf_cache : :class:`_CF_PersistentIndex`
        Optional index of already-resolved files, handed on to the event
        builders.
    limit_to : set
        A set of :class:`Identifier`s indicating a subset of e-prints to load.
        If ``None`` (default) all events for all e-prints are loaded.
    cache_path : str
        If provided, a writable directory where a cache of events can be
        maintained. This cuts down on spin-up time considerably.

    Returns
    -------
    iterator
        Yields :class:`Event`s: first the pre-daily events sorted by event
        date, then the daily.log events in the order they are parsed.

    Raises
    ------
    RuntimeError
        If daily.log yields no entry at all, or its first day has no ``new``
        event from which to derive the earliest covered identifier.

    """
    if current is None:
        current = {}
    if first is None:
        first = {}

    # Normalize all of our paths. ``cache_path`` is optional, and
    # ``os.path.abspath(None)`` raises TypeError, so it must be guarded.
    daily_path = os.path.abspath(daily_path)
    abs_path = os.path.abspath(abs_path)
    ps_cache_path = os.path.abspath(ps_cache_path)
    if cache_path is not None:
        cache_path = os.path.abspath(cache_path)

    logger.info('Load events from %s', daily_path)
    # We can't infer whether an abs file was written prior to the daily.log
    # from the abs file alone. But if the e-print identifier comes prior to the
    # earliest identifier for a ``new`` event in the daily.log, then we can
    # be certain it is not covered in the daily.log.
    first_entry = next(
        iter(daily.parse(daily_path, cache_path=cache_path)),
        None
    )
    if first_entry is None:
        raise RuntimeError('Could not load the first entry from daily.log')
    logger.info('First: %s: %s', first_entry.event_date, first_entry.arxiv_id)

    first_day = daily.parse(daily_path, for_date=first_entry.event_date,
                            cache_path=cache_path)
    new_identifiers = sorted(Identifier(ed.arxiv_id) for ed in first_day
                             if ed.event_type is EventType.NEW)
    logger.info('Found %i NEW events on the first day', len(new_identifiers))
    if not new_identifiers:
        # Without a NEW event there is no boundary between the pre-daily and
        # daily records; fail with a clear message instead of an IndexError.
        raise RuntimeError('No NEW events found on the first day of daily.log')
    first_identifier = new_identifiers[0]
    logger.info('The earliest NEW identifier is %s', first_identifier)

    # These are e-prints that were first announced prior to the beginning of
    # the daily.log file, i.e. we have no ``new`` event.
    ids_prior_to_first_event = \
        list(abs.iter_all(abs_path, to_id=first_identifier))

    # Load all of the pre-daily events at once; they must be sorted globally,
    # so they cannot be produced lazily.
    logger.info('Loading pre-daily events for %i identifiers...',
                len(ids_prior_to_first_event))
    predaily_events: List[Event] = []
    for ident in ids_prior_to_first_event:
        if limit_to and ident not in limit_to:
            continue
        predaily_events.extend(
            _load_predaily(daily_path, abs_path, ps_cache_path, ident,
                           current, first, cache_path=cache_path,
                           cf_cache=cf_cache)
        )
    predaily_events.sort(key=attrgetter('event_date'))

    logger.info('Loaded %i pre-daily events', len(predaily_events))

    # Lazily load the daily events.
    daily_events = _load_events(abs_path, daily_path, ps_cache_path,
                                current, first, limit_to=limit_to,
                                cache_path=cache_path,
                                cf_cache=cf_cache)
    return chain(predaily_events, daily_events)
+ + Approach: + + - v1 announced date is the v1 submission date + - if there are multiple versions: + - scan the daily.log for all replacements of that e-print + - align from the most recent version, backward + - if there are any remaining versions between v1 and the lowest v from + the previous step, use the submission date for that v from the abs + file as the announced date. + - if we have explicit cross-list events, exclude those crosses from any + events that we generate here. + + """ + events: List[Event] = [] + abs_for_this_ident = sorted(abs.parse_versions(abs_path, identifier), + key=lambda a: a.identifier.version) + N_versions = len(abs_for_this_ident) + events_for_this_ident = sorted(daily.scan(daily_path, identifier, + cache_path=cache_path), + key=lambda d: d.event_date) + # These result in new versions. + replacements = [e for e in events_for_this_ident + if e.event_type == EventType.REPLACED] + # These do not. + crosslists = [e for e in events_for_this_ident + if e.event_type == EventType.CROSSLIST] + + # If there are more replacement events than we have abs beyond the + # first version, we're in trouble. + assert len(replacements) < len(abs_for_this_ident) + + # Work backward, since we do not know whether there were replacements + # prior to the start of the daily.log file. + repl_map = {} + for i, event in enumerate(replacements[::-1]): + repl_map[abs_for_this_ident[-(i + 1)].identifier.version] = event + + # Generate replacement events as needed, and remove cross-list + # categories for which we have explicit CROSSLIST events in daily. + for i, abs_datum in enumerate(abs_for_this_ident): + if abs_datum.identifier.version in repl_map: + event_date = _datetime_from_date( + repl_map[abs_datum.identifier.version].event_date, + identifier + ) + else: + # We don't know the announcement date, so we will fall back to + # the submission date for this abs version. 
def _load_today(daily_path: str,
                abs_path: str,
                ps_cache_path: str,
                first: Optional[MutableMapping[Identifier, date]] = None,
                cache_path: Optional[str] = None,
                **_: Any) -> Iterable[Event]:
    """
    Load the events recorded in daily.log for today's date (US/Eastern).

    The version for each event is taken from the latest abs file of the
    e-print, so no running version counter is needed here.

    Parameters
    ----------
    daily_path : str
        Absolute path to the daily.log file.
    abs_path : str
        Absolute path of the directory containing abs files and source
        packages. Specifically, this is the directory that contains the ``ftp``
        and ``orig`` subdirectories.
    ps_cache_path : str
        Path handed on to the event builder for content-format lookups.
    first : mapping
        A ``dict`` or other mutable mapping of :class:`Identifier`s onto the
        announcement date of the first version of the corresponding e-print.
    cache_path : str
        If provided, a writable directory where a cache of events can be
        maintained. This cuts down on spin-up time considerably.
    **_
        Any further keyword arguments are accepted and ignored.

    Returns
    -------
    iterator
        Yields one :class:`Event` per daily.log entry for today, in the order
        the entries are parsed.

    """
    # NOTE(review): ``_backfill`` calls loaders with ``current``, ``limit_to``
    # and ``cf_cache``; here they all land in ``**_`` and are not forwarded to
    # ``_make_event`` -- confirm that dropping ``cf_cache`` is intended.
    if first is None:
        first = {}

    # Work with absolute paths throughout.
    daily_path = os.path.abspath(daily_path)
    abs_path = os.path.abspath(abs_path)
    ps_cache_path = os.path.abspath(ps_cache_path)
    cache_path = None if cache_path is None else os.path.abspath(cache_path)

    today = datetime.now(ET).date()
    todays_entries = daily.parse(daily_path, for_date=today,
                                 cache_path=cache_path)
    for event_datum in todays_entries:
        latest_abs = abs.parse_latest(abs_path, event_datum.arxiv_id)
        event = _make_event(abs_path, ps_cache_path, latest_abs, event_datum,
                            latest_abs.identifier, first)
        yield event
def _make_id(event_datum: daily.EventData,
             current: MutableMapping[Identifier, int]) -> VersionedIdentifier:
    """
    Work out which version of an e-print a daily.log event refers to.

    Parameters
    ----------
    event_datum : :class:`daily.EventData`
        The parsed daily.log entry.
    current : mapping
        Maps each e-print identifier onto the last version number seen so far.

    Returns
    -------
    :class:`VersionedIdentifier`
        Version 1 for ``NEW`` events; the last seen version plus one for other
        event types that create a new version; otherwise the last seen
        version.

    Raises
    ------
    KeyError
        If the event is not ``NEW`` and ``current`` has no entry for the
        e-print.

    """
    arxiv_id = event_datum.arxiv_id
    event_type = event_datum.event_type

    if event_type == EventType.NEW:
        return VersionedIdentifier.from_parts(arxiv_id, 1)

    try:
        last_version = current[arxiv_id]
    except KeyError as e:
        # Same exception type as a bare lookup, but says what went wrong.
        raise KeyError(
            f'No previous version on record for {arxiv_id}; cannot infer'
            f' the version for event type {event_type}'
        ) from e

    if event_type.is_new_version:
        return VersionedIdentifier.from_parts(arxiv_id, last_version + 1)

    # The event applies to the existing version. Log through the module
    # logger rather than print(): stdout belongs to the calling program.
    logger.debug('Event type %s does not create a new version of %s',
                 event_type, arxiv_id)
    return VersionedIdentifier.from_parts(arxiv_id, last_version)
@click.group()
def cli() -> None:
    """Placeholder for the CLI command group provided by this module."""
    # Nothing to do here: the group only exists so that commands can be
    # attached to it with ``cli.add_command``.
# No ``--until`` option here: click passes every declared option to the
# callback as a keyword argument, and neither this callback nor
# ``_backfill.backfill_today`` accepts ``until``. Declaring it made every
# invocation fail with a TypeError.
# NOTE(review): only ``backfill`` is attached to the ``cli`` group via
# ``cli.add_command``; confirm whether this command should be attached too.
@click.command('backfill_today',
               short_help='Backfill today\'s events from the classic record.')
@click.argument('record_path')
@click.argument('classic_path', default='/data')
@click.argument('daily_path', default='/data/logs_archive/misc/daily.log')
@click.argument('ps_cache_path', default='/cache')
@click.option('--state-path', default=None, type=str)
@click.option('--remote', default='arxiv.org', type=str,
              help='Host to use when formats are missing from ps_cache')
def backfill_today(record_path: str,
                   classic_path: str,
                   daily_path: str,
                   ps_cache_path: str,
                   state_path: Optional[str] = None,
                   cache_path: Optional[str] = None,
                   remote: str = 'arxiv.org') -> None:
    """
    Backfill today\'s events from the classic record.

    This is a unique case, in that we are able to directly infer the version
    associated with each event based on the most recent abs file for each
    e-print.

    TODO: add support for ``s3://`` path for ``record_path``.

    \b
    Parameters
    ----------
    record_path : str
        Full path to the target canonical record.
    classic_path : str
        Path to data directory containing orig/, ftp/. Default: ``/data``.
    daily_path : str
        Full path to the daily.log file. Default:
        ``/data/logs_archive/misc/daily.log``.
    ps_cache_path : str
        Full path to the directory containing ps_cache/. Default: ``/cache``.
    state_path : str
        Path for the backfill state. Allows re-starting from the last
        successfully handled event. Default: ``.backfill/`` in the CWD.
    cache_path : str
        Path for the backfill cache. Used to cache expensive metadata about
        classic bitstreams. Default: ``.backfill/cache/`` in the CWD.
    remote : str
        Host to use when formats are missing from ps_cache.

    """
    if state_path is None:
        state_path = './.backfill'
    if cache_path is None:
        # There is no CLI option for this, so the default always applies when
        # the command is run through click.
        cache_path = './.backfill/cache'

    # The register writes to the canonical filesystem and may read from the
    # canonical record, the classic filesystem, or the remote host.
    storage = CanonicalFilesystem(record_path)
    classic = Filesystem(classic_path)
    remote_source = RemoteSource(remote)
    api = RegisterAPI(storage, [storage, classic, remote_source])

    # Backfilling is lazy: events are only written as this loop consumes them.
    for event in _backfill.backfill_today(api, daily_path, classic_path,
                                          ps_cache_path, state_path,
                                          cache_path=cache_path):
        click.echo(f'{event.event_date}'
                   f'\t{event.identifier}'
                   f'\t{event.event_type.value}')
from .. import domain as D
from ..services import RemoteSource

logger = logging.getLogger(__name__)
logger.setLevel(int(os.environ.get('LOGLEVEL', '40')))

ET = timezone('US/Eastern')
UTC = timezone('UTC')

# Lazily-created client used for HEAD requests; see ``_get_via_http``.
REMOTE: Optional['RemoteSourceWithHead'] = None


_CF_Cache = MutableMapping[Tuple[D.VersionedIdentifier, D.ContentType],
                           Optional[D.CanonicalFile]]


def _modified(path: str) -> datetime:
    """
    Get the modification time of ``path`` as an aware datetime in ET.

    The POSIX timestamp is interpreted explicitly as UTC before converting.
    Calling ``astimezone`` on the naive result of ``utcfromtimestamp`` would
    treat the value as system-local time and shift it on non-UTC hosts.
    """
    return datetime.fromtimestamp(os.path.getmtime(path), tz=UTC) \
        .astimezone(ET)


def _gz_variants(ext: str) -> Tuple[str, str]:
    """Return the ``(plain, gzipped)`` spellings of a file extension."""
    plain = ext[:-len('.gz')] if ext.endswith('.gz') else ext
    return plain, f'{plain}.gz'


def get_source_path(dpath: str, ident: D.VersionedIdentifier) -> str:
    """
    Find the path to the source file for a version in classic.

    Tries each extension from ``D.list_source_extensions()`` in order and
    returns the first path that ``_get_path`` resolves.

    Raises
    ------
    IOError
        If no candidate extension resolves to a path.

    """
    for ext in D.list_source_extensions():
        path = _get_path(_orig_source, _latest_source, dpath, ident, ext)
        if path is not None:
            return path
    raise IOError(f'No source path found for {ident}')


def get_source(data: str, ident: D.VersionedIdentifier) -> D.CanonicalFile:
    """
    Get the source file for a version from classic.

    Parameters
    ----------
    data : str
        Path to the classic data directory (passed to ``get_source_path``).
    ident : :class:`.VersionedIdentifier`

    Returns
    -------
    :class:`.CanonicalFile`
        Describes the file on disk; ``ref`` is the path that was found.

    Raises
    ------
    IOError
        If no source file can be located (from ``get_source_path``).

    """
    logger.debug(f'Getting source for {ident}')
    path = get_source_path(data, ident)
    try:
        content_type = D.ContentType.from_filename(path)
    except ValueError:
        # In classic, stand-alone tex files were not given extensions.
        content_type = D.ContentType.tex

    cf = D.CanonicalFile(
        modified=_modified(path),
        size_bytes=os.path.getsize(path),
        content_type=content_type,
        ref=D.URI(path),
        filename=content_type.make_filename(ident),
        is_gzipped=path.endswith('.gz')
    )
    logger.debug('Got source file for %s: %s', ident, cf.ref)
    return cf


def _find_local(dpath: str,
                ps_cache_path: str,
                ident: D.VersionedIdentifier,
                content_type: D.ContentType,
                source_type: Optional[D.SourceType]) -> Optional[str]:
    """Find an existing on-disk file for a format, or return ``None``."""
    # We want to try both gzipped and non-gzipped variants of the filename.
    extensions = _gz_variants(content_type.ext)

    # In some cases, the resource may have just been the original
    # source (e.g. pdf-only submissions).
    for ext in extensions:
        path = _get_path(_orig_source, _latest_source, dpath, ident, ext)
        if path and os.path.exists(path):
            logger.debug('Got source path for %s %s: %s',
                         ident, content_type.value, path)
            return path
        logger.debug('Tried source path for %s %s with ext %s',
                     ident, content_type.value, ext)

    # HTML is kind of a special case. The HTML source package is either a
    # .html.gz or .tar.gz file, and the ps_cache content is a directory
    # containing one or many files that get served up directly by the web
    # server. .html.gz will have gotten picked up above, so we just want
    # to make sure that if we have a multi-file HTML file we also check for
    # a .tar.gz in the source directory.
    if content_type == D.ContentType.html \
            and source_type is not None and source_type.has_html:
        path = (
            _get_path(_orig_source, _latest_source, dpath, ident, 'tar')
            or
            _get_path(_orig_source, _latest_source, dpath, ident, 'tar.gz')
        )
        if path and os.path.exists(path):
            return path

    # Otherwise look in the ps_cache.
    for ext in extensions:
        path = _cache(content_type, ps_cache_path, ident, ext)
        if path and os.path.exists(path):
            logger.debug('Got ps_cache path for %s %s: %s',
                         ident, content_type.value, path)
            return path
        logger.debug('Tried ps_cache for %s %s with ext %s',
                     ident, content_type.value, ext)
    return None


def get_formats(dpath: str,
                ps_cache_path: str,
                ident: D.VersionedIdentifier,
                source_type: Optional[D.SourceType],
                source: D.CanonicalFile,
                cf_cache: Optional[_CF_Cache] = None) \
        -> Iterable[D.CanonicalFile]:
    """
    Get the dissemination formats for a version.

    For each available format this looks for a file next to the source, then
    in the ps_cache, and finally falls back to a HEAD request against the
    main site. Formats for which nothing is found are skipped.

    Parameters
    ----------
    dpath : str
        Path to the classic data directory.
    ps_cache_path : str
        Path to the directory containing ``ps_cache/``.
    ident : :class:`.VersionedIdentifier`
    source_type : :class:`.SourceType` or None
        Used to determine available formats when the source filename does
        not, and to detect multi-file HTML submissions.
    source : :class:`.CanonicalFile`
        The source file for this version.
    cf_cache : mapping or None
        Optional memo keyed by ``(ident, content_type)``. Misses are stored
        as ``None`` so that they are not looked up again.

    Yields
    ------
    :class:`.CanonicalFile`

    Raises
    ------
    RuntimeError
        If a resolved file's name does not end with the extension of its
        content type.

    """
    available_formats: Optional[List[D.ContentType]] = None
    if source.filename is not None:
        available_formats = D.available_formats_by_ext(source.filename)
    if available_formats is None and source_type is not None:
        available_formats = source_type.available_formats

    if not available_formats:   # Nothing more can be done at this point.
        logger.debug('No available dissemination formats for: %s', ident)
        return

    for content_type in available_formats:
        cache_key = (ident, content_type)
        if cf_cache is not None and cache_key in cf_cache:
            cached = cf_cache[cache_key]
            # A cached miss is stored as None (see below); skip it instead
            # of asserting, which would crash on every repeated miss.
            if cached is not None:
                yield cached
            continue

        cf: Optional[D.CanonicalFile]   # What we hope to yield.
        path = _find_local(dpath, ps_cache_path, ident, content_type,
                           source_type)
        if path is not None:
            # We want the canonical filename to correspond to the content
            # type more precisely (this is not the case in classic).
            cf = D.CanonicalFile(
                modified=_modified(path),
                size_bytes=os.path.getsize(path),
                content_type=content_type,
                ref=D.URI(path),
                filename=content_type.make_filename(ident),
                is_gzipped=path.endswith('.gz')
            )
        else:   # Fall back to a HEAD request to the main site.
            cf = _get_via_http(ident, content_type)

        if cf is not None:
            # Sanity check.
            assert cf.filename is not None
            if not cf.filename.endswith(cf.content_type.ext):
                logger.error('Expected ext %s, but filename is %s',
                             cf.content_type.ext, cf.filename)
                raise RuntimeError('Expected ext %s, but filename is %s' %
                                   (cf.content_type.ext, cf.filename))

        if cf_cache is not None:    # A null result is still worth saving.
            cf_cache[cache_key] = cf

        if cf is not None:
            yield cf


def _get_via_http(ident: D.VersionedIdentifier,
                  content_type: D.ContentType,
                  remote: str = 'arxiv.org') -> Optional[D.CanonicalFile]:
    """
    Retrieve metadata about a format via a HEAD request to ``remote``.

    Returns ``None`` when :meth:`RemoteSourceWithHead.head` reports that the
    resource is unavailable; otherwise the returned file is given the
    canonical filename for ``content_type``.
    """
    logger.debug('Getting metadata for %s for %s via http',
                 content_type.value, ident)
    global REMOTE   # This is fine for now since this is single-threaded.
    if REMOTE is None:
        # NOTE(review): the client is created once, for the first ``remote``
        # seen; confirm whether RemoteSource restricts requests by host.
        REMOTE = RemoteSourceWithHead(remote)

    # The .dvi extension is not supported in the classic /dvi route.
    if content_type == D.ContentType.dvi:
        path = f'{content_type.value}/{ident}'
    else:
        path = f'{content_type.value}/{ident}.{content_type.ext}'

    # Build the URL from ``remote`` rather than a hard-coded host, so the
    # parameter is actually honoured (the default is unchanged).
    cf = REMOTE.head(D.URI(f'https://{remote}/{path}'), content_type)
    if cf is not None:
        cf.filename = content_type.make_filename(ident)
    return cf


class RemoteSourceWithHead(RemoteSource):
    """A :class:`.RemoteSource` that can describe a resource via HEAD."""

    def head(self, key: D.URI, content_type: D.ContentType) \
            -> Optional[D.CanonicalFile]:
        """
        Describe the resource at ``key`` using a HEAD request.

        Returns
        -------
        :class:`.CanonicalFile` or None
            ``None`` if the response carries no ``Last-Modified`` header.

        Raises
        ------
        IOError
            If the final response status is not 200.

        """
        response = self._session.head(key, allow_redirects=True)
        # arXiv may need to rebuild the product.
        while response.status_code == 200 and 'Refresh' in response.headers:
            time.sleep(int(response.headers['Refresh']))
            response = self._session.head(key, allow_redirects=True)
        if response.status_code != 200:
            logger.error('%i: %s', response.status_code, response.headers)
            raise IOError(f'Could not retrieve {key}: {response.status_code}')

        # At this point, we are most likely encountering the "unavailable"
        # page, which (intriguingly) returns 200 instead of 404.
        if 'Last-Modified' not in response.headers:
            return None

        # strptime yields a naive datetime even with %Z; HTTP dates are GMT,
        # so attach UTC explicitly before converting.
        mtime = datetime.strptime(response.headers['Last-Modified'],
                                  '%a, %d %b %Y %H:%M:%S %Z') \
            .replace(tzinfo=UTC).astimezone(ET)

        # Oddly, arxiv.org may return compressed content (i.e. not just for
        # transport). We've been around for a while!
        is_gzipped = bool(response.headers.get('Content-Encoding') == 'x-gzip')
        return D.CanonicalFile(
            modified=mtime,
            size_bytes=int(response.headers['Content-Length']),
            content_type=content_type,
            ref=D.URI(response.url),    # There may have been redirects.
            is_gzipped=is_gzipped
        )


def _latest(data: str, ident: D.Identifier, filename: str) -> str:
    """Build the path to ``filename`` under the ``ftp`` tree."""
    cat = ident.category_part if ident.is_old_style else 'arxiv'
    return os.path.join(data, 'ftp', cat, 'papers', ident.yymm, filename)


def _orig(data: str, ident: D.VersionedIdentifier, filename: str) -> str:
    """Build the path to ``filename`` under the ``orig`` tree."""
    cat = ident.category_part if ident.is_old_style else 'arxiv'
    return os.path.join(data, 'orig', cat, 'papers', ident.yymm, filename)


def _cache(content_type: D.ContentType, ps_cache_path: str,
           ident: D.VersionedIdentifier, ext: str) -> str:
    """Build the path to a format for ``ident`` under ``ps_cache``."""
    if ident.is_old_style:
        filename = f'{ident.numeric_part}v{ident.version}.{ext}'
    else:
        filename = f'{ident}.{ext}'
    cat = ident.category_part if ident.is_old_style else 'arxiv'
    return os.path.join(ps_cache_path, 'ps_cache', cat, content_type.value,
                        ident.yymm, filename)


def _latest_source(path: str, ident: D.Identifier, ext: str) -> str:
    """Build the unversioned source path for ``ident`` in the ftp tree."""
    if ident.is_old_style:
        fname = f'{ident.numeric_part}.{ext.lstrip(".")}'
    else:
        fname = f'{ident}.{ext.lstrip(".")}'
    return _latest(path, ident, fname)


def _orig_source(path: str, ident: D.VersionedIdentifier, ext: str) -> str:
    """Build the versioned source path for ``ident`` in the orig tree."""
    if ident.is_old_style:
        fname = f'{ident.numeric_part}v{ident.version}.{ext.lstrip(".")}'
    else:
        fname = f'{ident}.{ext.lstrip(".")}'
    return _orig(path, ident, fname)


def _get_path(get_orig: Callable[[str, D.VersionedIdentifier, str], str],
              get_latest: Callable[[str, D.Identifier, str], str],
              dpath: str,
              ident: D.VersionedIdentifier,
              ext: str) -> Optional[str]:
    """
    Generic logic for finding the path to a resource.

    Resources for the latest version are stored separately from resources
    for prior versions. But resources for the latest version are not named
    with their version number affix.

    A second challenge is that in some cases we do not know ahead of time what
    file format (and hence filename) we are looking for.

    So it takes a bit of a dance to figure out whether a corresponding
    resource exists, and where it is located.

    Returns
    -------
    str or None
        A path that exists on disk, or ``None`` if none was found.

    """
    # For versions prior to the latest, resources are named with their
    # version affix.
    orig = get_orig(dpath, ident, ext)
    logger.debug(orig)
    if os.path.exists(orig):
        logger.debug(f'found orig path: {orig}')
        return orig

    # Every remaining outcome requires the "latest" file to exist.
    latest = get_latest(dpath, ident.arxiv_id, ext)
    if not os.path.exists(latest):
        return None

    # If this is the first version, the only other place it could be is
    # in the "latest" section. Returning here also avoids constructing a
    # meaningless "v0" identifier below.
    if ident.version == 1:
        logger.debug(f'can only be in latest: {latest}')
        return latest

    # If the prior version exists in the "original" section, then the latest
    # version must be the one that we are working with.
    prior = D.VersionedIdentifier.from_parts(ident.arxiv_id, ident.version - 1)
    # Have to check for the abs file, since we don't know what format the
    # previous version was in.
    if os.path.exists(get_orig(dpath, prior, 'abs')):
        logger.debug(f'prior version in orig; must be latest: {latest}')
        return latest

    return None
-From the original arXiv::Updates::DailyLog: - -``` -Module to provide information about updates to the archive -over specified periods. This should be the only section -of code that reads the daily.log file. - - Simeon Warner - 6Jan2000... - 25Jan2000 - modified so that undef $startdate or $enddate select - the beginning or end of time respectively. - 25Jan2000 - modified so that by simply removing the `-' from - and ISO8601 date we get YYYYMMDD from YYYY-MM-DD - 16Oct2000 - to allow easy resumption in the OAI1 interface and - because it seems that it might be useful in other contexts the - number limited behaviour has been changed. query_daily_log() and - hence all other routines now stop at then end of a day and - returns the that day (in the form YYYY-MM-DD) as the value - if limited, undef otherwise. - -Thoughts: If this is to be used on the mirror sites then we will need -to mirror the daily log. This probably means that that file -should be split up. - - [CVS: $Id: DailyLog.pm,v 1.6 2010/03/23 03:53:09 arxiv Exp $] -``` - The main goal of this implementation is parsing the log file for the purpose of transforming it into the arXiv Canonical format. Specifically, we want to use this legacy data structure to generate :class:`.Event` data that can be serialized in the daily listing files. -""" +From the original ``arXiv::Updates::DailyLog``: -from typing import Tuple, List, Mapping, Iterable -from collections import defaultdict -from datetime import date -import string +.. code-block:: plain + + Module to provide information about updates to the archive + over specified periods. This should be the only section + of code that reads the daily.log file. + + Simeon Warner - 6Jan2000... + 25Jan2000 - modified so that undef $startdate or $enddate select + the beginning or end of time respectively. 
+ 25Jan2000 - modified so that by simply removing the `-' from + and ISO8601 date we get YYYYMMDD from YYYY-MM-DD + 16Oct2000 - to allow easy resumption in the OAI1 interface and + because it seems that it might be useful in other contexts the + number limited behaviour has been changed. query_daily_log() and + hence all other routines now stop at then end of a day and + returns the that day (in the form YYYY-MM-DD) as the value + if limited, undef otherwise. + + Thoughts: If this is to be used on the mirror sites then we will need + to mirror the daily log. This probably means that that file + should be split up. + + [CVS: $Id: DailyLog.pm,v 1.6 2010/03/23 03:53:09 arxiv Exp $] +``` + +""" +import logging +import os import re +import string +import tempfile +import warnings +from operator import attrgetter +from typing import Any, Dict, Tuple, List, Mapping, MutableMapping, Iterable, \ + NamedTuple, Optional, Type +from collections import defaultdict +from datetime import date, datetime from itertools import chain, groupby -import warnings +from arxiv.base.logging import getLogger + +from ..domain import Event, Identifier, InvalidIdentifier, \ + VersionedIdentifier, EventType +from .util import PersistentMultifileIndex + -from ...domain import Event +def _showwarning(message: str, + *args: Any, + category: Type[Exception] = UserWarning, + filename: str = '', + lineno: int = -1, + **kwargs: Any) -> None: + print(message) -Entry = Tuple[str, str] -"""An ``arxiv_id, category`` tuple.""" +warnings.showwarning = _showwarning + +logger = logging.getLogger(__name__) +logger.setLevel(int(os.environ.get('LOGLEVEL', '40'))) + +Entry = Tuple[Identifier, EventType, str] +MergedEntry = Tuple[Identifier, EventType, List[str]] LINE = re.compile(r'^(?P\d{6})\|(?P[a-z-]+)' r'\|(?P.*)$') @@ -99,10 +121,20 @@ """ +class EventData(NamedTuple): + """Data about events that can be extracted from the daily log.""" + + arxiv_id: Identifier + event_date: date + event_type: EventType + 
version: int + categories: List[str] + + class DailyLogParser: """Parses the daily log file.""" - def __init__(self): + def __init__(self) -> None: """Initialize both styles of parsers.""" self.newstyle_parser = NewStyleLineParser() self.oldstyle_parser = OldStyleLineParser() @@ -118,7 +150,14 @@ def _parse_date(self, event_date_raw: str) -> date: event_date = date(year=year, month=month, day=day) return event_date - def parse(self, path: str) -> Iterable[Event]: + def _parse_date_only(self, line: str) -> Optional[date]: + match = LINE.match(line) + if match is None: + return None + return self._parse_date(match.group('event_date')) + + def parse(self, path: str, for_date: Optional[date] = None) \ + -> Iterable[EventData]: """ Parse the daily log file. @@ -130,13 +169,58 @@ def parse(self, path: str) -> Iterable[Event]: Returns ------- iterable - Each item is an :class:`.Event` from the log file. + Each item is an :class:`.EventData` from the log file. """ - return chain.from_iterable((self.parse_line(line) - for line in open(path, 'r', -1))) + return self._merge(chain.from_iterable( + (self.parse_line(line) for line in open(path, 'r', -1) + if for_date is None + or for_date == self._parse_date_only(line)) + )) - def parse_line(self, raw: str) -> Iterable[Event]: + def _merge(self, entries: Iterable[EventData]) -> Iterable[EventData]: + """ + + It is possible for a singular event to be represented in multiple + archive sections. For example ``math-ph/0702031`` was replaced + on 2007-02-13; on that day, it is listed in both ``math-ph`` and + ``math`` archive sections of the record. + + This function takes a series of entries from a given day that may + contain multiple entries per event, and returns a series of entries + that correspond directly to unique announcement events. 
+ """ + _event_date = attrgetter('event_date') + _identifier = attrgetter('arxiv_id') + + def _event_type(datum: EventData) -> Tuple[int, EventType]: + order = {EventType.NEW: 0, + EventType.REPLACED: 1, + EventType.CROSSLIST: 2, + EventType.UPDATED_METADATA: 4} + return order[datum.event_type], datum.event_type + + + # We assume that the entries are sorted by date already. + for event_date, day_entries in groupby(entries, key=_event_date): + # These will be coming in one archive at a time, so we need to + # sort and group by identifier and event type to merge + # appropriately. + grouped_by_id = groupby(sorted(day_entries, key=_identifier), + key=_identifier) + for identifier, i_entries in grouped_by_id: + grouped_by_etype = groupby(sorted(i_entries, key=_event_type), + key=_event_type) + for (_, event_type), e_entries in grouped_by_etype: + yield EventData( + arxiv_id=identifier, + event_date=event_date, + event_type=EventType(event_type), + version=1 if event_type == EventType.NEW else -1, + categories=[c for e in e_entries for c in e.categories] + ) + + def parse_line(self, raw: str) -> Iterable[EventData]: """ Parse a single line from the daily log file. @@ -148,10 +232,12 @@ def parse_line(self, raw: str) -> Iterable[Event]: Returns ------- iterable - Yields :class:`.Event` instances from the line. + Yields :class:`.EventData` instances from the line. 
""" match = LINE.match(raw) + if match is None: + raise ValueError(f'Line is malformed: {raw}') archive = match.group('archive') data = match.group('data') event_date = self._parse_date(match.group('event_date')) @@ -164,32 +250,53 @@ def parse_line(self, raw: str) -> Iterable[Event]: class LineParser: """Shared behavior among newstyle and oldstyle line parsing.""" - def _to_events(self, e_date: date, e_type: Event.Type, - entries: Iterable[Entry], - version: int = -1) -> Iterable[Event]: - for paper_id, entries in groupby(entries, key=lambda o: o[0]): - yield Event(arxiv_id=paper_id, event_date=e_date, - event_type=e_type, version=version, - categories=[category for _, category in entries]) + def _merge(self, entries: Iterable[Entry]) -> Iterable[MergedEntry]: + """ + Merge entries within an archive for a particular day. - def parse(self, e_date: date, archive: str, data: str) -> Iterable[Event]: + There is one entry per category, so multiple entries may belong to the + same announcement event. 
+ """ + def _identifier(entry: Entry) -> Identifier: + return entry[0] + + def _event_type(entry: Entry) -> str: + return str(entry[1].value) + + for ident, ent \ + in groupby(sorted(entries, key=_identifier), key=_identifier): + for event_type, ev_ent \ + in groupby(sorted(ent, key=_event_type), key=_event_type): + yield ident, EventType(event_type), [c for _, _, c in ev_ent] + + def _to_events(self, + e_date: date, + entries: Iterable[MergedEntry], + version: int = -1) -> Iterable[EventData]: + event_date = date(e_date.year, e_date.month, e_date.day) + for paper_id, event_type, categories in entries: + yield EventData(paper_id, + event_date, + event_type, + version, + categories) + + def parse(self, e_date: date, archive: str, data: str) \ + -> Iterable[EventData]: """Parse data from a daily log file line.""" new, cross, replace = data.split('|') - return chain(self._to_events(e_date, Event.Type.NEW, - self.parse_new(archive, new), 1), - self._to_events(e_date, Event.Type.CROSSLIST, - self.parse_cross(archive, cross)), - self._to_events(e_date, Event.Type.REPLACED, - self.parse_replace(archive, replace))) - + return chain(self._to_events(e_date, self._merge(self.parse_new(archive, new)), 1), + self._to_events(e_date, self._merge(self.parse_cross(archive, cross))), + self._to_events(e_date, self._merge(self.parse_replace(archive, replace)))) + def parse_new(self, archive: str, fragment: str) -> Iterable[Entry]: """Parse entries for new e-prints.""" raise NotImplementedError('Not implemented in this base class') - + def parse_cross(self, archive: str, fragment: str) -> Iterable[Entry]: """Parse entries for cross-list e-prints.""" raise NotImplementedError('Not implemented in this base class') - + def parse_replace(self, archive: str, fragment: str) -> Iterable[Entry]: """Parse entries for replacements.""" raise NotImplementedError('Not implemented in this base class') @@ -228,12 +335,20 @@ def parse_new(self, archive: str, fragment: str) -> Iterable[Entry]: if 
match_range: start_id = int(match_range.group('start_id')) end_id = int(match_range.group('end_id')) - for identifier in range(start_id, end_id + 1): # Inclusive. - identifier = str(identifier).zfill(7) + for _identifier in range(start_id, end_id + 1): # Inclusive. + identifier = str(_identifier).zfill(7) paper_id = f'{archive}/{identifier}' - yield paper_id, archive + try: + yield Identifier(paper_id), EventType.NEW, archive + except InvalidIdentifier as e: + warnings.warn(f'Skipping: {e}') + continue elif SINGLE_IDENTIFIER.match(fragment): - yield f'{archive}/{fragment}', archive + paper_id = f'{archive}/{fragment}' + try: + yield Identifier(paper_id), EventType.NEW, archive + except InvalidIdentifier as e: + warnings.warn(f'Skipping: {e}') elif re.match(r'\S', fragment) is None: # Blank is OK pass else: @@ -266,7 +381,11 @@ def parse_cross(self, archive: str, fragment: str) -> Iterable[Entry]: category = match.group('category') if category: crossed_to += category - yield paper_id, crossed_to + try: + yield Identifier(paper_id), EventType.CROSSLIST, crossed_to + except InvalidIdentifier as e: + warnings.warn(f'Skipping: {e}') + continue else: warnings.warn(f'Failed parsing cross (old style): {paper_id}') @@ -288,10 +407,12 @@ def parse_replace(self, archive: str, fragment: str) -> Iterable[Entry]: """ for paper_id in fragment.strip().split(): + abs_only = False if paper_id.endswith('.abs'): abs_only = True paper_id = paper_id[:-4] + match_threepart = THREEPART_REPLACEMENT.match(paper_id) match_fourpart = FOURPART_REPLACEMENT.match(paper_id) match_weird = WEIRD_INVERTED_ENTRY.match(paper_id) @@ -315,7 +436,14 @@ def parse_replace(self, archive: str, fragment: str) -> Iterable[Entry]: else: warnings.warn(f'Failed parsing repl (old style): {paper_id}') continue - yield paper_id, crossed_to + if abs_only: + event_type = EventType.UPDATED_METADATA + else: + event_type = EventType.REPLACED + try: + yield Identifier(paper_id), event_type, crossed_to + except 
InvalidIdentifier as e: + warnings.warn(f'Skipping: {e}') class NewStyleLineParser(LineParser): @@ -354,7 +482,10 @@ def parse_new(self, archive: str, fragment: str) -> Iterable[Entry]: warnings.warn(f'Failed parsing new (new style): {paper_id}') continue for category in categories: - yield paper_id, category + try: + yield Identifier(paper_id), EventType.NEW, category + except InvalidIdentifier as e: + warnings.warn(f'Skipping: {e}') def parse_cross(self, archive: str, fragment: str) -> Iterable[Entry]: """ @@ -382,7 +513,10 @@ def parse_cross(self, archive: str, fragment: str) -> Iterable[Entry]: warnings.warn(f'Failed parsing cross (new style): {paper_id}') continue for crossed_to in categories: - yield paper_id, crossed_to + try: + yield Identifier(paper_id), EventType.CROSSLIST, crossed_to + except InvalidIdentifier as e: + warnings.warn(f'Skipping: {e}') def parse_replace(self, archive: str, fragment: str) -> Iterable[Entry]: """ @@ -409,8 +543,15 @@ def parse_replace(self, archive: str, fragment: str) -> Iterable[Entry]: except AssertionError: warnings.warn(f'Failed parsing repl (new style): {paper_id}') continue + if abs_only: + event_type = EventType.UPDATED_METADATA + else: + event_type = EventType.REPLACED for category in categories: - yield paper_id, category + try: + yield Identifier(paper_id), event_type, category + except InvalidIdentifier as e: + warnings.warn(f'Skipping: {e}') def _parse_entry(self, entry: str) -> Tuple[str, bool, List[str]]: """ @@ -426,10 +567,74 @@ def _parse_entry(self, entry: str) -> Tuple[str, bool, List[str]]: abs_only = True entry = entry[:-4] paper_id, categories = entry.split(':', 1) - categories = categories.split(':') + categories_list = categories.split(':') # unsquash old identifier, if squashed squashed = SQUASHED_IDENTIFIER.match(paper_id) if squashed: paper_id = '/'.join(squashed.groups()) assert IDENTIFIER.match(paper_id) is not None - return paper_id, abs_only, categories + return paper_id, abs_only, 
# Process-wide index of parsed events, keyed by ISO date; populated on the
# first call to ``parse``.
EVENT_DATA: Optional[Mapping[str, Iterable[EventData]]] = None


def parse(path: str, for_date: Optional[date] = None,
          cache_path: Optional[str] = None) -> Iterable[EventData]:
    """
    Parse the daily log file.

    The first call loads the persistent index from ``cache_path``. If that
    index is empty, the whole log is parsed once, indexed by event date, and
    saved. Later calls are served from the in-memory index.

    Parameters
    ----------
    path : str
        Path to the daily log file.
    for_date : date or None
        If given, only events for that date are yielded. A date with no
        events in the index yields nothing.
    cache_path : str or None
        Directory from which the index is loaded. Only consulted on the
        first call; if ``None``, a fresh temporary directory is used.

    Returns
    -------
    iterable
        Each item is an :class:`.EventData` from the log file.

    """
    global EVENT_DATA
    if EVENT_DATA is None:
        # Only create a temporary directory when one is actually needed,
        # rather than on every call.
        if cache_path is None:
            cache_path = tempfile.mkdtemp()
        EVENT_DATA = PersistentMultifileIndex()
        EVENT_DATA.load(cache_path)

    if EVENT_DATA:
        logger.debug('Load events from cache')
    else:
        logger.debug('Parse events for the first time')
        year: Optional[int] = None
        total = 0           # Events indexed so far.
        year_start = 0      # Value of ``total`` when ``year`` began.
        for event in DailyLogParser().parse(path):
            if event.event_date.year != year:
                if year is not None:
                    logger.info('Parsed %i events in %i',
                                total - year_start, year)
                year = event.event_date.year
                year_start = total

            cache_key = event.event_date.isoformat()
            if cache_key not in EVENT_DATA:
                EVENT_DATA[cache_key] = []
            EVENT_DATA[cache_key].append(event)
            total += 1
        if year is not None:
            logger.info('Parsed %i events in %i', total - year_start, year)
        logger.debug('Parsed %i events', total)
        EVENT_DATA.save()

    # Serve from the in-memory index directly. Re-entering ``parse`` here
    # would recurse forever when the log yields no events at all.
    if for_date is not None:
        try:
            events = EVENT_DATA[for_date.isoformat()]
        except KeyError:    # Nothing was announced on that date.
            return
        for e in events:
            yield e
        return

    for events in EVENT_DATA.values():
        for e in events:
            yield e


def scan(path: str, identifier: Identifier, cache_path: Optional[str] = None) \
        -> Iterable[EventData]:
    """
    Get all events in the daily log for a particular e-print.

    Parameters
    ----------
    path : str
        Path to the daily log file.
    identifier : :class:`.Identifier`
        Events whose ``arxiv_id`` equals this identifier are yielded.
    cache_path : str or None
        Passed through to :func:`parse`.

    Returns
    -------
    iterable
        Each item is an :class:`.EventData` for ``identifier``.

    """
    return (ed for ed in parse(path, cache_path=cache_path)
            if ed.arxiv_id == identifier)
b/arxiv/canonical/classic/tests/data/ftp/adap-org/papers/9509/9509003.abs new file mode 100644 index 0000000..bc9e6fd --- /dev/null +++ b/arxiv/canonical/classic/tests/data/ftp/adap-org/papers/9509/9509003.abs @@ -0,0 +1,28 @@ +------------------------------------------------------------------------------ +\\ +arXiv:adap-org/9509003 +From: Yu Shi +Date: Thu, 5 Oct 1995 17:35:17 GMT (0kb,I) +Date (revised v2): Fri, 6 Oct 1995 17:59:42 GMT (0kb,I) +Date (revised v3): Mon, 20 Nov 1995 09:59:40 GMT (10kb) + +Title: Self-organization, ergodicity breaking, phase transition and + synchronization in two-dimensional traffic-flow model +Authors: Yu Shi (Dept. of Phys., fudan University, China) +Categories: adap-org nlin.AO +Comments: 16 pages. LaTeX, some minor errors corrected +\\ + Analytical investigation is made on the two-dimensional traffic-flow model +with alternative movement and exclude-volume effect between right and up arrows +[Phys. Rev. {\bf A} 46 R6124 (1992)]. Several exact results are obtained, +including the upper critical density above which there are only jamming +configurations, and the lower critical density below which there are only +moving configurations. The observed jamming transition takes place at another +critical density $p_{c}(N)$, which is in the intermidiate region between the +lower and upper critical densities. It is derived that +$p_{c}(N)\,=\,CN^{\alpha}$, where $C$ and $\alpha$ are determined to be +respectively $0.76$ and $-0.14$ from previous numerical simulation. This +transition is suggested to be a second-order phase transition, the order +parameter is found. The nature of self-organization, ergodicity breaking and +synchronization are discussed, Comparison with the sandpile model is made. 
+\\ diff --git a/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/0704/0704.0001.abs b/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/0704/0704.0001.abs new file mode 100644 index 0000000..13fceaf --- /dev/null +++ b/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/0704/0704.0001.abs @@ -0,0 +1,31 @@ +------------------------------------------------------------------------------ +\\ +arXiv:0704.0001 +From: Pavel Nadolsky +Date: Mon, 2 Apr 2007 19:18:42 GMT (443kb) +Date (revised v2): Tue, 24 Jul 2007 20:10:27 GMT (370kb) + +Title: Calculation of prompt diphoton production cross sections at Tevatron and + LHC energies +Authors: C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan +Categories: hep-ph +Comments: 37 pages, 15 figures; published version +Report-no: ANL-HEP-PR-07-12 +Journal-ref: Phys.Rev.D76:013009,2007 +DOI: 10.1103/PhysRevD.76.013009 +\\ + A fully differential calculation in perturbative quantum chromodynamics is +presented for the production of massive photon pairs at hadron colliders. All +next-to-leading order perturbative contributions from quark-antiquark, +gluon-(anti)quark, and gluon-gluon subprocesses are included, as well as +all-orders resummation of initial-state gluon radiation valid at +next-to-next-to-leading logarithmic accuracy. The region of phase space is +specified in which the calculation is most reliable. Good agreement is +demonstrated with data from the Fermilab Tevatron, and predictions are made for +more detailed tests with CDF and DO data. Predictions are shown for +distributions of diphoton pairs produced at the energy of the Large Hadron +Collider (LHC). Distributions of the diphoton pairs from the decay of a Higgs +boson are contrasted with those produced from QCD processes at the LHC, showing +that enhanced sensitivity to the signal can be obtained with judicious +selection of events. 
+\\ diff --git a/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/0801/0801.1021.abs b/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/0801/0801.1021.abs new file mode 100644 index 0000000..4e00751 --- /dev/null +++ b/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/0801/0801.1021.abs @@ -0,0 +1,25 @@ +------------------------------------------------------------------------------ +\\ +arXiv:0801.1021 +From: Brihaye Yves +Date: Mon, 7 Jan 2008 16:12:30 GMT (29kb) +Date (revised v2): Mon, 28 Jan 2008 14:59:28 GMT (30kb) +Date (revised v3): Mon, 3 Mar 2008 16:29:52 GMT (30kb) +Date (revised v4): Tue, 4 Mar 2008 14:48:10 GMT (30kb) + +Title: Five-dimensional rotating black holes in Einstein-Gauss-Bonnet theory +Authors: Y. Brihaye, E. Radu +Categories: hep-th gr-qc +Comments: 10 pages, 6 figures, references added, minor modifications, typos + corrected +Journal-ref: Phys.Lett.B661:167-174,2008 +DOI: 10.1016/j.physletb.2008.02.005 +\\ + We present arguments for the existence of five-dimensional rotating black +holes with equal magnitude angular momenta in Einstein-Gauss-Bonnet theory with +negative cosmological constant. These solutions posses a regular horizon of +spherical topology and approach asymptotically an Anti-de Sitter spacetime +background. We discuss the general properties of these solutions and, using an +adapted counterterm prescription, we compute their entropy and conserved +charges. 
+\\ diff --git a/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/0802/0802.0193.abs b/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/0802/0802.0193.abs new file mode 100644 index 0000000..d1ca11f --- /dev/null +++ b/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/0802/0802.0193.abs @@ -0,0 +1,26 @@ +------------------------------------------------------------------------------ +\\ +arXiv:0802.0193 +From: Eugene Heifets +Date: Fri, 1 Feb 2008 20:21:07 GMT (670kb) +Date (revised v2): Mon, 4 Feb 2008 20:05:24 GMT (670kb) + +Title: First principles modeling of oxygen adsorption on LaMnO3 (001) surface +Authors: Eugene A. Kotomin, Yuri A. Mastrikov, Eugene Heifets and Joachim Maier +Categories: cond-mat.mtrl-sci +Comments: 5 pages, 2 figures, 3 tables, 24 references, corrected misprint in + author's names +\\ + We present and discuss the results of ab initio DFT plane-wave supercell +calculations of the atomic and molecular oxygen adsorption and diffusion on the +LaMnO3 (001) surface which serves as a model material for a cathode of solid +oxide fuel cells. The dissociative adsorption of O2 molecules from the gas +phase is energetically favorable on surface Mn ions even on a defect-free +surface. The surface migration energy for adsorbed O ions is found to be quite +high, 1.6 eV. We predict that the adsorbed O atoms could penetrate into +electrode first plane when much more mobile surface oxygen vacancies (migration +energy of 0.69 eV) approach the O ions strongly bound to the surface Mn ions. +Ab initio thermodynamics predicts that at typical SOFC operation temperatures +(~1200 K) the MnO2 (001) surface with adsorbed O atoms is the most stable in a +very wide range of oxygen gas pressures (above 10^2 atm). 
+\\ diff --git a/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/0808/0808.4142.abs b/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/0808/0808.4142.abs new file mode 100644 index 0000000..11b3cd7 --- /dev/null +++ b/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/0808/0808.4142.abs @@ -0,0 +1,25 @@ +------------------------------------------------------------------------------ +\\ +arXiv:0808.4142 +From: Eduard De La Cruz Burelo +Date: Fri, 29 Aug 2008 18:06:12 GMT (22kb) +Date (revised v2): Mon, 8 Dec 2008 16:07:38 GMT (22kb) + +Title: Observation of the doubly strange b baryon Omega_b- +Authors: D0 Collaboration: V. Abazov, et al +Categories: hep-ex +Comments: v2: As published in Phys. Rev. Letters +Report-no: Fermilab-Pub-08/335-E +Journal-ref: Phys.Rev.Lett.101:232002,2008 +DOI: 10.1103/PhysRevLett.101.232002 +License: http://arxiv.org/licenses/nonexclusive-distrib/1.0/ +\\ + We report the observation of the doubly strange b baryon Omega_b- in the +decay channel Omega_b- to J/psi Omega-, with J/psi to mu+ mu- and Omega- to +Lambda K-, in $p\bar{p}$ collisions at $\sqrt{s}=1.96$ TeV. Using approximately +1.3 fb$^{-1}$ of data collected with the D0 detector at the Fermilab Tevatron +Collider, we observe 17.8 +/- 4.9 (stat) +/- 0.8 (syst) Omega_b- signal events +at a mass of 6.165 +/- 0.010 (stat) +/- 0.013(syst.) GeV. The significance of +the observed signal is 5.4 sigma, corresponding to a probability of 6.7 x +10^{-8} of it arising from a background fluctuation. 
+\\ diff --git a/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/0905/0905.2326.abs b/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/0905/0905.2326.abs new file mode 100644 index 0000000..9d1c156 --- /dev/null +++ b/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/0905/0905.2326.abs @@ -0,0 +1,26 @@ +\\ +arXiv:0905.2326 +From: Zvi Bern +Date: Thu, 14 May 2009 17:04:44 GMT (2546kb,A) +Date (revised v2): Sun, 8 Jan 2012 13:27:40 GMT (2609kb,A) + +Title: The Ultraviolet Behavior of N=8 Supergravity at Four Loops +Authors: Z. Bern, J. J. Carrasco, L. J. Dixon, H. Johansson, R. Roiban +Categories: hep-th +Comments: 5 pages, 4 figures. v2 contains minor corrections, including flipping + sign of eq. (1). Complete results, including mathematica readable form, + presented in the directory aux/ included in the source of this manuscript. As + certain computer operating systems (e.g. Windows) preclude the naming of + directories "aux" we also host this data at: + http://www.physics.ucla.edu/~jjmc/auxiliaryData.tgz +Report-no: SLAC-PUB-13608, UCLA/09/TEP/09/47 +Journal-ref: Phys.Rev.Lett.103:081301,2009 +DOI: 10.1103/PhysRevLett.103.081301 +License: http://arxiv.org/licenses/nonexclusive-distrib/1.0/ +\\ + We describe the construction of the complete four-loop four-particle +amplitude of N=8 supergravity. The amplitude is ultraviolet finite, not only in +four dimensions, but in five dimensions as well. The observed extra +cancellations provide additional non-trivial evidence that N=8 supergravity in +four dimensions may be ultraviolet finite to all orders of perturbation theory. 
+\\ diff --git a/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/0906/0906.2112.abs b/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/0906/0906.2112.abs new file mode 100644 index 0000000..47ec345 --- /dev/null +++ b/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/0906/0906.2112.abs @@ -0,0 +1,23 @@ +\\ +arXiv:0906.2112 +From: Robin de Jong +Date: Thu, 11 Jun 2009 14:09:14 GMT (20kb) +Date (revised v2): Mon, 9 Aug 2010 13:12:58 GMT (21kb) +Date (revised v3): Wed, 28 Mar 2012 08:04:12 GMT (21kb) + +Title: Symmetric roots and admissible pairing +Authors: Robin de Jong +Categories: math.AG math.NT +Comments: 21 pages +MSC-class: 14G40, 11G20 +Journal-ref: Transactions of the AMS 363 (2011), 4263--4283 +License: http://arxiv.org/licenses/nonexclusive-distrib/1.0/ +\\ + Using the discriminant modular form and the Noether formula it is possible to +write the admissible self-intersection of the relative dualising sheaf of a +semistable hyperelliptic curve over a number field or function field as a sum, +over all places, of a certain adelic invariant. We provide a simple geometric +interpretation for this invariant, based on the arithmetic of symmetric roots. +We propose the conjecture that the invariant introduced in this paper coincides +with an invariant introduced in a recent paper by S.-W. Zhang. 
+\\ diff --git a/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/0906/0906.3336.abs b/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/0906/0906.3336.abs new file mode 100644 index 0000000..c3e116f --- /dev/null +++ b/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/0906/0906.3336.abs @@ -0,0 +1,16 @@ +------------------------------------------------------------------------------ +\\ +arXiv:0906.3336 +From: Huang Guangyue +Date: Thu, 18 Jun 2009 02:19:29 GMT (8kb) +Date (revised v2): Fri, 19 Jun 2009 14:29:19 GMT (0kb,I) + +Title: Lower order eigenvalues of the biharmonic operator +Authors: Guangyue Huang, Bingqing Ma +Categories: math.DG +Comments: This paper has been withdrawn +MSC-class: 35P15; 53C20 +License: http://creativecommons.org/licenses/by/3.0/ +\\ + This paper has been withdrawn since the results are not satisfied. +\\ diff --git a/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/0906/0906.3421.abs b/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/0906/0906.3421.abs new file mode 100644 index 0000000..a5f426c --- /dev/null +++ b/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/0906/0906.3421.abs @@ -0,0 +1,27 @@ +------------------------------------------------------------------------------ +\\ +arXiv:0906.3421 +From: Philippe Di Francesco +Date: Thu, 18 Jun 2009 12:03:57 GMT (67kb) +Date (revised v2): Mon, 14 Sep 2009 09:39:31 GMT (67kb) +Date (revised v3): Tue, 2 Feb 2010 06:38:00 GMT (383kb,D) + +Title: Q-system Cluster Algebras, Paths and Total Positivity +Authors: Philippe Di Francesco and Rinat Kedem +Categories: math.CO cond-mat.stat-mech math-ph math.MP +Comments: 36 pages, 16 Postscript figures, typos corrected and one reference + added +Report-no: t09-083 +Journal-ref: SIGMA 6 (2010), 014, 36 pages +DOI: 10.3842/SIGMA.2010.014 +License: http://creativecommons.org/licenses/by-nc-sa/3.0/ +\\ + In the first part of this paper, we provide a concise review of our method of +solution of the $A_r$ Q-systems in terms of the 
partition function of paths on +a weighted graph. In the second part, we show that it is possible to modify the +graphs and transfer matrices so as to provide an explicit connection to the +theory of planar networks introduced in the context of totally positive +matrices by Fomin and Zelevinsky. As an illustration of the further generality +of our method, we apply it to give a simple solution for the rank 2 affine +cluster algebras studied by Caldero and Zelevinsky. +\\ diff --git a/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/0906/0906.5132.abs b/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/0906/0906.5132.abs new file mode 100644 index 0000000..55bf151 --- /dev/null +++ b/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/0906/0906.5132.abs @@ -0,0 +1,33 @@ +------------------------------------------------------------------------------ +\\ +arXiv:0906.5132 +From: Vladimir P. Mineev +Date: Sun, 28 Jun 2009 11:24:35 GMT (17kb) +Date (revised v2): Tue, 21 Jul 2009 09:45:44 GMT (17kb) +Date (revised v3): Wed, 29 Jul 2009 11:13:43 GMT (17kb) +Date (revised v4): Thu, 8 Oct 2009 13:10:42 GMT (16kb) + +Title: Recent developments in unconventional superconductivity theory +Authors: V.P.Mineev +Categories: cond-mat.supr-con cond-mat.mtrl-sci +Comments: 15 pages +DOI: 10.1007/s10909-009-0032-7 +License: http://arxiv.org/licenses/nonexclusive-distrib/1.0/ +\\ + The review of recent developments in the unconventional superconductivity +theory is given. In the fist part I consider the physical origin of the Kerr +rotation polarization of light reflected from the surface of superconducting +$Sr_2RuO_4$. Then the comparison of magneto-optical responses in +superconductors with orbital and spin spontaneous magnetization is presented. +The latter result is applied to the estimation of the magneto-optical +properties of neutral superfluids with spontaneous magnetization. 
The second +part is devoted to the natural optical activity or gyrotropy properties of +noncentrosymmetric metals in their normal and superconducting states. The +temperature behavior of the gyrotropy coefficient is compared with the +temperature behavior of paramagnetic susceptibility determining the noticeable +increase of the paramagnetic limiting field in noncentrosymmetric +superconductors. In the last chapter I describe the order parameter and the +symmetry of superconducting state in the itinerant ferromagnet with +orthorhombic symmetry. Finally the Josephson coupling between two adjacent +ferromagnet superconducting domains is discussed. +\\ diff --git a/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/0906/0906.5504.abs b/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/0906/0906.5504.abs new file mode 100644 index 0000000..f24205b --- /dev/null +++ b/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/0906/0906.5504.abs @@ -0,0 +1,26 @@ +------------------------------------------------------------------------------ +\\ +arXiv:0906.5504 +From: Joachim Schr\"oter +Date: Tue, 30 Jun 2009 12:52:56 GMT (57kb) +Date (revised v2): Thu, 2 Jul 2009 11:54:11 GMT (57kb) +Date (revised v3): Fri, 3 Jul 2009 17:16:17 GMT (57kb) + +Title: An Extension of Friedmann-Robertson-Walker Theory beyond Big Bang +Authors: Joachim Schroeter +Categories: gr-qc +Comments: 28 pages, 2 figures +License: http://arxiv.org/licenses/nonexclusive-distrib/1.0/ +\\ + Starting from the classic Friedmann-Robertson-Walker theory with big bang it +is shown that the solutions of the field equations can be extended to negative +times. Choosing a new cosmic time scale instead of proper time one achieves +complete differentiability of the scale factor and of suitable thermodynamic +quantities equivalent to pressure and energy density. Then, the singularity of +big bang manifests itself only by the vanishing of the scale factor at time +zero. 
Moreover, all solutions of the field equations are defined for all times +from -infinity to +infinity. In a separate chapter the horizon structure of the +extended theory is studied. Some weak assumptions guarantee that there are no +horizons. Hence, the horizon problem in a strict sence disappears. An intensive +discussion of the results is given at the end of the paper. +\\ diff --git a/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/1210/1210.8438.abs b/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/1210/1210.8438.abs new file mode 100644 index 0000000..d3bc17b --- /dev/null +++ b/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/1210/1210.8438.abs @@ -0,0 +1,23 @@ +------------------------------------------------------------------------------ +\\ +arXiv:1210.8438 +From: Matei Ioan Radulescu +Date: Wed, 31 Oct 2012 18:55:27 GMT (122969kb,AD) +Date (revised v2): Thu, 1 Nov 2012 02:28:57 GMT (122970kb,AD) + +Title: Dynamics of unconfined spherical flames +Authors: Louis Leblanc, Maha Manoubi, Kadeem Dennis, Zhe (Rita) Liang, Matei I. + Radulescu +Categories: physics.flu-dyn +Comments: 2 videos are included; video submission at the 65th Annual Meeting of + the American Physical Society Division of Fluid Dynamics as part of the + Gallery of Fluid Motion +License: http://arxiv.org/licenses/nonexclusive-distrib/1.0/ +\\ + Using the soap bubble technique, we visualize the dynamics of unconfined +hydrogen-air flames using high speed schlieren video. We show that for +sufficiently weak mixtures, i.e., low flame speeds, buoyancy effects become +important. Flame balls of a critical dimension begin to rise. The experiments +are found in very good agreement with the scaling laws proposed by Zingale and +Dursi. We report the results in a fluid dynamics video. 
+\\ diff --git a/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/1605/1605.09669.abs b/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/1605/1605.09669.abs new file mode 100644 index 0000000..a4ef63a --- /dev/null +++ b/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/1605/1605.09669.abs @@ -0,0 +1,27 @@ +------------------------------------------------------------------------------ +\\ +arXiv:1605.09669 +From: Hasan Dalman +Date: Sun, 29 May 2016 17:36:46 GMT (710kb) +Date (revised v2): Wed, 1 Jun 2016 20:15:25 GMT (17kb) + +Title: Interactive Fuzzy Goal Programming Based on Taylor Series to Solve + Multiobjective Nonlinear Programming Problems with Interval Type 2 Fuzzy + Numbers +Authors: Hasan Dalman +Categories: math.OC +MSC-class: 68T37, 68T20, 68T27 +License: http://arxiv.org/licenses/nonexclusive-distrib/1.0/ +\\ + This paper presents an interactive fuzzy goal programming (FGP) approach for +solving multiobjective nonlinear programming problems (MONLPP) with interval +type 2 fuzzy numbers (IT2 FNs). The cost and time of the objective functions, +the resources, and the requirements of each kind of resources are taken to be +trapezoidal IT2 FNs. Here, the considered problem is first transformed into an +equivalent crisp MONLPP, and then the transformed MONLPP is converted into an +equivalent Multiobjective Linear Programming Problem (MOLPP). By using a +procedure based on Taylor series, this problem is reduced into a single +objective linear programming problem (LPP) which can be easily solved by Maple +18.02 optimization toolbox. Finally, the proposed solution procedure is +illustrated by two numerical examples. 
+\\ diff --git a/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/1607/1607.08199.abs b/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/1607/1607.08199.abs new file mode 100644 index 0000000..a0a016c --- /dev/null +++ b/arxiv/canonical/classic/tests/data/ftp/arxiv/papers/1607/1607.08199.abs @@ -0,0 +1,25 @@ +------------------------------------------------------------------------------ +\\ +arXiv:1607.08199 +From: Benjamin Schmidt +Date: Wed, 27 Jul 2016 17:52:31 GMT (39kb,D) +Date (revised v2): Thu, 18 Aug 2016 16:20:16 GMT (39kb,D) +Date (revised v3): Mon, 9 Jan 2017 18:52:20 GMT (41kb,D) +Date (revised v4): Mon, 20 Mar 2017 23:56:56 GMT (41kb,D) +Date (revised v5): Wed, 5 Apr 2017 18:19:17 GMT (122kb,D) + +Title: Bridgeland Stability Conditions on Fano Threefolds +Authors: Marcello Bernardara, Emanuele Macr\`i, Benjamin Schmidt, Xiaolei Zhao +Categories: math.AG +Comments: 24 pages, 1 figure. Fifth version: Official version of the journal +MSC-class: 14F05 (Primary), 14J30, 18E30 (Secondary) +Journal-ref: ?pijournal de G?om?trie Alg?brique, Volume 1 (September 1, 2017) + epiga:3255 +License: http://creativecommons.org/licenses/by-sa/4.0/ +\\ + We show the existence of Bridgeland stability conditions on all Fano +threefolds, by proving a modified version of a conjecture by Bayer, Toda, and +the second author. The key technical ingredient is a strong Bogomolov +inequality, proved recently by Chunyi Li. Additionally, we prove the original +conjecture for some toric threefolds by using the toric Frobenius morphism. 
+\\ diff --git a/arxiv/canonical/classic/tests/data/ftp/cond-mat/papers/9805/9805021.abs b/arxiv/canonical/classic/tests/data/ftp/cond-mat/papers/9805/9805021.abs new file mode 100644 index 0000000..c6b06c8 --- /dev/null +++ b/arxiv/canonical/classic/tests/data/ftp/cond-mat/papers/9805/9805021.abs @@ -0,0 +1,25 @@ +------------------------------------------------------------------------------ +\\ +arXiv:cond-mat/9805021 +From: Semjon Stepanow +Date: Sat, 2 May 1998 14:46:43 GMT (277kb) +Date (revised v2): Thu, 14 Jan 1999 14:47:38 GMT (8kb) + +Title: Localization transition of random copolymers at interfaces +Authors: Semjon Stepanow, Jens-Uwe Sommer, and Igor Ya. Erukhimovich +Categories: cond-mat.soft +Comments: 5 pages +Journal-ref: Phys. Rev. Lett. 81, 4412 (1998) +DOI: 10.1103/PhysRevLett.81.4412 +\\ + We consider adsorption of random copolymer chains onto an interface within +the model of Garel et al. Europhysics Letters 8, 9 (1989). By using the replica +method the adsorption of the copolymer at the interface is mapped onto the +problem of finding the ground state of a quantum mechanical Hamiltonian. To +study this ground state we introduce a novel variational principle for the +Green's function, which generalizes the well-known Rayleigh-Ritz method of +Quantum Mechanics to nonstationary states. Minimization with an appropriate +trial Green's function enables us to find the phase diagram for the +localization-delocalization transition for an ideal random copolymer at the +interface. 
+\\ diff --git a/arxiv/canonical/classic/tests/data/ftp/funct-an/papers/9301/9301001.abs b/arxiv/canonical/classic/tests/data/ftp/funct-an/papers/9301/9301001.abs new file mode 100644 index 0000000..fb3934c --- /dev/null +++ b/arxiv/canonical/classic/tests/data/ftp/funct-an/papers/9301/9301001.abs @@ -0,0 +1,35 @@ +------------------------------------------------------------------------------ +\\ +arXiv:funct-an/9301001 +From: Alexander Turbiner +Date: Sat, 16 Jan 1993 11:49:26 GMT (0kb,I) +Date (revised v2): Tue, 8 Sep 1998 18:01:31 GMT (38kb) + +Title: Lie-algebras and linear operators with invariant subspaces +Authors: Alexander Turbiner +Categories: funct-an math.OA +Comments: 47pp, AMS-LaTeX +Report-no: I.H.E.S./P/92/95 +MSC-class: 81C05 (Primary) 81C40, 17B15 (Secondary) +Journal-ref: in {\em Lie algebras, cohomologies and new findings in quantum + mechanics} (N. Kamran and P. J. Olver, eds.), AMS {\it Contemporary + Mathematics}, vol. 160, pp. 263--310, 1994 +\\ + A general classification of linear differential and finite-difference +operators possessing a finite-dimensional invariant subspace with a polynomial +basis (the generalized Bochner problem) is given. The main result is that any +operator with the above property must have a representation as a polynomial +element of the universal enveloping algebra of some algebra of differential +(difference) operators in finite-dimensional representation plus an operator +annihilating the finite-dimensional invariant subspace. In low dimensions a +classification is given by algebras $sl_2({\bold R})$ (for differential +operators in ${\bold R}$) and $sl_2({\bold R})_q$ (for finite-difference +operators in ${\bold R}$), $osp(2,2)$ (operators in one real and one Grassmann +variable, or equivalently, $2 \times 2$ matrix operators in ${\bold R}$), +$sl_3({\bold R})$, $sl_2({\bold R}) \oplus sl_2({\bold R})$ and $gl_2 ({\bold +R}) \ltimes {\bold R}^{r+1}\ , r$ a natural number (operators in ${\bold +R^2}$). 
A classification of linear operators possessing infinitely many +finite-dimensional invariant subspaces with a basis in polynomials is +presented. A connection to the recently-discovered quasi-exactly-solvable +spectral problems is discussed. +\\ diff --git a/arxiv/canonical/classic/tests/data/ftp/hep-th/papers/9709/9709111.abs b/arxiv/canonical/classic/tests/data/ftp/hep-th/papers/9709/9709111.abs new file mode 100644 index 0000000..f2d9b5d --- /dev/null +++ b/arxiv/canonical/classic/tests/data/ftp/hep-th/papers/9709/9709111.abs @@ -0,0 +1,34 @@ +------------------------------------------------------------------------------ +\\ +arXiv:hep-th/9709111 +From: +Date: Tue, 16 Sep 1997 14:44:58 GMT (11kb) +Date (revised v2): Tue, 23 Sep 1997 17:34:41 GMT (11kb) +Date (revised v3): Mon, 3 Nov 1997 19:27:30 GMT (14kb) + +Title: On the effective interactions of a light gravitino with matter fermions +Authors: Andrea Brignole (CERN-TH), Ferruccio Feruglio (Padua Univ.), Fabio + Zwirner (INFN-Padua) +Categories: hep-th hep-ph +Comments: 12 pages, 1 figure, plain LaTeX. An important proof added in section + 5. Final version to be published in JHEP +Report-no: CERN-TH/97-244, DFPD~97/TH/35 +Journal-ref: JHEP 9711:001,1997 +DOI: 10.1088/1126-6708/1997/11/001 +\\ + If the gravitino is light and all the other supersymmetric particles are +heavy, we can consider the effective theory describing the interactions of its +goldstino components with ordinary matter. To discuss the model-dependence of +these interactions, we take the simple case of spontaneously broken +supersymmetry and only two chiral superfields, associated with the goldstino +and a massless matter fermion. We derive the four-point effective coupling +involving two matter fermions and two goldstinos, by explicit integration of +the heavy spin-0 degrees of freedom in the low-energy limit. 
Surprisingly, our +result is not equivalent to the usual non-linear realization of supersymmetry, +where a pair of goldstinos couples to the energy-momentum tensor of the matter +fields. We solve the puzzle by enlarging the non-linear realization to include +a second independent invariant coupling, and we show that there are no other +independent couplings of this type up to this order in the low-energy +expansion. We conclude by commenting on the interpretation of our results and +on their possible phenomenological implications. +\\ diff --git a/arxiv/canonical/classic/tests/data/ftp/hep-th/papers/9901/9901001.abs b/arxiv/canonical/classic/tests/data/ftp/hep-th/papers/9901/9901001.abs new file mode 100644 index 0000000..2d13556 --- /dev/null +++ b/arxiv/canonical/classic/tests/data/ftp/hep-th/papers/9901/9901001.abs @@ -0,0 +1,20 @@ +------------------------------------------------------------------------------ +\\ +arXiv:hep-th/9901001 +From: Yosuke Imamura +Date: Fri, 1 Jan 1999 01:01:10 GMT (15kb) +Date (revised v2): Tue, 5 Jan 1999 21:36:42 GMT (15kb) +Date (revised v3): Mon, 10 May 1999 04:45:54 GMT (15kb) + +Title: String Junctions and Their Duals in Heterotic String Theory +Authors: Yosuke Imamura +Categories: hep-th +Comments: 13 pages + 4 eps figures, PTPTeX, typographical errors corrected +Report-no: YITP-99-1 +Journal-ref: Prog.Theor.Phys.101:1155-1164,1999 +DOI: 10.1143/PTP.101.1155 +\\ + We explicitly give the correspondence between spectra of heterotic string +theory compactified on $T^2$ and string junctions in type IIB theory +compactified on $S^2$. 
+\\ diff --git a/arxiv/canonical/classic/tests/data/ftp/math-ph/papers/0702/0702031.abs b/arxiv/canonical/classic/tests/data/ftp/math-ph/papers/0702/0702031.abs new file mode 100644 index 0000000..c42d46c --- /dev/null +++ b/arxiv/canonical/classic/tests/data/ftp/math-ph/papers/0702/0702031.abs @@ -0,0 +1,24 @@ +------------------------------------------------------------------------------ +\\ +arXiv:math-ph/0702031 +From: Paulo Mendon\c{c}a +Date: Fri, 9 Feb 2007 20:39:05 GMT (8kb) +Date (revised v2): Fri, 9 Feb 2007 22:31:21 GMT (8kb) +Date (revised v3): Mon, 14 May 2007 02:44:33 GMT (10kb) +Date (revised v4): Tue, 15 May 2007 03:47:31 GMT (11kb) + +Title: Probability Distribution of Curvatures of Isosurfaces in Gaussian Random + Fields +Authors: Paulo R. S. Mendonca, Rahul Bhotika and James V. Miller +Categories: math-ph math.MP +Comments: 10 pages, six references. Fuller version with correct proof of + important theorem +MSC-class: 60D05 +\\ + An expression for the joint probability distribution of the principal +curvatures at an arbitrary point in the ensemble of isosurfaces defined on +isotropic Gaussian random fields on Rn is derived. The result is obtained by +deriving symmetry properties of the ensemble of second derivative matrices of +isotropic Gaussian random fields akin to those of the Gaussian orthogonal +ensemble. 
+\\ diff --git a/arxiv/canonical/classic/tests/data/ftp/math/papers/0202/0202001.abs b/arxiv/canonical/classic/tests/data/ftp/math/papers/0202/0202001.abs new file mode 100644 index 0000000..cc1bbcc --- /dev/null +++ b/arxiv/canonical/classic/tests/data/ftp/math/papers/0202/0202001.abs @@ -0,0 +1,37 @@ +------------------------------------------------------------------------------ +\\ +arXiv:math/0202001 +From: Laurent Bartholdi +Date: Fri, 1 Feb 2002 02:02:03 GMT (391kb) +Date (revised v2): Sun, 3 Feb 2002 21:43:30 GMT (439kb) +Date (revised v3): Sat, 13 Jul 2002 22:44:27 GMT (441kb) +Date (revised v4): Fri, 13 Sep 2002 18:26:29 GMT (442kb) + +Title: From fractal groups to fractal sets +Authors: Laurent Bartholdi, Rostislav I. Grigorchuk and Volodymyr V. + Nekrashevych +Categories: math.GR math.DS +Comments: about 75 pages, with many postscript figures +MSC-class: 20E08; 37B10; 28A80 +Journal-ref: Fractals in Graz 2001, 25--118, Trends Math., Birkhauser, Basel, + 2003 +\\ + This paper is a survey, with few proofs, of ideas and notions related to +self-similarity of groups, semi-groups and their actions. It attempts to relate +these concepts to more familiar ones, such as fractals, self-similar sets, and +renormalizable dynamical systems. In particular, it presents a plausible +definition of what a "fractal group" should be, and gives many examples of such +groups. + A particularly interesting class of examples, derived from monodromy groups +of iterated branch coverings, or equivalently from Galois groups of iterated +polynomials, is presented. This class contains interesting groups from an +algebraic point of view (just-non-solvable groups, groups of intermediate +growth, branch groups,...), and at the same time the geometry of the group is +apparent in that a limit of the group identifies naturally with the Julia set +of the covering map. 
+ In its survey, the paper discusses finite-state transducers, growth of groups +and languages, limit spaces of groups, hyperbolic spaces and groups, dynamical +systems, Hecke-type operators, C^*-algebras, random matrices, ergodic theorems +and entropy of non-commuting transformations. Self-similar groups appear then +as a natural weaving thread through these seemingly different topics. +\\ diff --git a/arxiv/canonical/classic/tests/data/ftp/physics/papers/9707/9707012.abs b/arxiv/canonical/classic/tests/data/ftp/physics/papers/9707/9707012.abs new file mode 100644 index 0000000..bf8c4b7 --- /dev/null +++ b/arxiv/canonical/classic/tests/data/ftp/physics/papers/9707/9707012.abs @@ -0,0 +1,24 @@ +------------------------------------------------------------------------------ +\\ +arXiv:physics/9707012 +From: Haret Rosu +Date: Thu, 10 Jul 1997 21:14:20 GMT (3kb) +Date (revised v2): Thu, 10 Jul 1997 23:50:03 GMT (0kb,I) +Date (revised v3): Sun, 31 Aug 1997 22:36:45 GMT (0kb,I) +Date (revised v4): Mon, 4 May 1998 20:50:41 GMT (4kb) + +Title: Supersymmetric partner chirping of Newtonian free damping +Authors: H.C. Rosu, J.L. Romero, J. Socorro +Categories: math-ph math.MP quant-ph +Comments: 3 pages in LaTex +Journal-ref: Nuovo Cimento B 113 (April 1998) 549-552 +\\ + We connect the classical free damping cases by means of Rosner's construction +in supersymmetric quantum mechanics. 
Starting with the critical damping, one +can obtain in the underdamping case a chirping of instantaneous physical +frequency \omega ^{2}(t) \propto \omega_{u}^{2}sech^2(\omega_{u}t), whereas in +the overdamped case the "chirping" is of the (unphysical) type \omega +^{2}(t)\propto\omega_{o}^{2}sec^{2}(\omega_{o}t), where \omega_{u}$ and +$\omega_{o} are the underdamped and overdamped frequency parameters, +respectively +\\ diff --git a/arxiv/canonical/serialize/classic/tests/data/new.daily.log b/arxiv/canonical/classic/tests/data/new.daily.log similarity index 100% rename from arxiv/canonical/serialize/classic/tests/data/new.daily.log rename to arxiv/canonical/classic/tests/data/new.daily.log diff --git a/arxiv/canonical/tests/data/orig/adap-org/papers/9509/9509003v1.abs b/arxiv/canonical/classic/tests/data/orig/adap-org/papers/9509/9509003v1.abs similarity index 97% rename from arxiv/canonical/tests/data/orig/adap-org/papers/9509/9509003v1.abs rename to arxiv/canonical/classic/tests/data/orig/adap-org/papers/9509/9509003v1.abs index 36bbb1c..399d15f 100644 --- a/arxiv/canonical/tests/data/orig/adap-org/papers/9509/9509003v1.abs +++ b/arxiv/canonical/classic/tests/data/orig/adap-org/papers/9509/9509003v1.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:adap-org/9509003 -From: Yu Shi +From: Yu Shi Date: Thu, 5 Oct 1995 17:35:17 GMT (0kb,I) Title: Self-organization, ergodicity breaking, phase transition and diff --git a/arxiv/canonical/tests/data/orig/adap-org/papers/9509/9509003v2.abs b/arxiv/canonical/classic/tests/data/orig/adap-org/papers/9509/9509003v2.abs similarity index 97% rename from arxiv/canonical/tests/data/orig/adap-org/papers/9509/9509003v2.abs rename to arxiv/canonical/classic/tests/data/orig/adap-org/papers/9509/9509003v2.abs index 4f0e1ea..78d8f9f 100644 --- a/arxiv/canonical/tests/data/orig/adap-org/papers/9509/9509003v2.abs +++ 
b/arxiv/canonical/classic/tests/data/orig/adap-org/papers/9509/9509003v2.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:adap-org/9509003 -From: Yu Shi +From: Yu Shi Date: Thu, 5 Oct 1995 17:35:17 GMT (0kb,I) Date (revised v2): Fri, 6 Oct 1995 17:59:42 GMT (0kb,I) diff --git a/arxiv/canonical/tests/data/orig/arxiv/papers/0704/0704.0001v1.abs b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0704/0704.0001v1.abs similarity index 96% rename from arxiv/canonical/tests/data/orig/arxiv/papers/0704/0704.0001v1.abs rename to arxiv/canonical/classic/tests/data/orig/arxiv/papers/0704/0704.0001v1.abs index 1525d4e..671dfbe 100644 --- a/arxiv/canonical/tests/data/orig/arxiv/papers/0704/0704.0001v1.abs +++ b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0704/0704.0001v1.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:0704.0001 -From: Pavel Nadolsky +From: Pavel Nadolsky Date: Mon, 2 Apr 2007 19:18:42 GMT (443kb) Title: Calculation of prompt diphoton production cross sections at Tevatron and diff --git a/arxiv/canonical/tests/data/orig/arxiv/papers/0801/0801.1021v1.abs b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0801/0801.1021v1.abs similarity index 95% rename from arxiv/canonical/tests/data/orig/arxiv/papers/0801/0801.1021v1.abs rename to arxiv/canonical/classic/tests/data/orig/arxiv/papers/0801/0801.1021v1.abs index f843757..09ba7f9 100644 --- a/arxiv/canonical/tests/data/orig/arxiv/papers/0801/0801.1021v1.abs +++ b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0801/0801.1021v1.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:0801.1021 -From: Brihaye Yves +From: Brihaye Yves Date: Mon, 7 Jan 2008 16:12:30 GMT (29kb) Title: Five-dimensional rotating black holes in Einstein-Gauss-Bonnet theory diff --git a/arxiv/canonical/tests/data/orig/arxiv/papers/0801/0801.1021v2.abs 
b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0801/0801.1021v2.abs similarity index 95% rename from arxiv/canonical/tests/data/orig/arxiv/papers/0801/0801.1021v2.abs rename to arxiv/canonical/classic/tests/data/orig/arxiv/papers/0801/0801.1021v2.abs index 1b097cf..f39a84b 100644 --- a/arxiv/canonical/tests/data/orig/arxiv/papers/0801/0801.1021v2.abs +++ b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0801/0801.1021v2.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:0801.1021 -From: Brihaye Yves +From: Brihaye Yves Date: Mon, 7 Jan 2008 16:12:30 GMT (29kb) Date (revised v2): Mon, 28 Jan 2008 14:59:28 GMT (30kb) diff --git a/arxiv/canonical/tests/data/orig/arxiv/papers/0801/0801.1021v3.abs b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0801/0801.1021v3.abs similarity index 95% rename from arxiv/canonical/tests/data/orig/arxiv/papers/0801/0801.1021v3.abs rename to arxiv/canonical/classic/tests/data/orig/arxiv/papers/0801/0801.1021v3.abs index 93c3c55..6f0d874 100644 --- a/arxiv/canonical/tests/data/orig/arxiv/papers/0801/0801.1021v3.abs +++ b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0801/0801.1021v3.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:0801.1021 -From: Brihaye Yves +From: Brihaye Yves Date: Mon, 7 Jan 2008 16:12:30 GMT (29kb) Date (revised v2): Mon, 28 Jan 2008 14:59:28 GMT (30kb) Date (revised v3): Mon, 3 Mar 2008 16:29:52 GMT (30kb) diff --git a/arxiv/canonical/tests/data/orig/arxiv/papers/0802/0802.0193v1.abs b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0802/0802.0193v1.abs similarity index 96% rename from arxiv/canonical/tests/data/orig/arxiv/papers/0802/0802.0193v1.abs rename to arxiv/canonical/classic/tests/data/orig/arxiv/papers/0802/0802.0193v1.abs index 38821df..a25512b 100644 --- a/arxiv/canonical/tests/data/orig/arxiv/papers/0802/0802.0193v1.abs +++ 
b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0802/0802.0193v1.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:0802.0193 -From: Eugene Heifets +From: Eugene Heifets Date: Fri, 1 Feb 2008 20:21:07 GMT (670kb) Title: First principles modeling of oxygen adsorption on LaMnO3 (001) surface diff --git a/arxiv/canonical/tests/data/orig/arxiv/papers/0808/0808.4142v1.abs b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0808/0808.4142v1.abs similarity index 94% rename from arxiv/canonical/tests/data/orig/arxiv/papers/0808/0808.4142v1.abs rename to arxiv/canonical/classic/tests/data/orig/arxiv/papers/0808/0808.4142v1.abs index 0947dd9..9dd6a5a 100644 --- a/arxiv/canonical/tests/data/orig/arxiv/papers/0808/0808.4142v1.abs +++ b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0808/0808.4142v1.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:0808.4142 -From: Eduard De La Cruz Burelo +From: Eduard De La Cruz Burelo Date: Fri, 29 Aug 2008 18:06:12 GMT (22kb) Title: Observation of the doubly strange b baryon Omega_b- diff --git a/arxiv/canonical/tests/data/orig/arxiv/papers/0905/0905.2326v1.abs b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0905/0905.2326v1.abs similarity index 95% rename from arxiv/canonical/tests/data/orig/arxiv/papers/0905/0905.2326v1.abs rename to arxiv/canonical/classic/tests/data/orig/arxiv/papers/0905/0905.2326v1.abs index 5dc9a04..baa4d74 100644 --- a/arxiv/canonical/tests/data/orig/arxiv/papers/0905/0905.2326v1.abs +++ b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0905/0905.2326v1.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:0905.2326 -From: John Joseph Carrasco <0905.2326@example.org> +From: John Joseph Carrasco Date: Thu, 14 May 2009 17:04:44 GMT (2546kb,A) Title: The Ultraviolet Behavior of N=8 Supergravity at Four Loops diff --git 
a/arxiv/canonical/tests/data/orig/arxiv/papers/0906/0906.2112v1.abs b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0906/0906.2112v1.abs similarity index 94% rename from arxiv/canonical/tests/data/orig/arxiv/papers/0906/0906.2112v1.abs rename to arxiv/canonical/classic/tests/data/orig/arxiv/papers/0906/0906.2112v1.abs index 179e82b..3352baa 100644 --- a/arxiv/canonical/tests/data/orig/arxiv/papers/0906/0906.2112v1.abs +++ b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0906/0906.2112v1.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:0906.2112 -From: Robin de Jong +From: Robin de Jong Date: Thu, 11 Jun 2009 14:09:14 GMT (20kb) Title: Symmetric roots and admissible pairing diff --git a/arxiv/canonical/tests/data/orig/arxiv/papers/0906/0906.2112v2.abs b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0906/0906.2112v2.abs similarity index 96% rename from arxiv/canonical/tests/data/orig/arxiv/papers/0906/0906.2112v2.abs rename to arxiv/canonical/classic/tests/data/orig/arxiv/papers/0906/0906.2112v2.abs index 9ba7f0b..75dac98 100644 --- a/arxiv/canonical/tests/data/orig/arxiv/papers/0906/0906.2112v2.abs +++ b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0906/0906.2112v2.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:0906.2112 -From: Robin de Jong +From: Robin de Jong Date: Thu, 11 Jun 2009 14:09:14 GMT (20kb) Date (revised v2): Mon, 9 Aug 2010 13:12:58 GMT (21kb) diff --git a/arxiv/canonical/tests/data/orig/arxiv/papers/0906/0906.3336v1.abs b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0906/0906.3336v1.abs similarity index 94% rename from arxiv/canonical/tests/data/orig/arxiv/papers/0906/0906.3336v1.abs rename to arxiv/canonical/classic/tests/data/orig/arxiv/papers/0906/0906.3336v1.abs index 00db722..031cc6d 100644 --- a/arxiv/canonical/tests/data/orig/arxiv/papers/0906/0906.3336v1.abs +++ 
b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0906/0906.3336v1.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:0906.3336 -From: Huang Guangyue +From: Huang Guangyue Date: Thu, 18 Jun 2009 02:19:29 GMT (8kb) Title: Lower order eigenvalues of the biharmonic operator diff --git a/arxiv/canonical/tests/data/orig/arxiv/papers/0906/0906.3421v1.abs b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0906/0906.3421v1.abs similarity index 92% rename from arxiv/canonical/tests/data/orig/arxiv/papers/0906/0906.3421v1.abs rename to arxiv/canonical/classic/tests/data/orig/arxiv/papers/0906/0906.3421v1.abs index c38b15b..36beebe 100644 --- a/arxiv/canonical/tests/data/orig/arxiv/papers/0906/0906.3421v1.abs +++ b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0906/0906.3421v1.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:0906.3421 -From: Philippe Di Francesco +From: Philippe Di Francesco Date: Thu, 18 Jun 2009 12:03:57 GMT (67kb) Title: Q-system Cluster Algebras, Paths and Total Positivity diff --git a/arxiv/canonical/tests/data/orig/arxiv/papers/0906/0906.3421v2.abs b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0906/0906.3421v2.abs similarity index 93% rename from arxiv/canonical/tests/data/orig/arxiv/papers/0906/0906.3421v2.abs rename to arxiv/canonical/classic/tests/data/orig/arxiv/papers/0906/0906.3421v2.abs index 194518f..3abc14b 100644 --- a/arxiv/canonical/tests/data/orig/arxiv/papers/0906/0906.3421v2.abs +++ b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0906/0906.3421v2.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:0906.3421 -From: Philippe Di Francesco +From: Philippe Di Francesco Date: Thu, 18 Jun 2009 12:03:57 GMT (67kb) Date (revised v2): Mon, 14 Sep 2009 09:39:31 GMT (67kb) diff --git a/arxiv/canonical/tests/data/orig/arxiv/papers/0906/0906.5132v1.abs 
b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0906/0906.5132v1.abs similarity index 96% rename from arxiv/canonical/tests/data/orig/arxiv/papers/0906/0906.5132v1.abs rename to arxiv/canonical/classic/tests/data/orig/arxiv/papers/0906/0906.5132v1.abs index 729466b..7c7662e 100644 --- a/arxiv/canonical/tests/data/orig/arxiv/papers/0906/0906.5132v1.abs +++ b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0906/0906.5132v1.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:0906.5132 -From: Vladimir P. Mineev +From: Vladimir P. Mineev Date: Sun, 28 Jun 2009 11:24:35 GMT (17kb) Title: Recent developments in unconventional superconductivity theory diff --git a/arxiv/canonical/tests/data/orig/arxiv/papers/0906/0906.5132v2.abs b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0906/0906.5132v2.abs similarity index 96% rename from arxiv/canonical/tests/data/orig/arxiv/papers/0906/0906.5132v2.abs rename to arxiv/canonical/classic/tests/data/orig/arxiv/papers/0906/0906.5132v2.abs index 553058f..83787cd 100644 --- a/arxiv/canonical/tests/data/orig/arxiv/papers/0906/0906.5132v2.abs +++ b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0906/0906.5132v2.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:0906.5132 -From: Vladimir P. Mineev +From: Vladimir P. 
Mineev Date: Sun, 28 Jun 2009 11:24:35 GMT (17kb) Date (revised v2): Tue, 21 Jul 2009 09:45:44 GMT (17kb) diff --git a/arxiv/canonical/tests/data/orig/arxiv/papers/0906/0906.5132v3.abs b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0906/0906.5132v3.abs similarity index 97% rename from arxiv/canonical/tests/data/orig/arxiv/papers/0906/0906.5132v3.abs rename to arxiv/canonical/classic/tests/data/orig/arxiv/papers/0906/0906.5132v3.abs index 2c016ed..d965d06 100644 --- a/arxiv/canonical/tests/data/orig/arxiv/papers/0906/0906.5132v3.abs +++ b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0906/0906.5132v3.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:0906.5132 -From: Vladimir P. Mineev +From: Vladimir P. Mineev Date: Sun, 28 Jun 2009 11:24:35 GMT (17kb) Date (revised v2): Tue, 21 Jul 2009 09:45:44 GMT (17kb) Date (revised v3): Wed, 29 Jul 2009 11:13:43 GMT (17kb) diff --git a/arxiv/canonical/tests/data/orig/arxiv/papers/0906/0906.5504v1.abs b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0906/0906.5504v1.abs similarity index 96% rename from arxiv/canonical/tests/data/orig/arxiv/papers/0906/0906.5504v1.abs rename to arxiv/canonical/classic/tests/data/orig/arxiv/papers/0906/0906.5504v1.abs index ac57b97..41f6806 100644 --- a/arxiv/canonical/tests/data/orig/arxiv/papers/0906/0906.5504v1.abs +++ b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0906/0906.5504v1.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:0906.5504 -From: Joachim Schr\"oter +From: Joachim Schr\"oter Date: Tue, 30 Jun 2009 12:52:56 GMT (57kb) Title: An Extension of Friedmann-Robertson-Walker Theory beyond Big Bang diff --git a/arxiv/canonical/tests/data/orig/arxiv/papers/0906/0906.5504v2.abs b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0906/0906.5504v2.abs similarity index 96% rename from 
arxiv/canonical/tests/data/orig/arxiv/papers/0906/0906.5504v2.abs rename to arxiv/canonical/classic/tests/data/orig/arxiv/papers/0906/0906.5504v2.abs index e8fb8d1..e0b8a41 100644 --- a/arxiv/canonical/tests/data/orig/arxiv/papers/0906/0906.5504v2.abs +++ b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/0906/0906.5504v2.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:0906.5504 -From: Joachim Schr\"oter <2@JiwAlD.7H> +From: Joachim Schr\"oter Date: Tue, 30 Jun 2009 12:52:56 GMT (57kb) Date (revised v2): Thu, 2 Jul 2009 11:54:11 GMT (57kb) diff --git a/arxiv/canonical/tests/data/orig/arxiv/papers/1210/1210.8438v1.abs b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/1210/1210.8438v1.abs similarity index 95% rename from arxiv/canonical/tests/data/orig/arxiv/papers/1210/1210.8438v1.abs rename to arxiv/canonical/classic/tests/data/orig/arxiv/papers/1210/1210.8438v1.abs index 9bab0a9..2aad40a 100644 --- a/arxiv/canonical/tests/data/orig/arxiv/papers/1210/1210.8438v1.abs +++ b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/1210/1210.8438v1.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:1210.8438 -From: Matei Ioan Radulescu +From: Matei Ioan Radulescu Date: Wed, 31 Oct 2012 18:55:27 GMT (122969kb,AD) Title: Dynamics of unconfined spherical flames diff --git a/arxiv/canonical/tests/data/orig/arxiv/papers/1605/1605.09669v1.abs b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/1605/1605.09669v1.abs similarity index 96% rename from arxiv/canonical/tests/data/orig/arxiv/papers/1605/1605.09669v1.abs rename to arxiv/canonical/classic/tests/data/orig/arxiv/papers/1605/1605.09669v1.abs index 65c3ef1..c16ba9e 100644 --- a/arxiv/canonical/tests/data/orig/arxiv/papers/1605/1605.09669v1.abs +++ b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/1605/1605.09669v1.abs @@ -1,7 +1,7 @@ 
------------------------------------------------------------------------------ \\ arXiv:1605.09669 -From: Hasan Dalman Dr <1605.09669@example.org> +From: Hasan Dalman Dr Date: Sun, 29 May 2016 17:36:46 GMT (710kb) Title: Interactive Fuzzy Goal Programming Based on Taylor Series to Solve diff --git a/arxiv/canonical/tests/data/orig/arxiv/papers/1607/1607.08199v1.abs b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/1607/1607.08199v1.abs similarity index 94% rename from arxiv/canonical/tests/data/orig/arxiv/papers/1607/1607.08199v1.abs rename to arxiv/canonical/classic/tests/data/orig/arxiv/papers/1607/1607.08199v1.abs index 3160c54..4b0ea21 100644 --- a/arxiv/canonical/tests/data/orig/arxiv/papers/1607/1607.08199v1.abs +++ b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/1607/1607.08199v1.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:1607.08199 -From: Benjamin Schmidt +From: Benjamin Schmidt Date: Wed, 27 Jul 2016 17:52:31 GMT (39kb,D) Title: Bridgeland Stability Conditions on Fano Threefolds diff --git a/arxiv/canonical/tests/data/orig/arxiv/papers/1607/1607.08199v2.abs b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/1607/1607.08199v2.abs similarity index 95% rename from arxiv/canonical/tests/data/orig/arxiv/papers/1607/1607.08199v2.abs rename to arxiv/canonical/classic/tests/data/orig/arxiv/papers/1607/1607.08199v2.abs index 9e85d77..475f2da 100644 --- a/arxiv/canonical/tests/data/orig/arxiv/papers/1607/1607.08199v2.abs +++ b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/1607/1607.08199v2.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:1607.08199 -From: Marcello Bernardara +From: Marcello Bernardara Date: Wed, 27 Jul 2016 17:52:31 GMT (39kb,D) Date (revised v2): Thu, 18 Aug 2016 16:20:16 GMT (39kb,D) diff --git a/arxiv/canonical/tests/data/orig/arxiv/papers/1607/1607.08199v3.abs 
b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/1607/1607.08199v3.abs similarity index 95% rename from arxiv/canonical/tests/data/orig/arxiv/papers/1607/1607.08199v3.abs rename to arxiv/canonical/classic/tests/data/orig/arxiv/papers/1607/1607.08199v3.abs index 0d525a9..9856971 100644 --- a/arxiv/canonical/tests/data/orig/arxiv/papers/1607/1607.08199v3.abs +++ b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/1607/1607.08199v3.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:1607.08199 -From: Benjamin Schmidt +From: Benjamin Schmidt Date: Wed, 27 Jul 2016 17:52:31 GMT (39kb,D) Date (revised v2): Thu, 18 Aug 2016 16:20:16 GMT (39kb,D) Date (revised v3): Mon, 9 Jan 2017 18:52:20 GMT (41kb,D) diff --git a/arxiv/canonical/tests/data/orig/arxiv/papers/1607/1607.08199v4.abs b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/1607/1607.08199v4.abs similarity index 95% rename from arxiv/canonical/tests/data/orig/arxiv/papers/1607/1607.08199v4.abs rename to arxiv/canonical/classic/tests/data/orig/arxiv/papers/1607/1607.08199v4.abs index e935373..35df94b 100644 --- a/arxiv/canonical/tests/data/orig/arxiv/papers/1607/1607.08199v4.abs +++ b/arxiv/canonical/classic/tests/data/orig/arxiv/papers/1607/1607.08199v4.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:1607.08199 -From: Benjamin Schmidt +From: Benjamin Schmidt Date: Wed, 27 Jul 2016 17:52:31 GMT (39kb,D) Date (revised v2): Thu, 18 Aug 2016 16:20:16 GMT (39kb,D) Date (revised v3): Mon, 9 Jan 2017 18:52:20 GMT (41kb,D) diff --git a/arxiv/canonical/tests/data/orig/cond-mat/papers/9805/9805021v1.abs b/arxiv/canonical/classic/tests/data/orig/cond-mat/papers/9805/9805021v1.abs similarity index 96% rename from arxiv/canonical/tests/data/orig/cond-mat/papers/9805/9805021v1.abs rename to arxiv/canonical/classic/tests/data/orig/cond-mat/papers/9805/9805021v1.abs index 7fab206..50744e6 100644 --- 
a/arxiv/canonical/tests/data/orig/cond-mat/papers/9805/9805021v1.abs +++ b/arxiv/canonical/classic/tests/data/orig/cond-mat/papers/9805/9805021v1.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:cond-mat/9805021 -From: Semjon Stepanow +From: Semjon Stepanow Date: Sat, 2 May 1998 14:46:43 GMT (277kb) Title: Localization transition of random copolymers at interfaces diff --git a/arxiv/canonical/tests/data/orig/cond-mat/papers/9805/9805021v1.ps.gz b/arxiv/canonical/classic/tests/data/orig/cond-mat/papers/9805/9805021v1.ps.gz similarity index 100% rename from arxiv/canonical/tests/data/orig/cond-mat/papers/9805/9805021v1.ps.gz rename to arxiv/canonical/classic/tests/data/orig/cond-mat/papers/9805/9805021v1.ps.gz diff --git a/arxiv/canonical/tests/data/orig/funct-an/papers/9301/9301001v1.abs b/arxiv/canonical/classic/tests/data/orig/funct-an/papers/9301/9301001v1.abs similarity index 97% rename from arxiv/canonical/tests/data/orig/funct-an/papers/9301/9301001v1.abs rename to arxiv/canonical/classic/tests/data/orig/funct-an/papers/9301/9301001v1.abs index 7980f85..3c6edf3 100644 --- a/arxiv/canonical/tests/data/orig/funct-an/papers/9301/9301001v1.abs +++ b/arxiv/canonical/classic/tests/data/orig/funct-an/papers/9301/9301001v1.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:funct-an/9301001 -From: Alexander Turbiner +From: Alexander Turbiner Date: Sat, 16 Jan 1993 11:49:26 GMT (0kb,I) Title: Lie-algebras and linear operators with invariant subspaces diff --git a/arxiv/canonical/tests/data/orig/hep-th/papers/9709/9709111v1.abs b/arxiv/canonical/classic/tests/data/orig/hep-th/papers/9709/9709111v1.abs similarity index 98% rename from arxiv/canonical/tests/data/orig/hep-th/papers/9709/9709111v1.abs rename to arxiv/canonical/classic/tests/data/orig/hep-th/papers/9709/9709111v1.abs index 8881014..14dcda0 100644 --- 
a/arxiv/canonical/tests/data/orig/hep-th/papers/9709/9709111v1.abs +++ b/arxiv/canonical/classic/tests/data/orig/hep-th/papers/9709/9709111v1.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:hep-th/9709111 -From: +From: Date: Tue, 16 Sep 1997 14:44:58 GMT (11kb) Title: On the effective interactions of a light gravitino with matter fermions diff --git a/arxiv/canonical/tests/data/orig/hep-th/papers/9709/9709111v2.abs b/arxiv/canonical/classic/tests/data/orig/hep-th/papers/9709/9709111v2.abs similarity index 98% rename from arxiv/canonical/tests/data/orig/hep-th/papers/9709/9709111v2.abs rename to arxiv/canonical/classic/tests/data/orig/hep-th/papers/9709/9709111v2.abs index 245142b..ef2120f 100644 --- a/arxiv/canonical/tests/data/orig/hep-th/papers/9709/9709111v2.abs +++ b/arxiv/canonical/classic/tests/data/orig/hep-th/papers/9709/9709111v2.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:hep-th/9709111 -From: +From: Date: Tue, 16 Sep 1997 14:44:58 GMT (11kb) Date (revised v2): Tue, 23 Sep 1997 17:34:41 GMT (11kb) diff --git a/arxiv/canonical/tests/data/orig/hep-th/papers/9901/9901001v1.abs b/arxiv/canonical/classic/tests/data/orig/hep-th/papers/9901/9901001v1.abs similarity index 89% rename from arxiv/canonical/tests/data/orig/hep-th/papers/9901/9901001v1.abs rename to arxiv/canonical/classic/tests/data/orig/hep-th/papers/9901/9901001v1.abs index 7fb391b..63291ab 100644 --- a/arxiv/canonical/tests/data/orig/hep-th/papers/9901/9901001v1.abs +++ b/arxiv/canonical/classic/tests/data/orig/hep-th/papers/9901/9901001v1.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:hep-th/9901001 -From: "Yosuke Imamura" +From: "Yosuke Imamura" Date: Fri, 1 Jan 1999 01:01:10 GMT (15kb) Title: String Junctions and Their Duals in Heterotic String Theory diff --git 
a/arxiv/canonical/tests/data/orig/hep-th/papers/9901/9901001v2.abs b/arxiv/canonical/classic/tests/data/orig/hep-th/papers/9901/9901001v2.abs similarity index 91% rename from arxiv/canonical/tests/data/orig/hep-th/papers/9901/9901001v2.abs rename to arxiv/canonical/classic/tests/data/orig/hep-th/papers/9901/9901001v2.abs index 46c478e..fe2a24c 100644 --- a/arxiv/canonical/tests/data/orig/hep-th/papers/9901/9901001v2.abs +++ b/arxiv/canonical/classic/tests/data/orig/hep-th/papers/9901/9901001v2.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:hep-th/9901001 -From: "Yosuke Imamura" +From: "Yosuke Imamura" Date: Fri, 1 Jan 1999 01:01:10 GMT (15kb) Date (revised v2): Tue, 5 Jan 1999 21:36:42 GMT (15kb) diff --git a/arxiv/canonical/tests/data/orig/math-ph/papers/0702/0702031v1.abs b/arxiv/canonical/classic/tests/data/orig/math-ph/papers/0702/0702031v1.abs similarity index 92% rename from arxiv/canonical/tests/data/orig/math-ph/papers/0702/0702031v1.abs rename to arxiv/canonical/classic/tests/data/orig/math-ph/papers/0702/0702031v1.abs index 5809d6d..eb7920c 100644 --- a/arxiv/canonical/tests/data/orig/math-ph/papers/0702/0702031v1.abs +++ b/arxiv/canonical/classic/tests/data/orig/math-ph/papers/0702/0702031v1.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:math-ph/0702031 -From: Paulo Mendon\c{c}a +From: Paulo Mendon\c{c}a Date: Fri, 9 Feb 2007 20:39:05 GMT (8kb) Title: Probability Distribution of Curvatures of Isosurfaces in Gaussian Random diff --git a/arxiv/canonical/tests/data/orig/math-ph/papers/0702/0702031v2.abs b/arxiv/canonical/classic/tests/data/orig/math-ph/papers/0702/0702031v2.abs similarity index 93% rename from arxiv/canonical/tests/data/orig/math-ph/papers/0702/0702031v2.abs rename to arxiv/canonical/classic/tests/data/orig/math-ph/papers/0702/0702031v2.abs index bbfb3a5..6378486 100644 --- 
a/arxiv/canonical/tests/data/orig/math-ph/papers/0702/0702031v2.abs +++ b/arxiv/canonical/classic/tests/data/orig/math-ph/papers/0702/0702031v2.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:math-ph/0702031 -From: Paulo Mendon\c{c}a +From: Paulo Mendon\c{c}a Date: Fri, 9 Feb 2007 20:39:05 GMT (8kb) Date (revised v2): Fri, 9 Feb 2007 22:31:21 GMT (8kb) diff --git a/arxiv/canonical/tests/data/orig/math-ph/papers/0702/0702031v3.abs b/arxiv/canonical/classic/tests/data/orig/math-ph/papers/0702/0702031v3.abs similarity index 94% rename from arxiv/canonical/tests/data/orig/math-ph/papers/0702/0702031v3.abs rename to arxiv/canonical/classic/tests/data/orig/math-ph/papers/0702/0702031v3.abs index 859eece..c493bef 100644 --- a/arxiv/canonical/tests/data/orig/math-ph/papers/0702/0702031v3.abs +++ b/arxiv/canonical/classic/tests/data/orig/math-ph/papers/0702/0702031v3.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:math-ph/0702031 -From: Paulo Mendon\c{c}a +From: Paulo Mendon\c{c}a Date: Fri, 9 Feb 2007 20:39:05 GMT (8kb) Date (revised v2): Fri, 9 Feb 2007 22:31:21 GMT (8kb) Date (revised v3): Mon, 14 May 2007 02:44:33 GMT (10kb) diff --git a/arxiv/canonical/tests/data/orig/math/papers/0202/0202001v1.abs b/arxiv/canonical/classic/tests/data/orig/math/papers/0202/0202001v1.abs similarity index 97% rename from arxiv/canonical/tests/data/orig/math/papers/0202/0202001v1.abs rename to arxiv/canonical/classic/tests/data/orig/math/papers/0202/0202001v1.abs index b41b9d8..3853b4e 100644 --- a/arxiv/canonical/tests/data/orig/math/papers/0202/0202001v1.abs +++ b/arxiv/canonical/classic/tests/data/orig/math/papers/0202/0202001v1.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:math/0202001 -From: Laurent Bartholdi +From: Laurent Bartholdi Date: Fri, 1 Feb 2002 02:02:03 GMT (391kb) Title: From fractal groups to 
fractal sets diff --git a/arxiv/canonical/tests/data/orig/math/papers/0202/0202001v2.abs b/arxiv/canonical/classic/tests/data/orig/math/papers/0202/0202001v2.abs similarity index 97% rename from arxiv/canonical/tests/data/orig/math/papers/0202/0202001v2.abs rename to arxiv/canonical/classic/tests/data/orig/math/papers/0202/0202001v2.abs index b6381e0..0703e33 100644 --- a/arxiv/canonical/tests/data/orig/math/papers/0202/0202001v2.abs +++ b/arxiv/canonical/classic/tests/data/orig/math/papers/0202/0202001v2.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:math/0202001 -From: Laurent Bartholdi +From: Laurent Bartholdi Date: Fri, 1 Feb 2002 02:02:03 GMT (391kb) Date (revised v2): Sun, 3 Feb 2002 21:43:30 GMT (439kb) diff --git a/arxiv/canonical/tests/data/orig/math/papers/0202/0202001v3.abs b/arxiv/canonical/classic/tests/data/orig/math/papers/0202/0202001v3.abs similarity index 97% rename from arxiv/canonical/tests/data/orig/math/papers/0202/0202001v3.abs rename to arxiv/canonical/classic/tests/data/orig/math/papers/0202/0202001v3.abs index 9933fde..8f9884a 100644 --- a/arxiv/canonical/tests/data/orig/math/papers/0202/0202001v3.abs +++ b/arxiv/canonical/classic/tests/data/orig/math/papers/0202/0202001v3.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:math/0202001 -From: Laurent Bartholdi +From: Laurent Bartholdi Date: Fri, 1 Feb 2002 02:02:03 GMT (391kb) Date (revised v2): Sun, 3 Feb 2002 21:43:30 GMT (439kb) Date (revised v3): Sat, 13 Jul 2002 22:44:27 GMT (441kb) diff --git a/arxiv/canonical/tests/data/orig/physics/papers/9707/9707012v1.abs b/arxiv/canonical/classic/tests/data/orig/physics/papers/9707/9707012v1.abs similarity index 95% rename from arxiv/canonical/tests/data/orig/physics/papers/9707/9707012v1.abs rename to arxiv/canonical/classic/tests/data/orig/physics/papers/9707/9707012v1.abs index a906595..3bcd837 100644 --- 
a/arxiv/canonical/tests/data/orig/physics/papers/9707/9707012v1.abs +++ b/arxiv/canonical/classic/tests/data/orig/physics/papers/9707/9707012v1.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:physics/9707012 -From: Haret Rosu +From: Haret Rosu Date: Thu, 10 Jul 1997 21:14:20 GMT (3kb) Title: Supersymmetric partner chirping of Newtonian free damping diff --git a/arxiv/canonical/tests/data/orig/physics/papers/9707/9707012v2.abs b/arxiv/canonical/classic/tests/data/orig/physics/papers/9707/9707012v2.abs similarity index 96% rename from arxiv/canonical/tests/data/orig/physics/papers/9707/9707012v2.abs rename to arxiv/canonical/classic/tests/data/orig/physics/papers/9707/9707012v2.abs index ef542b0..b52e6d2 100644 --- a/arxiv/canonical/tests/data/orig/physics/papers/9707/9707012v2.abs +++ b/arxiv/canonical/classic/tests/data/orig/physics/papers/9707/9707012v2.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:physics/9707012 -From: Haret Rosu +From: Haret Rosu Date: Thu, 10 Jul 1997 21:14:20 GMT (3kb) Date (revised v2): Thu, 10 Jul 1997 23:50:03 GMT (0kb,I) diff --git a/arxiv/canonical/tests/data/orig/physics/papers/9707/9707012v3.abs b/arxiv/canonical/classic/tests/data/orig/physics/papers/9707/9707012v3.abs similarity index 96% rename from arxiv/canonical/tests/data/orig/physics/papers/9707/9707012v3.abs rename to arxiv/canonical/classic/tests/data/orig/physics/papers/9707/9707012v3.abs index b385100..3550c49 100644 --- a/arxiv/canonical/tests/data/orig/physics/papers/9707/9707012v3.abs +++ b/arxiv/canonical/classic/tests/data/orig/physics/papers/9707/9707012v3.abs @@ -1,7 +1,7 @@ ------------------------------------------------------------------------------ \\ arXiv:physics/9707012 -From: Haret Rosu +From: Haret Rosu Date: Thu, 10 Jul 1997 21:14:20 GMT (3kb) Date (revised v2): Thu, 10 Jul 1997 23:50:03 GMT (0kb,I) Date (revised v3): Sun, 31 Aug 1997 
22:36:45 GMT (0kb,I) diff --git a/arxiv/canonical/classic/tests/data/withdrawn.abs b/arxiv/canonical/classic/tests/data/withdrawn.abs new file mode 100644 index 0000000..7f56d82 --- /dev/null +++ b/arxiv/canonical/classic/tests/data/withdrawn.abs @@ -0,0 +1,35 @@ +------------------------------------------------------------------------------ +\\ +arXiv:1606.01467 +From: Jie Fu +Date: Sun, 5 Jun 2016 06:42:56 GMT (352kb,D) +Date (revised v2): Wed, 8 Jun 2016 17:02:49 GMT (380kb,D) +Date (revised v3): Mon, 1 Aug 2016 14:17:18 GMT (254kb,D) +Date (revised v4): Sun, 16 Oct 2016 04:20:25 GMT (42kb,D) +Date (revised v5): Mon, 7 Nov 2016 05:27:02 GMT (318kb,D) +Date (revised v6): Fri, 11 Nov 2016 06:22:25 GMT (313kb,D) +Date (revised v7): Thu, 17 Nov 2016 06:16:24 GMT (0kb,I) + +Title: Deep Q-Networks for Accelerating the Training of Deep Neural Networks +Authors: Jie Fu +Categories: cs.LG cs.NE +Comments: This paper has been withdrawn by the author due to a crucial error in + the source-code (the epsilon configuration), which makes the results invalid +License: http://arxiv.org/licenses/nonexclusive-distrib/1.0/ +\\ + In this paper, we propose a principled deep reinforcement learning (RL) +approach that is able to accelerate the convergence rate of general deep neural +networks (DNNs). With our approach, a deep RL agent (synonym for +\emph{optimizer} in this work) is used to automatically learn policies about +how to schedule learning rates during the optimization of a DNN. The state +features of the agent are learned from the weight statistics of the optimizee +during training. The reward function of this agent is designed to learn +policies that minimize the optimizee's training time given a certain +performance goal. The actions of the agent correspond to changing the learning +rate for the optimizee during training. As far as we know, this is the first +attempt to use deep RL to learn how to optimize a large-sized DNN. 
We perform +extensive experiments on a standard benchmark dataset and demonstrate the +effectiveness of the policies learned by our approach. All source code for +reproducing the experiments can be downloaded from +https://github.com/bigaidream-projects/qan +\\ diff --git a/arxiv/canonical/classic/tests/test_abs.py b/arxiv/canonical/classic/tests/test_abs.py new file mode 100644 index 0000000..e6fbf6f --- /dev/null +++ b/arxiv/canonical/classic/tests/test_abs.py @@ -0,0 +1,17 @@ +"""Tests for :mod:`arxiv.canonical.classic.abs`.""" + +import os +from unittest import TestCase, mock + +from ...domain import EventType +from .. import abs + + +class TestParseWithdrawn(TestCase): + """Parse abs file for a withdrawn e-print.""" + DATA = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data') + + def test_withdrawn(self): + """Parsed data should indicate withdrawn submission.""" + data = abs._parse(os.path.join(self.DATA, 'withdrawn.abs')) + self.assertEqual(data.submission_type, EventType.WITHDRAWN) \ No newline at end of file diff --git a/arxiv/canonical/classic/tests/test_backfill.py b/arxiv/canonical/classic/tests/test_backfill.py new file mode 100644 index 0000000..31637ad --- /dev/null +++ b/arxiv/canonical/classic/tests/test_backfill.py @@ -0,0 +1,702 @@ +"""Tests for :mod:`arxiv.canonical.classic.backfill`.""" + +import io +import json +import os +import tempfile +import cProfile as profile +from datetime import date, datetime +from pprint import pprint +from unittest import TestCase, mock + +from pytz import timezone + +from ...domain import ContentType, CanonicalFile, Category, EventType, \ + Identifier, License, URI, VersionedIdentifier +from ...log import Log +from ...register import IRegisterAPI, RegisterAPI +from ...services import InMemoryStorage, CanonicalFilesystem, Filesystem, \ + RemoteSource +from .. 
import backfill, abs, daily + +ET = timezone('US/Eastern') + + +class TestBackfillWithData(TestCase): + """ + This runs backfill on a subset of identifiers using daily.log. + + To run this test, set the environment variable DAILY_PATH to the + full path to daily.log. + """ + + __test__ = bool(os.environ.get('DAILY_PATH', None) is not None) + + def setUp(self): + self.state_path = tempfile.mkdtemp() + self.record_path = tempfile.mkdtemp() + # self.state_path = '/var/folders/l7/5ygyvtbs29340t2s2nsq4lh00000gp/T/tmpmrktp19_' + # self.record_path = '/var/folders/l7/5ygyvtbs29340t2s2nsq4lh00000gp/T/tmpengvav2y' + self.cache_path = './.cache' + print('state_path ::', self.state_path) + print('record_path ::', self.record_path) + # self.mock_source = mock.MagicMock() + # self.mock_source.can_resolve.return_value = True + # self.mock_source.load = \ + # lambda *a, **k: io.BytesIO(b'foocontent') + + self.abs_path = os.path.join( + os.path.dirname(os.path.realpath(__file__)), + 'data' + ) + self.ps_cache_path = os.path.join(self.abs_path, 'cache') + + self.storage = CanonicalFilesystem(self.record_path) + self.classic = Filesystem(self.abs_path) + self.remote = RemoteSource('arxiv.org') + self.api = RegisterAPI(self.storage, + [self.storage, self.classic, self.remote]) + + self.daily_path = os.environ.get('DAILY_PATH') + self.identifiers = [ + Identifier('adap-org/9509003'), + Identifier('0704.0001'), + Identifier('0801.1021'), + Identifier('0802.0193'), + Identifier('0808.4142'), + Identifier('0905.2326'), + Identifier('0906.2112'), + Identifier('0906.3336'), + Identifier('0906.3421'), + Identifier('0906.5132'), + Identifier('0906.5504'), + Identifier('1210.8438'), + Identifier('1605.09669'), + Identifier('1607.08199'), + Identifier('cond-mat/9805021'), + Identifier('funct-an/9301001'), + Identifier('hep-th/9709111'), + Identifier('hep-th/9901001'), + Identifier('math/0202001'), + Identifier('math-ph/0702031'), + Identifier('physics/9707012') + ] + + def 
test_backfill_with_content(self): + try: + for e in backfill.backfill(self.api, + self.daily_path, + self.abs_path, + self.ps_cache_path, + self.state_path, + limit_to=set(self.identifiers), + cache_path=self.cache_path, + until=date(2000, 1, 1)): + print(e.identifier, e.event_type, e.event_date) + finally: + print('state_path ::', self.state_path) + print('record_path ::', self.record_path) + + + events, N = self.api.load_events(1997) + events = list(events) + self.assertEqual( + len([e for e in events if e.event_type == EventType.NEW]), + 2, 'There are two NEW events in 1997' + ) + + +# class TestBackfillRecord(TestCase): +# def setUp(self): +# """The classic record has two e-prints.""" +# # One of them was first announced prior to the daily record. +# self.ident = Identifier('1902.00123') +# # The other one was announced after the daily record began. +# self.ident2 = Identifier('1902.00125') + +# self.state_path = tempfile.mkdtemp() +# self.events = [ +# # The first event we have in the daily record for 1902.00123. +# daily.EventData( +# arxiv_id=self.ident, +# event_date=date(2019, 2, 9), +# event_type=EventType.CROSSLIST, +# version=-1, # Who knows what version this is? +# categories=[Category('cs.WT')], +# ), +# # Here is where 1902.00125 is first announced. +# daily.EventData( +# arxiv_id=self.ident2, +# event_date=date(2019, 2, 9), +# event_type=EventType.NEW, +# version=1, +# categories=[ +# Category('cs.DL'), +# Category('cs.IR'), +# ] +# ), +# # Here is where the second version of 1902.00123 is announced. +# daily.EventData( +# arxiv_id=self.ident, +# event_date=date(2019, 2, 10), +# event_type=EventType.REPLACED, +# version=-1, # Who knows what version this is? +# categories=[ +# Category('cs.DL'), +# Category('cs.IR'), +# Category('cs.WT'), +# Category('cs.FO') +# ] +# ) +# ] + +# # We have abs records for everything... +# self.abs = [ +# # The first version of 1902.00123 (pre-daily). 
+# abs.AbsData( +# identifier=VersionedIdentifier.from_parts(self.ident, 1), +# submitter=None, +# submitted_date=date(2019, 2, 1), +# announced_month='2019-02', +# updated_date=datetime.now(), +# license=License('http://foo.license'), +# primary_classification=Category('cs.DL'), +# title='foo title before daily.log existed', +# abstract='very abstract', +# authors='Ima N. Author', +# size_kilobytes=42, +# submission_type=EventType.NEW, +# secondary_classification=[ +# Category('cs.IR'), +# Category('cs.WT'), # <- This was added by a cross event! +# ], +# ), +# # The second version of 1902.00123, which was noted in daily.log. +# abs.AbsData( +# identifier=VersionedIdentifier.from_parts(self.ident, 2), +# submitter=None, +# submitted_date=date(2019, 2, 9), +# announced_month='2019-02', +# updated_date=datetime.now(), +# license=License('http://foo.license'), +# primary_classification=Category('cs.DL'), +# title='fooooo title after daily.log exists', +# abstract='very abstract', +# authors='Ima N. Author', +# size_kilobytes=42, +# submission_type=EventType.REPLACED, +# secondary_classification=[ +# Category('cs.IR'), +# Category('cs.WT'), +# Category('cs.FO') +# ], +# ), +# # The first version of 1902.00125, which was noted in daily.log. +# abs.AbsData( +# identifier=VersionedIdentifier.from_parts(self.ident2, 1), +# submitter=None, +# submitted_date=date(2019, 2, 9), +# announced_month='2019-02', +# updated_date=datetime.now(), +# license=License('http://foo.license'), +# primary_classification=Category('cs.DL'), +# title='another very cool title', +# abstract='very abstract', +# authors='Ima N. 
Author', +# size_kilobytes=42, +# submission_type=EventType.REPLACED, +# secondary_classification=[ +# Category('cs.IR'), +# ], +# ) +# ] + +# def _get_abs(path, identifier, *args, **kwargs): +# for a in self.abs: +# if a.identifier == identifier: +# return a +# raise RuntimeError(f'No such abs: {identifier}') + +# self._get_abs = _get_abs + +# @mock.patch(f'{backfill.__name__}.content', mock.MagicMock()) +# @mock.patch(f'{backfill.__name__}.daily') +# @mock.patch(f'{backfill.__name__}.abs') +# def test_backfill(self, mock_abs, mock_daily): +# register = mock.MagicMock(spec=IRegisterAPI) +# added_events = [] +# register.add_events.side_effect = added_events.append +# mock_daily.parse.side_effect = [ +# self.events, +# self.events[0:2], +# self.events, +# ] +# # This call will get events for a particular identifier during +# # parsing of pre-daily announcements. So we just return the events +# # for 1902.00123. +# mock_daily.scan.return_value = [self.events[0], self.events[2]] +# # Handle a call to list all of the identifiers prior to the first one +# # in daily.log. +# mock_abs.list_all.return_value = \ +# list(set([a.identifier.arxiv_id for a in self.abs[:2]])) +# mock_abs.iter_all.return_value = \ +# list(set([a.identifier.arxiv_id for a in self.abs[:2]])) + +# # Return an AbsData based on the requested identifier. +# mock_abs.get_path.side_effect = lambda b, i: i # Pass ID through. +# mock_abs.parse.side_effect = self._get_abs # Get AbsData by ID. + +# # This is called when parsing the pre-daily records, and gets all of +# # the AbsData of the e-print that was first announced prior to daily. +# mock_abs.parse_versions.return_value = self.abs[0:2] + +# list(backfill.backfill(register, '/daily', '/abs', '/cache', +# self.state_path)) + +# # We expect an ordered series of events that represents both what is +# # directly known from daily.log and what is inferred from the presence +# # of abs files and replacement events in daily.log. 
+# expected = [ +# (EventType.NEW, VersionedIdentifier('1902.00123v1')), +# (EventType.CROSSLIST, VersionedIdentifier('1902.00123v1')), +# (EventType.NEW, VersionedIdentifier('1902.00125v1')), +# (EventType.REPLACED, VersionedIdentifier('1902.00123v2')), +# ] +# for (expected_type, expected_id), event in zip(expected, added_events): +# self.assertEqual(expected_type, event.event_type) +# self.assertEqual(expected_id, event.identifier) + +# with open(os.path.join(self.state_path, 'first.json')) as f: +# first = json.load(f) + +# self.assertEqual(len(first), 2, 'Two entries in first announced index') +# self.assertIn(self.ident, first) +# self.assertIn(self.ident2, first) + +# with open(os.path.join(self.state_path, 'current.json')) as f: +# current = json.load(f) + +# self.assertEqual(len(current), 2, +# 'Two entries in current version index') +# self.assertIn(self.ident, current) +# self.assertEqual(current[self.ident], 2) +# self.assertIn(self.ident2, current) +# self.assertEqual(current[self.ident2], 1) + +# log = Log(self.state_path) +# log_entries = list(log.read_all()) +# self.assertEqual(len(log_entries), len(added_events), +# 'There is a log entry for each event') +# for entry in log_entries: +# self.assertEqual(entry.state, 'SUCCESS', 'All entries are SUCCESS') + +# for event, entry in zip(added_events, log_entries): +# self.assertEqual(event.event_id, entry.event_id, +# 'Log entries are in the same order as events') + +# @mock.patch(f'{backfill.__name__}.content', mock.MagicMock()) +# @mock.patch(f'{backfill.__name__}.daily') +# @mock.patch(f'{backfill.__name__}.abs') +# def test_backfill_with_errors(self, mock_abs, mock_daily): +# register = mock.MagicMock(spec=IRegisterAPI) +# added_events = [] +# register.add_events.side_effect = added_events.append + +# def _parse(path, for_date=None, **kwargs): +# if for_date is not None: +# return self.events[0:2] +# return self.events + +# mock_daily.parse.side_effect = _parse + +# # This call will get events for a 
particular identifier during +# # parsing of pre-daily announcements. So we just return the events +# # for 1902.00123. +# mock_daily.scan.return_value = [self.events[0], self.events[2]] +# # Handle a call to list all of the identifiers prior to the first one +# # in daily.log. +# mock_abs.list_all.return_value = \ +# list(set([a.identifier.arxiv_id for a in self.abs[:2]])) +# mock_abs.iter_all.return_value = \ +# list(set([a.identifier.arxiv_id for a in self.abs[:2]])) +# # Return an AbsData based on the requested identifier. But raise a +# # RuntimeError when handling one of the records! +# raise_an_error = [True] + +# def _get_abs(dpath, identifier, *args, **kwargs): +# if identifier == '1902.00125v1' and raise_an_error: +# raise_an_error.pop() +# raise RuntimeError('') +# for a in self.abs: +# if a.identifier == identifier: +# return a +# raise RuntimeError(f'No such abs: {identifier}') + +# mock_abs.parse.side_effect = _get_abs # Get AbsData by ID. + +# # This is called when parsing the pre-daily records, and gets all of +# # the AbsData of the e-print that was first announced prior to daily. +# mock_abs.parse_versions.return_value = self.abs[0:2] + +# # We gave generated a RuntimeError intentionally... +# with self.assertRaises(RuntimeError): +# list(backfill.backfill(register, '/fo', '/ba', '/bz', +# self.state_path)) + +# # ...and call backfill again to resume. +# list(backfill.backfill(register, '/fo', '/ba', '/bz', self.state_path)) + +# # We expect an ordered series of events that represents both what is +# # directly known from daily.log and what is inferred from the presence +# # of abs files and replacement events in daily.log. 
+# expected = [ +# (EventType.NEW, VersionedIdentifier('1902.00123v1')), +# (EventType.CROSSLIST, VersionedIdentifier('1902.00123v1')), +# (EventType.NEW, VersionedIdentifier('1902.00125v1')), +# (EventType.REPLACED, VersionedIdentifier('1902.00123v2')), +# ] +# for (expected_type, expected_id), event in zip(expected, added_events): +# self.assertEqual(expected_type, event.event_type) +# self.assertEqual(expected_id, event.identifier) + +# with open(os.path.join(self.state_path, 'first.json')) as f: +# first = json.load(f) + +# self.assertEqual(len(first), 2, 'Two entries in first announced index') +# self.assertIn(self.ident, first) +# self.assertIn(self.ident2, first) + +# with open(os.path.join(self.state_path, 'current.json')) as f: +# current = json.load(f) + +# self.assertEqual(len(current), 2, +# 'Two entries in current version index') +# self.assertIn(self.ident, current) +# self.assertEqual(current[self.ident], 2) +# self.assertIn(self.ident2, current) +# self.assertEqual(current[self.ident2], 1) + +# log = Log(self.state_path) +# log_entries = list(log.read_all()) +# self.assertEqual(len(log_entries) - 1, len(added_events), +# 'There is a log entry for each event, plus a' +# 'FAILED entry') +# success_entries = [e for e in log_entries if e.state == 'SUCCESS'] +# self.assertEqual(len(success_entries), len(added_events), +# 'There is one SUCCESS entry per event') +# failed_entries = [e for e in log_entries if e.state == 'FAILED'] +# self.assertEqual(len(failed_entries), 1, 'There is one FAILED entry') + +# for event, entry in zip(added_events, success_entries): +# self.assertEqual(event.event_id, entry.event_id, +# 'Log entries are in the same order as events') + + +# class TestLoadPredailyEvents(TestCase): +# """Load events from before there were events!""" + +# @mock.patch(f'{backfill.__name__}.content', mock.MagicMock()) +# @mock.patch(f'{backfill.__name__}.daily') +# @mock.patch(f'{backfill.__name__}.abs') +# def test_load_new_before_daily(self, 
mock_abs, mock_daily): +# """The first version of an e-print was announced prior to daily.log.""" +# ident = Identifier('1902.00123') +# mock_abs.parse_versions.return_value = [ +# abs.AbsData( +# identifier=VersionedIdentifier('1902.00123v1'), +# submitter=None, +# submitted_date=date(2019, 2, 1), +# announced_month='2019-02', +# updated_date=datetime.now(), +# license=License('http://foo.license'), +# primary_classification=Category('cs.DL'), +# title='foo title before daily.log existed', +# abstract='very abstract', +# authors='Ima N. Author', +# size_kilobytes=42, +# submission_type=EventType.NEW, +# secondary_classification=[ +# Category('cs.IR'), +# Category('cs.WT'), +# ], +# ), +# abs.AbsData( +# identifier=VersionedIdentifier('1902.00123v2'), +# submitter=None, +# submitted_date=date(2019, 2, 9), +# announced_month='2019-02', +# updated_date=datetime.now(), +# license=License('http://foo.license'), +# primary_classification=Category('cs.DL'), +# title='fooooo title after daily.log exists', +# abstract='very abstract', +# authors='Ima N. Author', +# size_kilobytes=42, +# submission_type=EventType.REPLACED, +# secondary_classification=[ +# Category('cs.IR'), +# Category('cs.WT'), +# Category('cs.FO') +# ], +# ) +# ] +# mock_daily.scan.return_value = [ +# daily.EventData( +# arxiv_id=ident, +# event_date=date(2019, 2, 10), +# event_type=EventType.REPLACED, +# version=-1, # Who knows what version this is? 
+# categories=[ +# Category('cs.DL'), +# Category('cs.IR'), +# Category('cs.WT'), +# Category('cs.FO') +# ] +# ) +# ] + +# events = backfill._load_predaily('/foo', '/path', '/ba', ident, {}, {}) +# self.assertEqual(len(events), 1, 'Generates one event') +# self.assertEqual(events[0].event_type, EventType.NEW, +# 'Generates a NEW event') +# self.assertEqual(events[0].version.identifier, +# VersionedIdentifier('1902.00123v1'), +# 'With the first version') +# self.assertEqual(events[0].version.metadata.title, +# 'foo title before daily.log existed', +# 'And the correct title') +# self.assertEqual(events[0].version.metadata.secondary_classification, +# [Category('cs.IR'), Category('cs.WT')], +# 'And the correct cross-list categories') + +# @mock.patch(f'{backfill.__name__}.content', mock.MagicMock()) +# @mock.patch(f'{backfill.__name__}.daily') +# @mock.patch(f'{backfill.__name__}.abs') +# def test_load_new_before_daily_with_cross(self, mock_abs, mock_daily): +# """First version of an e-print in pre-history, with a cross event.""" +# ident = Identifier('1902.00123') +# mock_abs.parse_versions.return_value = [ +# abs.AbsData( +# identifier=VersionedIdentifier('1902.00123v1'), +# submitter=None, +# submitted_date=date(2019, 2, 1), +# announced_month='2019-02', +# updated_date=datetime.now(), +# license=License('http://foo.license'), +# primary_classification=Category('cs.DL'), +# title='foo title before daily.log existed', +# abstract='very abstract', +# authors='Ima N. Author', +# size_kilobytes=42, +# submission_type=EventType.NEW, +# secondary_classification=[ +# Category('cs.IR'), +# Category('cs.WT'), # <- This was added by a cross event! 
+# ], +# ), +# abs.AbsData( +# identifier=VersionedIdentifier('1902.00123v2'), +# submitter=None, +# submitted_date=date(2019, 2, 9), +# announced_month='2019-02', +# updated_date=datetime.now(), +# license=License('http://foo.license'), +# primary_classification=Category('cs.DL'), +# title='fooooo title after daily.log exists', +# abstract='very abstract', +# authors='Ima N. Author', +# size_kilobytes=42, +# submission_type=EventType.REPLACED, +# secondary_classification=[ +# Category('cs.IR'), +# Category('cs.WT'), +# Category('cs.FO') +# ], +# ) +# ] +# mock_daily.scan.return_value = [ +# daily.EventData( +# arxiv_id=ident, +# event_date=date(2019, 2, 9), +# event_type=EventType.CROSSLIST, +# version=-1, # Who knows what version this is? +# categories=[Category('cs.WT')], +# ), +# daily.EventData( +# arxiv_id=ident, +# event_date=date(2019, 2, 10), +# event_type=EventType.REPLACED, +# version=-1, # Who knows what version this is? +# categories=[ +# Category('cs.DL'), +# Category('cs.IR'), +# Category('cs.WT'), +# Category('cs.FO') +# ] +# ) +# ] + +# events = backfill._load_predaily('/foo', '/bar', '/baz', ident, {}, {}) + +# self.assertEqual(len(events), 1, 'Still generates one event') +# self.assertEqual(events[0].version.metadata.secondary_classification, +# [Category('cs.IR')], +# 'But the cross-list category is not included in the' +# ' NEW event for the first version!') + + +# class TestDailyEvents(TestCase): +# """Load daily events!""" + +# @mock.patch(f'{backfill.__name__}.content', mock.MagicMock()) +# @mock.patch(f'{backfill.__name__}.abs') +# def test_load_new(self, mock_abs): +# """Load a NEW event.""" +# ident = Identifier('2302.00123') +# event_datum = daily.EventData( +# arxiv_id=ident, +# event_date=date(2019, 2, 10), +# event_type=EventType.NEW, +# version=1, +# categories=[ +# Category('cs.DL'), +# Category('cs.IR'), +# ] +# ) + +# mock_abs.parse.return_value = abs.AbsData( +# identifier=VersionedIdentifier.from_parts(ident, 1), +# 
submitter=None, +# submitted_date=datetime(2023, 2, 1, 2, 42, 1), +# announced_month='2023-02', +# updated_date=datetime.now(), +# license=License('http://foo.license'), +# primary_classification=Category('cs.DL'), +# title='foo title', +# abstract='very abstract', +# authors='Ima N. Author', +# size_kilobytes=42, +# submission_type=EventType.NEW, +# secondary_classification=[ +# Category('cs.IR'), +# ], +# ) +# event = backfill._load_daily_event('', '', event_datum, {}, {}) + +# self.assertEqual(event.event_type, EventType.NEW, 'Creates NEW event') +# self.assertEqual(event.version.identifier, +# VersionedIdentifier.from_parts(ident, 1), +# 'With the correct identifier') +# self.assertEqual(event.version.metadata.abstract, 'very abstract', +# 'And the correct abstract') +# self.assertEqual( +# event.event_date, +# datetime(2019, 2, 10, 20, 0, 0, 123, tzinfo=backfill.ET), +# 'Event timestamp reflects the announcement day, with microsecond' +# ' based on the incremental part of the identifier to preserve' +# ' order.' +# ) + +# @mock.patch(f'{backfill.__name__}.content', mock.MagicMock()) +# @mock.patch(f'{backfill.__name__}.abs') +# def test_load_cross(self, mock_abs): +# """Load a CROSSLIST event.""" +# ident = Identifier('2302.00123') +# event_datum = daily.EventData( +# arxiv_id=ident, +# event_date=date(2019, 2, 12), +# event_type=EventType.CROSSLIST, +# version=-1, +# categories=[Category('cs.WT')] +# ) + +# mock_abs.parse.return_value = abs.AbsData( +# identifier=VersionedIdentifier.from_parts(ident, 1), +# submitter=None, +# submitted_date=datetime(2023, 2, 1, 2, 42, 1), +# announced_month='2023-02', +# updated_date=datetime.now(), +# license=License('http://foo.license'), +# primary_classification=Category('cs.DL'), +# title='foo title', +# abstract='very abstract', +# authors='Ima N. 
Author', +# size_kilobytes=42, +# submission_type=EventType.NEW, +# secondary_classification=[ +# Category('cs.IR'), Category('cs.WT') +# ], +# ) +# current = {ident: 1} # The current version number. +# event = backfill._load_daily_event('', '', event_datum, current, {}) + +# self.assertEqual(event.event_type, EventType.CROSSLIST, +# 'Creates CROSSLIST event') +# self.assertEqual(event.version.identifier, +# VersionedIdentifier.from_parts(ident, 1), +# 'With the correct identifier') +# self.assertEqual(event.version.metadata.secondary_classification, +# [Category('cs.IR'), Category('cs.WT')], +# 'And the correct cross-list classification') +# self.assertEqual( +# event.event_date, +# datetime(2019, 2, 12, 20, 0, 0, 123, tzinfo=backfill.ET), +# 'Event timestamp reflects the announcement day, with microsecond' +# ' based on the incremental part of the identifier to preserve' +# ' order.' +# ) + +# @mock.patch(f'{backfill.__name__}.content', mock.MagicMock()) +# @mock.patch(f'{backfill.__name__}.abs') +# def test_load_replacement(self, mock_abs): +# """Load a REPLACED event.""" +# ident = Identifier('2302.00123') +# event_datum = daily.EventData( +# arxiv_id=ident, +# event_date=date(2019, 2, 12), +# event_type=EventType.REPLACED, +# version=-1, +# categories=[ +# Category('cs.DL'), +# Category('cs.IR'), +# Category('cs.WT') +# ] +# ) + +# mock_abs.parse.return_value = abs.AbsData( +# identifier=VersionedIdentifier.from_parts(ident, 2), +# submitter=None, +# submitted_date=datetime(2023, 2, 1, 2, 42, 1), +# announced_month='2023-02', +# updated_date=datetime.now(), +# license=License('http://foo.license'), +# primary_classification=Category('cs.DL'), +# title='foo title', +# abstract='very abstract', +# authors='Ima N. Author', +# size_kilobytes=42, +# submission_type=EventType.NEW, +# secondary_classification=[ +# Category('cs.IR'), Category('cs.WT') +# ], +# ) +# current = {ident: 1} # The current version number. 
+# first = {ident: date(2019, 2, 11)} # First announcement date. +# event = backfill._load_daily_event('', '', event_datum, current, first) + +# self.assertEqual(event.event_type, EventType.REPLACED, +# 'Creates REPLACED event') +# self.assertEqual(event.version.identifier, +# VersionedIdentifier.from_parts(ident, 2), +# 'With the correct identifier') +# self.assertEqual(event.version.metadata.secondary_classification, +# [Category('cs.IR'), Category('cs.WT')], +# 'And the correct cross-list classification') +# self.assertEqual( +# event.event_date, +# datetime(2019, 2, 12, 20, 0, 0, 123, tzinfo=backfill.ET), +# 'Event timestamp reflects the announcement day, with microsecond' +# ' based on the incremental part of the identifier to preserve' +# ' order.' +# ) + diff --git a/arxiv/canonical/classic/tests/test_content.py b/arxiv/canonical/classic/tests/test_content.py new file mode 100644 index 0000000..5607508 --- /dev/null +++ b/arxiv/canonical/classic/tests/test_content.py @@ -0,0 +1,470 @@ +"""Tests for :mod:`arxiv.canonical.classic.content`.""" + +import os +from datetime import datetime +from os.path import join +import shutil +import tempfile +from unittest import TestCase, mock + +from pytz import UTC + +from .. import content +from ... 
import domain as D + + +def touch(path): + parent, _ = os.path.split(path) + if not os.path.exists(parent): + os.makedirs(parent) + with open(path, 'wb') as f: + f.write(b'') + + +class TestGetFormats(TestCase): + """Get the dissemination formats for a version.""" + + def setUp(self): + """Make the classic file tree.""" + self.data_path = tempfile.mkdtemp() + self.ori = join(self.data_path, 'orig') + self.ftp = join(self.data_path, 'ftp') + os.makedirs(self.ori) + os.makedirs(self.ftp) + self.cache_path = tempfile.mkdtemp() + self.psc = join(self.cache_path, 'ps_cache') + os.makedirs(self.psc) + + @mock.patch(f'{content.__name__}.RemoteSourceWithHead') + def test_get_v1_single_format(self, mock_RemoteSourceWithHead): + """Get the first version of a multi-version e-print with one format.""" + touch(join(self.ori, 'arxiv', 'papers', '1901', '1901.00123v1.abs')) + touch(join(self.ori, 'arxiv', 'papers', '1901', '1901.00123v1.tar.gz')) + pdf_path = join(self.psc, 'arxiv', 'pdf', '1901', '1901.00123v1.pdf') + touch(pdf_path) + + # HTTP fallback does not yield any additional formats. 
+ mock_remote_source = mock.MagicMock() + mock_remote_source.head.return_value = None + mock_RemoteSourceWithHead.return_value = mock_remote_source + + identifier = D.VersionedIdentifier('1901.00123v1') + + source_type = D.SourceType('') + source_file = content.get_source(self.data_path, identifier) + formats = content.get_formats(self.data_path, self.cache_path, + identifier, source_type, source_file) + + cfs = [o for o in formats] + self.assertEqual(len(cfs), 1) + self.assertEqual(cfs[0].content_type, D.ContentType.pdf) + self.assertFalse(cfs[0].is_gzipped) + self.assertEqual(cfs[0].size_bytes, 0) + self.assertEqual(cfs[0].ref.path, pdf_path) + + @mock.patch(f'{content.__name__}.REMOTE') + def test_get_v1_two_formats(self, mock_remote_source): + """Get the first version of an e-print with two formats.""" + touch(join(self.ori, 'arxiv', 'papers', '1901', '1901.00123v1.abs')) + touch(join(self.ori, 'arxiv', 'papers', '1901', '1901.00123v1.tar.gz')) + pdf_path = join(self.psc, 'arxiv', 'pdf', '1901', '1901.00123v1.pdf') + ps_path = join(self.psc, 'arxiv', 'ps', '1901', '1901.00123v1.ps.gz') + touch(pdf_path) + touch(ps_path) + + # HTTP fallback does not yield any additional formats. 
+ mock_remote_source.head.return_value = None + + identifier = D.VersionedIdentifier('1901.00123v1') + + source_type = D.SourceType('') + source_file = content.get_source(self.data_path, identifier) + formats = content.get_formats(self.data_path, self.cache_path, + identifier, source_type, source_file) + + cfs = [o for o in formats] + self.assertEqual(len(cfs), 2) + self.assertIn(D.ContentType.pdf, [cf.content_type for cf in cfs]) + self.assertIn(D.ContentType.ps, [cf.content_type for cf in cfs]) + for cf in cfs: + self.assertEqual(cf.size_bytes, 0) + if cf.content_type == D.ContentType.pdf: + self.assertFalse(cf.is_gzipped) + self.assertEqual(cf.ref.path, pdf_path) + elif cf.content_type == D.ContentType.ps: + self.assertTrue(cf.is_gzipped) + self.assertEqual(cf.ref.path, ps_path) + + @mock.patch(f'{content.__name__}.REMOTE') + def test_get_v1_with_one_remote(self, mock_remote_source): + """Get formats for the first version with one local format missing.""" + touch(join(self.ori, 'arxiv', 'papers', '1901', '1901.00123v1.abs')) + touch(join(self.ori, 'arxiv', 'papers', '1901', '1901.00123v1.tar.gz')) + pdf_path = join(self.psc, 'arxiv', 'pdf', '1901', '1901.00123v1.pdf') + ps_path = join(self.psc, 'arxiv', 'ps', '1901', '1901.00123v1.ps.gz') + touch(pdf_path) + touch(ps_path) + + # HTTP fallback yields one additional format. 
+ mock_remote_source.head.return_value = D.CanonicalFile( + modified=datetime.now(UTC), + size_bytes=42, + content_type=D.ContentType.dvi, + ref=D.URI('https://arxiv.org/dvi/1901.00123v1'), + filename='1901.00123v1.dvi', + is_gzipped=True + ) + + identifier = D.VersionedIdentifier('1901.00123v1') + + source_type = D.SourceType('') + source_file = content.get_source(self.data_path, identifier) + formats = content.get_formats(self.data_path, self.cache_path, + identifier, source_type, source_file) + + cfs = [o for o in formats] + self.assertEqual(len(cfs), 3) + self.assertIn(D.ContentType.pdf, [cf.content_type for cf in cfs]) + self.assertIn(D.ContentType.ps, [cf.content_type for cf in cfs]) + self.assertIn(D.ContentType.dvi, [cf.content_type for cf in cfs]) + for cf in cfs: + if cf.content_type == D.ContentType.dvi: + self.assertTrue(cf.is_gzipped) + self.assertEqual(cf.ref.path, '/dvi/1901.00123v1') + self.assertEqual(cf.size_bytes, 42) + + @mock.patch(f'{content.__name__}.REMOTE') + def test_get_v1_source_encrypted(self, mock_remote_source): + """Get formats for a source-encrypted version.""" + touch(join(self.ori, 'arxiv', 'papers', '1901', '1901.00123v1.abs')) + touch(join(self.ori, 'arxiv', 'papers', '1901', '1901.00123v1.tar.gz')) + pdf_path = join(self.psc, 'arxiv', 'pdf', '1901', '1901.00123v1.pdf') + ps_path = join(self.psc, 'arxiv', 'ps', '1901', '1901.00123v1.ps.gz') + touch(pdf_path) + touch(ps_path) + + # HTTP fallback yields no additional formats. + mock_remote_source.head.return_value = None + + identifier = D.VersionedIdentifier('1901.00123v1') + + source_type = D.SourceType('IS') + source_file = content.get_source(self.data_path, identifier) + formats = content.get_formats(self.data_path, self.cache_path, + identifier, source_type, source_file) + + cfs = [o for o in formats] + # Finds postscript and pdf formats. 
+ self.assertEqual(len(cfs), 2) + self.assertIn(D.ContentType.pdf, [cf.content_type for cf in cfs]) + self.assertIn(D.ContentType.ps, [cf.content_type for cf in cfs]) + + @mock.patch(f'{content.__name__}.REMOTE') + def test_get_v1_ignore(self, mock_remote_source): + """Get formats for an ignore-type version.""" + touch(join(self.ori, 'arxiv', 'papers', '1901', '1901.00123v1.abs')) + touch(join(self.ori, 'arxiv', 'papers', '1901', '1901.00123v1.tar.gz')) + pdf_path = join(self.psc, 'arxiv', 'pdf', '1901', '1901.00123v1.pdf') + ps_path = join(self.psc, 'arxiv', 'ps', '1901', '1901.00123v1.ps.gz') + touch(pdf_path) + touch(ps_path) + + # HTTP fallback yields no additional formats. + mock_remote_source.head.return_value = None + + identifier = D.VersionedIdentifier('1901.00123v1') + + source_type = D.SourceType('I') + source_file = content.get_source(self.data_path, identifier) + formats = content.get_formats(self.data_path, self.cache_path, + identifier, source_type, source_file) + + # Finds no formats. + self.assertEqual(len([o for o in formats]), 0) + + @mock.patch(f'{content.__name__}.REMOTE') + def test_get_v1_ps_only(self, mock_remote_source): + """Get formats for a ps-only version.""" + touch(join(self.ori, 'arxiv', 'papers', '1901', '1901.00123v1.abs')) + touch(join(self.ori, 'arxiv', 'papers', '1901', '1901.00123v1.tar.gz')) + pdf_path = join(self.psc, 'arxiv', 'pdf', '1901', '1901.00123v1.pdf') + ps_path = join(self.psc, 'arxiv', 'ps', '1901', '1901.00123v1.ps.gz') + touch(pdf_path) + touch(ps_path) + + # HTTP fallback yields no additional formats. + mock_remote_source.head.return_value = None + + identifier = D.VersionedIdentifier('1901.00123v1') + + source_type = D.SourceType('P') + source_file = content.get_source(self.data_path, identifier) + formats = content.get_formats(self.data_path, self.cache_path, + identifier, source_type, source_file) + + cfs = [o for o in formats] + # Finds postscript and pdf formats. 
+ self.assertEqual(len(cfs), 2) + self.assertIn(D.ContentType.pdf, [cf.content_type for cf in cfs]) + self.assertIn(D.ContentType.ps, [cf.content_type for cf in cfs]) + + @mock.patch(f'{content.__name__}.REMOTE') + def test_get_v1_pdflatex(self, mock_remote_source): + """Get formats for a pdflatex version.""" + touch(join(self.ori, 'arxiv', 'papers', '1901', '1901.00123v1.abs')) + touch(join(self.ori, 'arxiv', 'papers', '1901', '1901.00123v1.tar.gz')) + pdf_path = join(self.psc, 'arxiv', 'pdf', '1901', '1901.00123v1.pdf') + ps_path = join(self.psc, 'arxiv', 'ps', '1901', '1901.00123v1.ps.gz') + touch(pdf_path) + touch(ps_path) + + # HTTP fallback yields no additional formats. + mock_remote_source.head.return_value = None + + identifier = D.VersionedIdentifier('1901.00123v1') + + source_type = D.SourceType('D') + source_file = content.get_source(self.data_path, identifier) + formats = content.get_formats(self.data_path, self.cache_path, + identifier, source_type, source_file) + + cfs = [o for o in formats] + # Finds pdf format. + self.assertEqual(len(cfs), 1) + self.assertIn(D.ContentType.pdf, [cf.content_type for cf in cfs]) + + @mock.patch(f'{content.__name__}.REMOTE') + def test_get_v1_pdf_only(self, mock_remote_source): + """Get formats for a pdfladtex version.""" + touch(join(self.ori, 'arxiv', 'papers', '1901', '1901.00123v1.abs')) + touch(join(self.ori, 'arxiv', 'papers', '1901', '1901.00123v1.tar.gz')) + pdf_path = join(self.psc, 'arxiv', 'pdf', '1901', '1901.00123v1.pdf') + ps_path = join(self.psc, 'arxiv', 'ps', '1901', '1901.00123v1.ps.gz') + touch(pdf_path) + touch(ps_path) + + # HTTP fallback yields no additional formats. 
+ mock_remote_source.head.return_value = None + + identifier = D.VersionedIdentifier('1901.00123v1') + + source_type = D.SourceType('F') + source_file = content.get_source(self.data_path, identifier) + formats = content.get_formats(self.data_path, self.cache_path, + identifier, source_type, source_file) + + cfs = [o for o in formats] + # Finds pdf format. + self.assertEqual(len(cfs), 1) + self.assertIn(D.ContentType.pdf, [cf.content_type for cf in cfs]) + + @mock.patch(f'{content.__name__}.REMOTE') + def test_get_v1_html(self, mock_remote_source): + """Get formats for a multi-file HTML version.""" + touch(join(self.ori, 'arxiv', 'papers', '1901', '1901.00123v1.abs')) + touch(join(self.ori, 'arxiv', 'papers', '1901', '1901.00123v1.tar.gz')) + html_path = join(self.psc, 'arxiv', 'html', '1901', '1901.00123v1', + 'fooPaper.html') + + touch(html_path) + + # HTTP fallback yields no additional formats. + mock_remote_source.head.return_value = None + + identifier = D.VersionedIdentifier('1901.00123v1') + + source_type = D.SourceType('H') + source_file = content.get_source(self.data_path, identifier) + formats = content.get_formats(self.data_path, self.cache_path, + identifier, source_type, source_file) + + cfs = [o for o in formats] + # Finds html format. + self.assertEqual(len(cfs), 1) + self.assertIn(D.ContentType.html, [cf.content_type for cf in cfs]) + + # TODO: implement this test! 
+ @mock.patch(f'{content.__name__}.REMOTE') + def test_get_v1_docx(self, mock_remote_source): + """Get formats for a DOCX version.""" + source_type = D.SourceType('H') + + +class TestGetSource(TestCase): + """Get the source for a version.""" + + def setUp(self): + """Make the classic file tree.""" + self.data_path = tempfile.mkdtemp() + self.ori = join(self.data_path, 'orig') + self.ftp = join(self.data_path, 'ftp') + os.makedirs(self.ori) + os.makedirs(self.ftp) + self.cache_path = tempfile.mkdtemp() + self.psc = join(self.cache_path, 'ps_cache') + os.makedirs(self.psc) + + def test_get_v1_of_multiple(self): + """Get the first labeled version of a multi-version e-print.""" + touch(join(self.ori, 'arxiv', 'papers', '1901', '1901.00123v1.abs')) + path = join(self.ori, 'arxiv', 'papers', '1901', '1901.00123v1.tar.gz') + touch(path) + identifier = D.VersionedIdentifier('1901.00123v1') + + source = content.get_source(self.data_path, identifier) + + self.assertEqual(source.content_type, D.ContentType.tar) + self.assertTrue(source.is_gzipped) + self.assertEqual(source.ref.path, path) + + def test_get_v1_of_multiple_without_extension(self): + """Get the first version, lacking an extension.""" + touch(join(self.ori, 'arxiv', 'papers', '1901', '1901.00123v1.abs')) + path = join(self.ori, 'arxiv', 'papers', '1901', '1901.00123v1.gz') + touch(path) + identifier = D.VersionedIdentifier('1901.00123v1') + + source = content.get_source(self.data_path, identifier) + + self.assertEqual(source.content_type, D.ContentType.tex, + 'We assume that it is a TeX source.') + self.assertTrue(source.is_gzipped) + self.assertEqual(source.ref.path, path) + + def test_get_v1_of_multiple_old_style(self): + """Get the first labeled version of a multi-version e-print.""" + touch(join(self.ori, 'math', 'papers', '9501', '95010123v1.abs')) + path = join(self.ori, 'math', 'papers', '9501', '95010123v1.tar.gz') + touch(path) + identifier = D.VersionedIdentifier('math/95010123v1') + + source = 
content.get_source(self.data_path, identifier) + + self.assertEqual(source.content_type, D.ContentType.tar) + self.assertTrue(source.is_gzipped) + self.assertEqual(source.ref.path, path) + + def test_get_v3_the_latest(self): + """Get the third version, the most recent.""" + touch(join(self.ori, 'arxiv', 'papers', '1901', '1901.00123v1.abs')) + touch(join(self.ori, 'arxiv', 'papers', '1901', '1901.00123v1.tar.gz')) + touch(join(self.ori, 'arxiv', 'papers', '1901', '1901.00123v2.abs')) + touch(join(self.ori, 'arxiv', 'papers', '1901', '1901.00123v2.ps.gz')) + + touch(join(self.ftp, 'arxiv', 'papers', '1901', '1901.00123.abs')) + path = join(self.ftp, 'arxiv', 'papers', '1901', '1901.00123.tar.gz') + touch(path) + + identifier = D.VersionedIdentifier('1901.00123v3') + + source = content.get_source(self.data_path, identifier) + + self.assertEqual(source.content_type, D.ContentType.tar) + self.assertTrue(source.is_gzipped) + self.assertEqual(source.ref.path, path) + + def test_get_v3_the_latest_old_style(self): + """Get the third version, the most recent.""" + touch(join(self.ori, 'math', 'papers', '9501', '95010123v1.abs')) + touch(join(self.ori, 'math', 'papers', '9501', '95010123v1.tar.gz')) + touch(join(self.ori, 'math', 'papers', '9501', '95010123v2.abs')) + touch(join(self.ori, 'math', 'papers', '9501', '95010123v2.tar.gz')) + + touch(join(self.ftp, 'math', 'papers', '9501', '95010123.abs')) + path = join(self.ftp, 'math', 'papers', '9501', '95010123.tar.gz') + touch(path) + + identifier = D.VersionedIdentifier('math/95010123v3') + + source = content.get_source(self.data_path, identifier) + + self.assertEqual(source.content_type, D.ContentType.tar) + self.assertTrue(source.is_gzipped) + self.assertEqual(source.ref.path, path) + + def test_get_v1_the_only(self): + """Get the first and only version.""" + touch(join(self.ftp, 'arxiv', 'papers', '1901', '1901.00123.abs')) + path = join(self.ftp, 'arxiv', 'papers', '1901', '1901.00123.tar.gz') + touch(path) + + 
identifier = D.VersionedIdentifier('1901.00123v1') + + source = content.get_source(self.data_path, identifier) + + self.assertEqual(source.content_type, D.ContentType.tar) + self.assertTrue(source.is_gzipped) + self.assertEqual(source.ref.path, path) + + def test_get_v1_the_only_old_style(self): + """Get the first and only version.""" + touch(join(self.ftp, 'math', 'papers', '9501', '95010123.abs')) + path = join(self.ftp, 'math', 'papers', '9501', '95010123.tar.gz') + touch(path) + + identifier = D.VersionedIdentifier('math/95010123v1') + + source = content.get_source(self.data_path, identifier) + + self.assertEqual(source.content_type, D.ContentType.tar) + self.assertTrue(source.is_gzipped) + self.assertEqual(source.ref.path, path) + + def test_get_v2_nonexistant(self): + """Get a version that does not exist.""" + touch(join(self.ftp, 'arxiv', 'papers', '1901', '1901.00123.abs')) + touch(join(self.ftp, 'arxiv', 'papers', '1901', '1901.00123.tar.gz')) + + with self.assertRaises(IOError): + content.get_source(self.data_path, + D.VersionedIdentifier('1901.00123v2')) + + def test_get_v2_nonexistant_old_style(self): + """Get a version that does not exist.""" + touch(join(self.ftp, 'math', 'papers', '9501', '95010123.abs')) + touch(join(self.ftp, 'math', 'papers', '9501', '95010123.tar.gz')) + + with self.assertRaises(IOError): + content.get_source(self.data_path, + D.VersionedIdentifier('math/95010123v2')) + + def test_get_v3_nonexistant(self): + """Get a version that does not exist.""" + touch(join(self.ori, 'arxiv', 'papers', '1901', '1901.00123v1.abs')) + touch(join(self.ori, 'arxiv', 'papers', '1901', '1901.00123v1.ps.gz')) + touch(join(self.ftp, 'arxiv', 'papers', '1901', '1901.00123.abs')) + touch(join(self.ftp, 'arxiv', 'papers', '1901', '1901.00123.tar.gz')) + + with self.assertRaises(IOError): + content.get_source(self.data_path, + D.VersionedIdentifier('1901.00123v3')) + + def test_get_v3_nonexistant_old_style(self): + """Get a version that does not 
exist.""" + touch(join(self.ori, 'math', 'papers', '9501', '95010123v1.abs')) + touch(join(self.ori, 'math', 'papers', '9501', '95010123v1.ps.gz')) + touch(join(self.ftp, 'math', 'papers', '9501', '95010123.abs')) + touch(join(self.ftp, 'math', 'papers', '9501', '95010123.tar.gz')) + + with self.assertRaises(IOError): + content.get_source(self.data_path, + D.VersionedIdentifier('math/95010123v3')) + + def tearDown(self): + shutil.rmtree(self.data_path) + shutil.rmtree(self.cache_path) + + + +# class TestGetRemoteContent(TestCase): +# """Test getting content from arxiv.org.""" + +# def test_get_via_http(self): +# """Get metadata about a PDF via HTTP.""" +# cf = content._get_via_http(D.VersionedIdentifier('0801.1021v2'), +# D.ContentType.pdf) +# self.assertEqual(cf.size_bytes, 237187) +# self.assertEqual(cf.content_type, D.ContentType.pdf) +# self.assertTrue(cf.filename.endswith(D.ContentType.pdf.ext)) +# self.assertEqual(cf.ref, +# D.URI('https://arxiv.org/pdf/0801.1021v2.pdf')) diff --git a/arxiv/canonical/serialize/classic/tests/test_daily.py b/arxiv/canonical/classic/tests/test_daily.py similarity index 77% rename from arxiv/canonical/serialize/classic/tests/test_daily.py rename to arxiv/canonical/classic/tests/test_daily.py index 2aa4f8f..dde50fc 100644 --- a/arxiv/canonical/serialize/classic/tests/test_daily.py +++ b/arxiv/canonical/classic/tests/test_daily.py @@ -1,10 +1,11 @@ """Tests for :mod:`.serialize.classic.daily`.""" -from unittest import TestCase import os +from datetime import date +from unittest import TestCase from .. 
import daily -from ....domain import Event +from ...domain import Event, EventType sample_data = """ 980302|gr-qc|9802067-9802072|hep-th9712213 hep-th9802173 physics.class-ph9802047|9708027 @@ -29,9 +30,9 @@ def test_parse_oldstyle_line(self): events = [e for e in daily.DailyLogParser().parse_line( "980302|hep-th|9802196-9802204|cond-mat.mes-hall9802266 cond-mat.mes-hall9802267 cond-mat.mes-hall9802290 hep-ph9802436|9709125 9712213 gr-qc9708027 hep-ph9708203" )] - new = [e for e in events if e.event_type is Event.Type.NEW] - cross = [e for e in events if e.event_type is Event.Type.CROSSLIST] - replaced = [e for e in events if e.event_type is Event.Type.REPLACED] + new = [e for e in events if e.event_type is EventType.NEW] + cross = [e for e in events if e.event_type is EventType.CROSSLIST] + replaced = [e for e in events if e.event_type is EventType.REPLACED] self.assertEqual(len(new), 9) for event in new: @@ -60,9 +61,9 @@ def test_parse_oldstyle_line_new_only(self): events = [e for e in daily.DailyLogParser().parse_line( "980302|hep-lat|9802036-9802038||" )] - new = [e for e in events if e.event_type is Event.Type.NEW] - cross = [e for e in events if e.event_type is Event.Type.CROSSLIST] - replaced = [e for e in events if e.event_type is Event.Type.REPLACED] + new = [e for e in events if e.event_type is EventType.NEW] + cross = [e for e in events if e.event_type is EventType.CROSSLIST] + replaced = [e for e in events if e.event_type is EventType.REPLACED] self.assertEqual(len(new), (9802038 - 9802036) + 1) self.assertEqual(len(cross), 0) @@ -73,9 +74,9 @@ def test_parse_oldstyle_new_and_cross(self): events = [e for e in daily.DailyLogParser().parse_line( "980302|hep-ex|9802024|hep-ph9802408 physics.ins-det9802015|" )] - new = [e for e in events if e.event_type is Event.Type.NEW] - cross = [e for e in events if e.event_type is Event.Type.CROSSLIST] - replaced = [e for e in events if e.event_type is Event.Type.REPLACED] + new = [e for e in events if e.event_type is 
EventType.NEW] + cross = [e for e in events if e.event_type is EventType.CROSSLIST] + replaced = [e for e in events if e.event_type is EventType.REPLACED] self.assertEqual(len(new), 1) self.assertEqual(len(cross), 2) @@ -89,12 +90,12 @@ def test_parse_newstyle_line(self): with open(os.path.join(DATA, 'new.daily.log')) as f: lines = [line for line in f] events = [e for e in parser.parse_line(lines[0])] - new = [e for e in events if e.event_type is Event.Type.NEW] - cross = [e for e in events if e.event_type is Event.Type.CROSSLIST] - replaced = [e for e in events if e.event_type is Event.Type.REPLACED] + new = [e for e in events if e.event_type is EventType.NEW] + cross = [e for e in events if e.event_type is EventType.CROSSLIST] + replaced = [e for e in events if e.event_type is EventType.REPLACED] self.assertEqual(len(new), 530) self.assertEqual(len(cross), 15) - self.assertEqual(len(replaced), 406) + self.assertEqual(len(replaced), 317) class TestParse(TestCase): @@ -110,6 +111,18 @@ def test_whole_file(self): events = [e for e in iterable] self.assertEqual(len(events), 1882, 'Reads 1,882 events from the log.') + def test_for_date(self): + """Parse only events for date 2019-04-12.""" + parser = daily.DailyLogParser() + iterable = parser.parse(os.path.join(DATA, 'new.daily.log'), + for_date=date(2019, 4, 12)) + + self.assertTrue(hasattr(iterable, '__iter__')) + + events = [e for e in iterable] + self.assertEqual(len(events), 951, 'Reads 951 events from the log.') + + class TestWeirdEdgeCase(TestCase): """ @@ -121,9 +134,9 @@ def test_weird_line(self): line = "991210|nlin-sys||cond-mat.mes-hall9912038 cond-mat.stat-mech9912081 cond-mat.stat-mech9912110 hep-th9908090 math.SG9912021 quant-ph9912007|quant-ph9902015 quant-ph9902016 9704019.0chao-dyn 9902003.0chao-dyn 9904021.0chao-dyn 9907001.0chao-dyn 9912003.4solv-int cond-mat.stat-mech9908480 cond-mat.stat-mech9911291" events = [e for e in daily.DailyLogParser().parse_line(line)] - new = [e for e in events if 
e.event_type is Event.Type.NEW] - cross = [e for e in events if e.event_type is Event.Type.CROSSLIST] - replaced = [e for e in events if e.event_type is Event.Type.REPLACED] + new = [e for e in events if e.event_type is EventType.NEW] + cross = [e for e in events if e.event_type is EventType.CROSSLIST] + replaced = [e for e in events if e.event_type is EventType.REPLACED] self.assertEqual(len(new), 0) self.assertEqual(len(cross), 6) diff --git a/arxiv/canonical/classic/tests/test_serialize_classic.py b/arxiv/canonical/classic/tests/test_serialize_classic.py new file mode 100644 index 0000000..8fb4750 --- /dev/null +++ b/arxiv/canonical/classic/tests/test_serialize_classic.py @@ -0,0 +1,44 @@ +"""Tests for :mod:`.serialize.classic`.""" + +from unittest import TestCase +import os +import json +from pprint import pprint + +import jsonschema + +from ... import serialize +from ... import classic + +DATA = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data') +ABS_ROOT = os.path.join(DATA, 'orig') + +ALL_ABS = [os.path.join(base, fname) + for base, dirs, fnames in os.walk(ABS_ROOT) + for fname in fnames if fname.endswith('.abs')] + + +# class TestClassicDeserialize(TestCase): +# """Test deserialization of the classic abs format.""" + +# SCHEMA_PATH = os.path.abspath('schema/resources') + +# def setUp(self): +# """Get a JSON Schema reference resolver.""" +# resolver_path = 'file://%s/' % self.SCHEMA_PATH +# self.resolver = jsonschema.RefResolver(resolver_path, None) + +# def test_parse(self): +# """Can parse and reserialize classic abs records.""" +# with open(os.path.join(self.SCHEMA_PATH, 'Abs.json')) as f: +# schema = json.load(f) + +# for abs in ALL_ABS: +# self.assertIsNone( +# jsonschema.validate( +# json.loads(serialize.dumps(classic.abs.parse(abs))), +# schema, +# resolver=self.resolver +# ), +# f"Abs file {abs} is parsed successfully" +# ) diff --git a/arxiv/canonical/classic/util.py b/arxiv/canonical/classic/util.py new file mode 100644 index 
0000000..563f599 --- /dev/null +++ b/arxiv/canonical/classic/util.py @@ -0,0 +1,73 @@ + +import json +import os +from datetime import date +from typing import Optional + +import pickle + +from arxiv.util.serialize import ISO8601JSONEncoder, ISO8601JSONDecoder +from ..serialize import CanonicalEncoder, CanonicalDecoder + + +class _Persistent: + + encoder = ISO8601JSONEncoder + decoder = ISO8601JSONDecoder + + def save(self, path: Optional[str] = None) -> None: + if path is None: + path = self._path # type: ignore ; pylint: disable=no-member + with open(path, 'w') as f: + json.dump(self, f, cls=self.encoder) + + +class PersistentIndex(dict, _Persistent): + """Persistent lookup with JSON serialization.""" + + encoder = ISO8601JSONEncoder + decoder = ISO8601JSONDecoder + + def load(self, path: str) -> None: + self._path = path + if not os.path.exists(path): + with open(path, 'w') as f: + json.dump({}, f, cls=self.encoder) + with open(path, 'r') as f: + self.update(json.load(f, cls=self.decoder)) + + +class PersistentList(list, _Persistent): + """Persistent list with JSON serialization.""" + + def load(self, path: str) -> None: + self._path = path + if not os.path.exists(path): + with open(path, 'w') as f: + json.dump([], f, cls=self.encoder) + with open(path, 'r') as f: + self.extend(json.load(f, cls=self.decoder)) + + +class PersistentMultifileIndex(dict): + encoder = ISO8601JSONEncoder + decoder = ISO8601JSONDecoder + + def load(self, path: str) -> None: + self._path = path + if not os.path.exists(self._path): + os.makedirs(self._path) + for fname in os.listdir(self._path): + if not fname.startswith('_'): + continue + with open(os.path.join(self._path, fname), 'rb') as f: + key = json.loads(fname[1:], cls=self.decoder) + self[key] = pickle.load(f) + + def save(self, path: Optional[str] = None) -> None: + if path is None: + path = self._path # type: ignore ; pylint: disable=no-member + for key, value in self.items(): + fname = f'_{json.dumps(key, cls=self.encoder)}' + 
with open(os.path.join(self._path, fname), 'wb') as f: + pickle.dump(value, f) diff --git a/arxiv/canonical/core.py b/arxiv/canonical/core.py new file mode 100644 index 0000000..e74f5c6 --- /dev/null +++ b/arxiv/canonical/core.py @@ -0,0 +1,277 @@ +"""Core interfaces for the canonical record.""" + +import io +import datetime +from typing import Any, Callable, Dict, IO, Iterable, List, Sequence, Tuple, Type, \ + TypeVar, Union + +from typing_extensions import Protocol + +from . import domain as D +from . import integrity as I +from . import record as R +from .manifest import Manifest + + +class IEventStream(Protocol): + """Interface for the canonical event stream.""" + + def emit(self, event: D.Event) -> None: + """ + Emit an :class:`Event` on the stream. + + Parameters + ---------- + event : :class:`Event` + + """ + + def listen(self, on_event: Callable[[D.Event], None]) -> None: + """ + Listen for :class:`Event`s on the stream. + + Parameters + ---------- + on_event : callable + This object will be called for every event that is received. It + should accept a single argument, an :class:`.Event`, and is + expected to return ``None``. + + """ + + +class ICanonicalSource(Protocol): + """Interface for source services, used to dereference URIs.""" + + def can_resolve(self, uri: D.URI) -> bool: + """ + Indicate whether or not the implementation can resolve an URI. + + Parameters + ---------- + uri : :class:`.D.URI` + + Returns + ------- + bool + + """ + + def load(self, key: D.URI) -> IO[bytes]: # pylint: disable=unused-argument; this is a stub. + """ + Make an IO that waits to load from the record until it is read(). + + Parameters + ---------- + key : :class:`D.URI` + + Returns + ------- + IO + Yields bytes when read. This may be a lazy IO object, so that + reading is deferred until the latest possible time. + + """ + + +class IStorableEntry(Protocol): + """ + Minimal interface for a bitstream interface that can be stored. 
+ + Services that implement :class:`.ICanonicalStorage` can assume that the + attributes of this interface are available on objects passed for storing. + """ + + name: str + """Name of the entry.""" # pylint: disable=pointless-string-statement; this is a docstring. + + @property + def checksum(self) -> str: + """URL-safe b64-encoded md5 hash.""" + + @property + def record(self) -> R.RecordEntry: + """Reference to a :class:`.RecordEntry`.""" + + def update_checksum(self) -> None: + """Update the integrity checksum for this entry.""" + + +class IManifestStorage(Protocol): + """ + Manifest protocol. + + This could conceivably be stored separately from the canonical record + content, so it is defined separately. + """ + + def store_manifest(self, key: D.Key, manifest: Manifest) -> None: # pylint: disable=unused-argument; this is a stub. + """ + Store an integrity manifest. + + Parameters + ---------- + key : :class:`.Key` + Key used to identify ``manifest`` in storage. + manifest : :class:`.Manifest` + The manifest record to store. + + """ + + def load_manifest(self, key: D.Key) -> Manifest: # pylint: disable=unused-argument; this is a stub. + """ + Load an integrity manifest. + + Parameters + ---------- + key : :class:`.Key` + Key used to identify ``manifest`` in storage. + + Returns + ------- + :class:`.Manifest` + + """ + + +_I = TypeVar('_I', I.IntegrityEntry, I.IntegrityMetadata, I.IntegrityListing, + covariant=True) + + +class ICanonicalStorage(ICanonicalSource, IManifestStorage, Protocol): + """Interface for services that store the canonical record.""" + + def list_subkeys(self, key: D.URI) -> List[str]: # pylint: disable=unused-argument; this is a stub. + """ + List all of the subkeys (direct descendants) of ``key`` in the record. + + Parameters + ---------- + key : :class:`.URI` + + Returns + ------- + list + Items are the relative names of the descendants of ``key``. For + filesystem-based storage, this may be equivalent to ``os.listdir``. 
+ + """ + + def store_entry(self, ri: IStorableEntry) -> None: # pylint: disable=unused-argument; this is a stub. + """ + Store a bitstream entry in the record. + + This method MUST decompress the content of the entry if it is gzipped + (as is sometimes the case in the classic system) and update the + ``CanonicalFile`` (``ri.record.stream.domain``). + + Parameters + ---------- + ri : :class:`.IStorableEntry` + A storable bitstream. + + """ + + def load_entry(self, key: D.URI) -> Tuple[R.RecordStream, str]: + """ + Load a bitstream entry. + + Parameters + ---------- + key : :class:`.URI` + Key that identifies the bitsream in the record. + + Returns + ------- + :class:`.RecordStream` + The bitstream resource. + str + Checksum of the bitstream (URL-safe base64-encoded md5 hash).ß + + """ + + +Year = int +"""Years are represented as four-digit integers.""" + +Month = int +"""Months are represented as integers.""" + +YearMonth = Tuple[Year, Month] +"""A month in a particular year is represented as a 2-tuple of integers.""" + +Selector = Union[Year, YearMonth, datetime.date] +"""A selector can refer to a year, month, or a specific date.""" + +_ID = Union[D.VersionedIdentifier, D.Identifier] + + +class IRegisterAPI(Protocol): + """Interface for the canonical register API.""" + + def add_events(self, *events: D.Event) -> None: + """Add new events to the register.""" + + def load_version(self, identifier: D.VersionedIdentifier) -> D.Version: # pylint: disable=unused-argument; this is a stub. + """Load an e-print :class:`.Version` from the record.""" + + def load_eprint(self, identifier: D.Identifier) -> D.EPrint: # pylint: disable=unused-argument; this is a stub. + """Load an :class:`.EPrint` from the record.""" + + def load_history(self, identifier: _ID) -> Iterable[D.EventSummary]: # pylint: disable=unused-argument; this is a stub. 
+ """Load the event history of an :class:`.EPrint`.""" + + def load_event(self, identifier: str) -> D.Event: # pylint: disable=unused-argument; this is a stub. + """Load an :class:`.Event` by identifier.""" + + def load_events(self, selector: Selector) -> Tuple[Iterable[D.Event], int]: # pylint: disable=unused-argument; this is a stub. + """Load all :class:`.Event`s for a day, month, or year.""" + + def load_listing(self, date: datetime.date, # pylint: disable=unused-argument; this is a stub. + shard: str = D.Event.get_default_shard()) -> D.Listing: # pylint: disable=no-member, unused-argument + """Load a :class:`.Listing` for a particulate date.""" + + +# TODO: implement me! +class IPreservationAPI(Protocol): + """Interface for the daily preservation record API.""" + + def add_events(self, *events: D.Event) -> None: + """Add new events to the preservation record.""" + + def load_package(self, date: datetime.date) -> I.IntegrityEntry: + """Load the preservation package for a particular date.""" + + +# TODO: consider a semantically more meaningful exception for failure to +# dereference the URI. +def dereference(sources: Sequence[ICanonicalSource], uri: D.URI) -> IO[bytes]: + """ + Dereference an URI using a set of available sources. + + Sources are checked one at a time for ability to resolve the URI. When one + is found, the URI is loaded. + + Parameters + ---------- + sources : sequence + Items are content sources that should conform to + :class:`.ICanonicalSource`. They will be tried in the order provided. + uri : :class:`.URI` + URI to dereference. + + Returns + ------- + io + BytesIO object. + + Raises + ------ + :class:`RuntimeError` + Raised when the URI cannot be resolved. 
+ + """ + for source in sources: + if source.can_resolve(uri): + return source.load(uri) + raise RuntimeError(f'Cannot resolve URI: {uri}') \ No newline at end of file diff --git a/arxiv/canonical/domain/__init__.py b/arxiv/canonical/domain/__init__.py index d603323..7d8005f 100644 --- a/arxiv/canonical/domain/__init__.py +++ b/arxiv/canonical/domain/__init__.py @@ -1,18 +1,73 @@ -"""Core data structures and concepts.""" +""" +Core data structures and concepts used to describe arXiv e-prints. +The structures in this module capture the central ideas and semantics of +e-prints, their versions, and their content. It does not address things like +how the canonical record is encoded in a key-value system, nor mechanisms for +verifying completeness or integrity. +""" + +from arxiv.taxonomy import Category + +from .base import CanonicalBase, CanonicalBaseCollection +from .block import AllEPrints, EPrintYear, EPrintMonth, EPrintDay +from .content import ContentType, SourceType, available_formats_by_ext, \ + list_source_extensions +from .eprint import EPrint +from .file import CanonicalFile, URI, Key +from .identifier import Identifier, InvalidIdentifier, VersionedIdentifier from .license import License +from .listing import Listing, ListingDay, ListingMonth, ListingYear, \ + AllListings, ListingIdentifier from .person import Person -from .event import Event -from .identifier import Identifier, VersionedIdentifier -from .file import File -from .eprint import EPrint, VersionReference -from .block import MonthlyBlock -from .record import CanonicalRecord -from .listing import Listing - - -domain_classes = [ - obj for obj in locals().values() - if type(obj) is type and tuple in obj.__bases__ and hasattr(obj, '_fields') -] +from .version import Version, VersionReference, Metadata, Event, EventType, \ + EventSummary, EventIdentifier + +domain_classes = ( + AllEPrints, + AllListings, + CanonicalFile, + Category, + CanonicalBase, + CanonicalBaseCollection, + ContentType, + EPrint, 
+ EPrintDay, + EPrintMonth, + EPrintYear, + Event, + EventIdentifier, + EventSummary, + EventType, + Identifier, + InvalidIdentifier, + Key, + License, + Listing, + ListingDay, + ListingIdentifier, + ListingYear, + ListingMonth, + Metadata, + Person, + SourceType, + URI, + Version, + VersionedIdentifier, + VersionReference, +) + + +__all__ = [cls.__name__ for cls in domain_classes] """All of the core domain classes in this package.""" + +__all__ += ['available_formats_by_ext', 'list_source_extensions'] + + +class Canon(CanonicalBase): + """Represents the canonical record as a whole from a domain perspective.""" + + def __init__(self, eprints: AllEPrints, listings: AllListings) -> None: + """The canon is comprised of all e-prints and all listings.""" + self.eprints = eprints + self.listings = listings \ No newline at end of file diff --git a/arxiv/canonical/domain/base.py b/arxiv/canonical/domain/base.py new file mode 100644 index 0000000..bbbf437 --- /dev/null +++ b/arxiv/canonical/domain/base.py @@ -0,0 +1,30 @@ +"""Base classes/types for the domain.""" + +from typing import (Any, Callable, Dict, Iterable, Set, Type, TypeVar, Union, + cast) +from typing_extensions import Protocol, runtime_checkable + + +class CanonicalBase: + """Base class for all canonical domain classes.""" + + exclude_from_comparison: Set[str] = set() + """Names of attributes not to be used in __eq__ comparisons.""" + + def __eq__(self, other: Any) -> bool: + """Compare this domain object to another domain object.""" + if not isinstance(other, CanonicalBase): + return False + keys = ((set(self.__class__.__annotations__.keys()) # pylint: disable=no-member ; subclasses have annotations. 
+ | set(other.__class__.__annotations__.keys())) + - self.exclude_from_comparison) + try: + for key in keys: + assert getattr(self, key) == getattr(other, key) + except AssertionError: + return False + return True + + +class CanonicalBaseCollection(CanonicalBase): + """Base class for domain classes that act as collections.""" \ No newline at end of file diff --git a/arxiv/canonical/domain/block.py b/arxiv/canonical/domain/block.py index 4228cc2..7996044 100644 --- a/arxiv/canonical/domain/block.py +++ b/arxiv/canonical/domain/block.py @@ -1,68 +1,67 @@ -"""Provides core domain concepts and logic for the monly listing block.""" +"""Structures for organizing e-prints into periods of time.""" -from typing import NamedTuple, List, Mapping, Optional -from datetime import date, datetime -from collections import OrderedDict +import collections +import datetime +from typing import NamedTuple, List, Mapping, Optional, Dict, Iterator, Tuple +from typing_extensions import Protocol + +from .base import CanonicalBase from .eprint import EPrint -from .event import Event +from .version import Event from .identifier import Identifier, VersionedIdentifier from .listing import Listing from .util import now +from .version import Version + +Year = int +Month = int +YearMonth = Tuple[Year, Month] + +class EPrintDay(CanonicalBase): + """E-prints originally announced on a specific day.""" -class MonthlyBlock(NamedTuple): - """Contains the e-prints announced in a particular calendar month.""" + def __init__(self, date: datetime.date, + eprints: Mapping[Identifier, EPrint]) -> None: + """Initialize with e-prints for a particular day.""" + self.date = date + self.eprints = eprints - year: int - month: int - eprints: Mapping[VersionedIdentifier, EPrint] + +class EPrintMonth(CanonicalBase): + """E-prints originally announced in a particular calendar month.""" + + def __init__(self, name: YearMonth, + days: Mapping[datetime.date, EPrintDay]) -> None: + """Initialize with e-prints for a 
particular month.""" + self.name = name + self.days = days @property - def is_open(self) -> bool: - """Determine whether this block can accept new e-prints.""" - today = date.today() - return bool(today.year == self.year and today.month == self.month) + def year(self) -> Year: + return self.name[0] @property - def is_closed(self) -> bool: - """Inverse of :attr:`.is_open` (of course).""" - return not self.is_open - - def get_next_identifier(self) -> Identifier: - """Get the next available (unused) arXiv identifier in this block.""" - identifiers = sorted(self.eprints.keys(), - key=lambda ident: ident.incremental_part) - inc = identifiers[-1].incremental_part + 1 if identifiers else 1 - return Identifier.from_parts(self.year, self.month, inc) - - def add(self, eprint: EPrint) -> None: - if eprint.versioned_identifier in self.eprints: - raise ValueError(f'Already exists: {eprint.versioned_identifier}') - self._check_right_block(eprint) - self.eprints[eprint.versioned_identifier] = eprint - - def update(self, eprint: EPrint) -> None: - if eprint.versioned_identifier not in self.eprints: - raise ValueError(f'Not in block: {eprint.versioned_identifier}') - self._check_right_block(eprint) - self.eprints[eprint.versioned_identifier] = eprint - - def load_eprint(self, arxiv_id: Identifier, - version: Optional[int] = None) -> EPrint: - if not version: - version = self._get_latest_version(arxiv_id) - return self.eprints[VersionedIdentifier.from_parts(arxiv_id, version)] - - def _get_latest_version(self, arxiv_id: Identifier) -> int: - versions = [versioned_identifier for versioned_identifier - in self.eprints.keys() - if versioned_identifier.arxiv_id == arxiv_id] - if not versions: - raise KeyError(f'No such eprint: {arxiv_id}') - return sorted(versions, key=lambda k: k.version)[-1].version - - def _check_right_block(self, eprint: EPrint) -> None: - if eprint.arxiv_id.year != self.year \ - or eprint.arxiv_id.month != self.month: - raise ValueError(f'Wrong block: 
{eprint.versioned_identifier}') \ No newline at end of file + def month(self) -> Month: + return self.name[1] + + +class EPrintYear(CanonicalBase): + """E-prints originally announced in a particular calendar year.""" + + def __init__(self, year: Year, + months: Mapping[Tuple[int, int], EPrintMonth]) -> None: + """Initialize with e-prints for a particular year.""" + self.year = year + self.months = months + + +class AllEPrints(CanonicalBase): + """Represents the complete set of announced e-prints.""" + + def __init__(self, name: str, + years: Mapping[int, EPrintYear]) -> None: + """Initialize with all of the e-prints in the record.""" + self.name = name + self.years = years diff --git a/arxiv/canonical/domain/content.py b/arxiv/canonical/domain/content.py new file mode 100644 index 0000000..dec65ed --- /dev/null +++ b/arxiv/canonical/domain/content.py @@ -0,0 +1,240 @@ +"""Core concepts for characterizing bitstream/version content.""" + +from enum import Enum +from typing import List, Optional + +from .identifier import VersionedIdentifier + + +class SourceFileType(Enum): + """Source file types are represented by single-character codes.""" + + Ignore = 'I' + """All files auto ignore. No paper available.""" + + SourceEncrypted = 'S' + """Source is encrypted and should not be made available.""" + + PostscriptOnly = 'P' + """ + Multi-file PS submission. + + It is not necessary to indicate P with single file PS since in this case + the source file has .ps.gz extension. 
+ """ + + PDFLaTeX = 'D' + """A TeX submission that must be processed with PDFlatex.""" + + HTML = 'H' + """Multi-file HTML submission.""" + + Ancillary = 'A' + """Submission includes ancillary files in the /anc directory.""" + + DCPilot = 'B' + """Submission has associated data in the DC pilot system.""" + + DOCX = 'X' + """Submission in Microsoft DOCX (Office Open XML) format.""" + + ODF = 'O' + """Submission in Open Document Format.""" + + PDFOnly = 'F' + """PDF-only with .tar.gz package (likely because of anc files).""" + + +class SourceType(str): + """Characterizes a version source package.""" + + def __init__(self, value: str) -> None: + """Initialize with source file type codes.""" + self._types = [SourceFileType(v) for v in list(value.upper())] + + @property + def has_docx(self) -> bool: + """Indicate whether the source has DOCX content.""" + return bool(SourceFileType.DOCX in self._types) + + @property + def has_encrypted_source(self) -> bool: + """Indicate whether the source is encryped.""" + return bool(SourceFileType.SourceEncrypted in self._types) + + @property + def has_html(self) -> bool: + """Indicate whether the source has HTML content.""" + return bool(SourceFileType.HTML in self._types) + + @property + def has_ignore(self) -> bool: + """Indicate whether the source content should be ignored.""" + return bool(SourceFileType.Ignore in self._types) + + @property + def has_odf(self) -> bool: + """Indicate whether the source has ODF content.""" + return bool(SourceFileType.ODF in self._types) + + @property + def has_pdf_only(self) -> bool: + """Indicate whether the source contains only a PDF.""" + return bool(SourceFileType.PDFOnly in self._types) + + @property + def has_pdflatex(self) -> bool: + """Indicate whether the source has PDFLaTeX content.""" + return bool(SourceFileType.PDFLaTeX in self._types) + + @property + def has_ps_only(self) -> bool: + """Indicate whether the source has postcript content only.""" + return 
bool(SourceFileType.PostscriptOnly in self._types) + + @property + def available_formats(self) -> List['ContentType']: + """ + List the available dissemination formats for this source type. + + Depending on the original source type, we may not be able to provide + all supported formats. + + This does not include the source format. Note also that this does + **not** enforce rules about what should be displayed as an option + or provided to end users. + """ + formats = [] + if self.has_ignore and not self.has_encrypted_source: + pass + elif self.has_ps_only: + formats.extend([ContentType.pdf, ContentType.ps]) + elif self.has_pdflatex: + formats.append(ContentType.pdf) + elif self.has_pdf_only: + formats.append(ContentType.pdf) + elif self.has_html: + formats.append(ContentType.html) + elif self.has_docx or self.has_odf: + formats.append(ContentType.pdf) + else: + formats.extend([ + ContentType.pdf, + ContentType.ps, + ContentType.dvi, + ]) + return formats + + +class ContentType(Enum): + """Characterization of the content type of an individual bitstream.""" + + pdf = 'pdf' + tar = 'tar' + json = 'json' + abs = 'abs' + html = 'html' + dvi = 'dvi' + ps = 'ps' + tex = 'tex' + + @property + def mime_type(self) -> str: + """The MIME content type for this :class:`.ContentType`.""" + return _mime_types[self] + + @property + def ext(self) -> str: + """The preferred filename extension for this :class:`.ContentType`.""" + return _extensions[self] + + @classmethod + def from_filename(cls, filename: str) -> 'ContentType': + """Infer the :class:`.ContentType` of a file from its filename.""" + for ctype, ext in _extensions.items(): + if filename.endswith(ext) or filename.endswith(f'{ext}.gz'): + return ctype + raise ValueError(f'Unrecognized extension: {filename}') + + @classmethod + def from_mimetype(cls, mime: str) -> 'ContentType': + """Infer the :class:`.ContentType` of a file from its MIME type.""" + return {v: k for k, v in _mime_types.items()}[mime] + + def 
make_filename(self, identifier: VersionedIdentifier, + is_gzipped: bool = False) -> str: + """Make a filename for a bitstream with this :class:`.ContentType`.""" + if identifier.is_old_style: + fn = f'{identifier.numeric_part}v{identifier.version}.{self.ext}' + else: + fn = f'{identifier}.{self.ext}' + if is_gzipped: + fn = f'{fn}.gz' + return fn + + +_mime_types = { + ContentType.pdf: 'application/pdf', + ContentType.tar: 'application/x-tar', + ContentType.json: 'application/json', + ContentType.abs: 'text/plain', + ContentType.html: 'text/html', + ContentType.dvi: 'application/x-dvi', + ContentType.ps: 'application/postscript', + ContentType.tex: 'application/x-tex', +} + +_extensions = { + ContentType.pdf: 'pdf', + ContentType.tar: 'tar', + ContentType.json: 'json', + ContentType.abs: 'abs', + ContentType.html: 'html', + ContentType.dvi: 'dvi', + ContentType.ps: 'ps', + ContentType.tex: 'tex' +} + + +DISSEMINATION_FORMATS_BY_SOURCE_EXT = [ + ('.tar.gz', None), + ('.tar', None), + ('.dvi.gz', None), + ('.dvi', None), + ('.pdf', [ContentType.pdf]), + ('.ps.gz', [ContentType.pdf, ContentType.ps]), + ('.ps', [ContentType.pdf, ContentType.ps]), + ('.html.gz', [ContentType.html]), + ('.html', [ContentType.html]), + ('.gz', None), +] +""" +Dissemination formats that can be inferred from source file extension. + +.. note:: + This is largely to support format discovery in classic. In the NG + canonical record, this should all be explicit. +""" + + +def available_formats_by_ext(filename: str) -> Optional[List[ContentType]]: + """ + Attempt to determine the available dissemination formats by file extension. + + It sometimes (but not always) possible to infer the available dissemination + formats based on the filename extension of the source package. + + .. note:: + This is largely to support format discovery in classic. In the NG + canonical record, this should all be explicit. 
+ + """ + for ext, formats in DISSEMINATION_FORMATS_BY_SOURCE_EXT: + if filename.endswith(ext): + return formats + return None + + +def list_source_extensions() -> List[str]: + """List all of the known filename extensions for source files.""" + return [ext for ext, _ in DISSEMINATION_FORMATS_BY_SOURCE_EXT] \ No newline at end of file diff --git a/arxiv/canonical/domain/eprint.py b/arxiv/canonical/domain/eprint.py index aa58292..4796f10 100644 --- a/arxiv/canonical/domain/eprint.py +++ b/arxiv/canonical/domain/eprint.py @@ -1,122 +1,45 @@ -"""Provides the core domain concept and logic for e-prints.""" +"""Provides :class:`.EPrint`.""" -from typing import NamedTuple, Optional, List -from datetime import datetime, date +from datetime import date +from typing import Optional, NamedTuple, Mapping -from arxiv.taxonomy import Category -from .identifier import Identifier -from .event import Event -from .person import Person -from .file import File -from .license import License +from .base import CanonicalBase +from .identifier import Identifier, VersionedIdentifier +from .version import Version -class VersionReference(NamedTuple): - """Reference to an e-print version.""" +class EPrint(CanonicalBase): + """ + Core concept of an e-print in the canonical record. - arxiv_id: str - version: int - submitted_date: datetime - announced_date: str - source_type: str - size_kilobytes: int + An e-print is a collection of one or more sequential :class:`.Version`s, + generally representing projections of a single scholarly work over time. + E-prints are identified by :class:`.Identifier`s. + """ -class EPrint(NamedTuple): - """Canonical metadata record for an arXiv e-print.""" - - arxiv_id: Optional[Identifier] - version: Optional[int] - announced_date: Optional[date] - - legacy: bool - submitted_date: datetime - license: License - primary_classification: Category - title: str - abstract: str - authors: str - source_type: str # TODO: make this an enum. 
- """Internal code for the source type.""" - size_kilobytes: int - previous_versions: List[VersionReference] - secondary_classification: List[Category] - history: List[Event] - - submitter: Optional[Person] = None - proxy: Optional[str] = None - comments: Optional[str] = None - journal_ref: Optional[str] = None - report_num: Optional[str] = None - doi: Optional[str] = None - msc_class: Optional[str] = None - acm_class: Optional[str] = None - - is_withdrawn: bool = False - reason_for_withdrawal: Optional[str] = None - - source_package: Optional[File] = None - pdf: Optional[File] = None + def __init__(self, identifier: Optional[Identifier], + versions: Mapping[VersionedIdentifier, Version]) -> None: + """Initialize with an identifier and a set of versions.""" + self.identifier = identifier + self.versions = versions @property - def all_categories(self) -> List[str]: - return [self.primary_classification] + self.secondary_classification + def announced_date(self) -> Optional[date]: + """Date on which the first version of this e-print was announced.""" + idents = [v for v in self.versions] + return self.versions[idents[0]].announced_date @property - def is_announced(self): - """ - Determine whether or not this e-print has already been announced. - - An e-print is announced when it has been assigned an identifier, and - the announcement date is set. Replacements or cross-lists that are - not announced will have identifiers but not announcement dates. 
- """ - return self.arxiv_id is not None and self.announced_date is not None - - def as_announced(self, arxiv_id: Identifier, version: int, on: date) \ - -> 'EPrint': - _, _, _, *data = self - return EPrint(arxiv_id, version, on, *data) - - def as_withdrawn(self, version: int, on: date) -> 'EPrint': - return EPrint( - arxiv_id=self.arxiv_id, - version=version, - announced_date=on, - legacy=self.legacy, - submitted_date=self.submitted_date, - license=self.license, - primary_classification=self.primary_classification, - title=self.title, - abstract=self.abstract, - authors=self.authors, - source_type=self.source_type, - size_kilobytes=self.size_kilobytes, - previous_versions=self.previous_versions, - secondary_classification=self.secondary_classification, - history=self.history, - submitter=self.submitter, - proxy=self.proxy, - comments=self.comments, - journal_ref=self.journal_ref, - report_num=self.report_num, - doi=self.doi, - msc_class=self.msc_class, - acm_class=self.acm_class, - is_withdrawn=True, - reason_for_withdrawal=self.reason_for_withdrawal, - source_package=self.source_package, - pdf=self.pdf - ) - - def add_secondaries(self, *new_secondaries: Category) -> None: - for category in new_secondaries: - if category not in self.secondary_classification: - self.secondary_classification.append(category) + def is_withdrawn(self) -> bool: + """Indicate whether this e-print has been withdrawn.""" + idents = [v for v in self.versions] + return self.versions[idents[-1]].is_withdrawn + # TODO: this is a legacy hold-over; reconsider whether we need it for + # anything. 
@property - def versioned_identifier(self) -> str: - if not self.arxiv_id or not self.version: - raise ValueError('arXiv ID or version not set') - return f'{self.arxiv_id}v{self.version}' - \ No newline at end of file + def size_kilobytes(self) -> int: + """Indicate the size of the current version of this e-print in kb.""" + idents = [v for v in self.versions] + return self.versions[idents[-1]].size_kilobytes \ No newline at end of file diff --git a/arxiv/canonical/domain/event.py b/arxiv/canonical/domain/event.py deleted file mode 100644 index c349f85..0000000 --- a/arxiv/canonical/domain/event.py +++ /dev/null @@ -1,31 +0,0 @@ -"""Provides the core announcement event concept.""" - -from typing import NamedTuple, Type, List, Optional -from datetime import datetime -from enum import Enum - -from arxiv.taxonomy import Category -from .identifier import Identifier - - -class Event(NamedTuple): - """An announcement-related event.""" - - class Type(Enum): - """Supported event types.""" - - NEW = 'new' - UPDATED = 'updated' - REPLACED = 'replaced' - CROSSLIST = 'cross' - WITHDRAWN = 'withdrawn' - - arxiv_id: Identifier - event_date: datetime - event_type: Type - categories: List[Category] - - description: str = '' - legacy: bool = False - event_agent: Optional[str] = None - version: int = -1 diff --git a/arxiv/canonical/domain/file.py b/arxiv/canonical/domain/file.py index 4fe2c01..e71b3bb 100644 --- a/arxiv/canonical/domain/file.py +++ b/arxiv/canonical/domain/file.py @@ -1,35 +1,141 @@ -"""Provides file-related concepts and logic.""" +"""Provides bitstream-related concepts and logic.""" -from typing import NamedTuple +import os from datetime import datetime +from typing import Any, Dict, IO, Iterable, NamedTuple, Optional, Union +from urllib.parse import urlparse + from typing_extensions import Protocol +from .base import CanonicalBase +from .content import ContentType + + +class URI(str): + """ + A unique identifier for bitstream content. 
+ + Bitstream content may be located in a variety of places prior to + canonicalization. For example, it may be located on a local filesystem, + or at a remote location accessible via HTTP. + """ + + def __new__(cls, value: str) -> 'URI': + """Make a new URI.""" + if value.startswith('/'): + value = f'file:///{value.lstrip("/")}' + uri: URI = super(URI, cls).__new__(cls, value) # type: ignore + return uri -class Readable(Protocol): - def read(self, size: int = -1) -> bytes: - """ - Read raw bytes content from the resource. + def __init__(self, value: str) -> None: + """Initialize and parse an URI from a str value.""" + if value.startswith('/'): + value = f'file:///{value.lstrip("/")}' - This should behave more or less like :func:`io.BufferedIOBase.read`. + o = urlparse(value) + self.scheme = o.scheme + if not self.scheme: + raise ValueError(f'Not a valid URI: {value}') + self.netloc = o.netloc + self.path = o.path + self.params = o.params + self.query = o.query + self.fragment = o.fragment - Examples might include: + @property + def is_canonical(self) -> bool: + """Indicate whether the URI is a key in the canonical record.""" + return bool(self.scheme == 'arxiv') - - A native Python ``file`` object; - - A closure that, when called, creates a new ``file`` pointer and reads - it; - - A closure that, when called, makes an HTTP request and reads the - resource. + @property + def is_file(self) -> bool: + """Indicate whether the URI is a path to a local file.""" + return bool(self.scheme == 'file') - """ - ... 
+ @property + def is_http_url(self) -> bool: + """Indicate whether the URI is an HTTP URL.""" + return bool(self.scheme == 'http' or self.scheme == 'https') -class File(NamedTuple): +class Key(URI): + """The unique identifier for a bitstream in the canonical record.""" + + def __new__(cls, value: str) -> 'Key': + """Make a new key.""" + if not value.startswith('arxiv:///'): + value = f'arxiv:///{value.lstrip("/")}' + key: Key = super(Key, cls).__new__(cls, value) # type: ignore + return key + + def __init__(self, value: str) -> None: + """Initialize a key with a str value.""" + if not value.startswith('arxiv:///'): + value = f'arxiv:///{value.lstrip("/")}' + super(Key, self).__init__(value) + _, self.filename = os.path.split(self.path) + + +class CanonicalFile(CanonicalBase): """Represents a file in the canonical record, e.g. a source package.""" - filename: str - mime_type: str - checksum: str - content: Readable - created: datetime - modified: datetime \ No newline at end of file + modified: datetime + """Last time the file was modified.""" + + size_bytes: int + """Size of the file in bytes.""" + + content_type: ContentType + """The content type of the file.""" + + filename: Optional[str] + """Filename in the canonical record.""" + + ref: URI + """A reference to the location of the content of the file.""" + + is_gzipped: bool + """Whether or not the content at ``ref`` is served in gzipped form.""" + + exclude_from_comparison = {'ref', 'is_gzipped'} + + def __init__(self, modified: datetime, + size_bytes: int, + content_type: ContentType, + ref: URI, + filename: Optional[str] = None, + is_gzipped: bool = False) -> None: + self.modified = modified + self.size_bytes = size_bytes + self.content_type = content_type + self.filename = filename + self.ref = ref + self.is_gzipped = is_gzipped + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> 'CanonicalFile': + """Reconstitute a :class:`.CanonicalFile` from a native dict.""" + return cls( + 
modified=datetime.fromisoformat(data['modified']), # type: ignore ; pylint: disable=no-member + size_bytes=data['size_bytes'], + content_type=ContentType(data['content_type']), + filename=data['filename'], + ref=URI(data['ref']), + is_gzipped=data.get('is_gzipped', False) + ) + + @property + def mime_type(self) -> str: + """Convenience accessor for the MIME type of the file.""" + return self.content_type.mime_type + + def to_dict(self) -> Dict[str, Any]: + """Generate a native dict from this :class:`.CanonicalFile`.""" + return { + 'modified': self.modified.isoformat(), + 'size_bytes': self.size_bytes, + 'content_type': self.content_type.value, + 'filename': self.filename, + 'ref': self.ref, + 'is_gzipped': self.is_gzipped + } diff --git a/arxiv/canonical/domain/identifier.py b/arxiv/canonical/domain/identifier.py index ca99f57..2ccdb8b 100644 --- a/arxiv/canonical/domain/identifier.py +++ b/arxiv/canonical/domain/identifier.py @@ -1,23 +1,56 @@ """Provides the concept of an arXiv identifier.""" +from typing import Any + from arxiv import identifier +# These are somewhat of a mystery, for there is no reference to ``neuro-sys`` +# in the codebase. This mapping was generated by searching for "neuro-sys" +# in comments field, which yields results like: +# +# "Originally submitted to the neuro-sys archive which was never publicly +# announced (was 0005002)" +NEURO_SYS_IDENTIFIERS = { + "neuro-sys/0005002": "q-bio/0309034", + "neuro-sys/9905003": "q-bio/0309033", # Comment said 002, but must be 003. 
+ "neuro-sys/9905002": "q-bio/0309032", + "neuro-sys/9810001": "q-bio/0309031", + "neuro-sys/9809002": "q-bio/0309030", + "neuro-sys/9806001": "q-bio/0309029", + "neuro-sys/9804003": "q-bio/0309028", + "neuro-sys/9804002": "q-bio/0309027", + "neuro-sys/9804001": "q-bio/0309026", + "neuro-sys/9803001": "q-bio/0309025", + "neuro-sys/9802001": "q-bio/0309024", + "neuro-sys/9801001": "q-bio/0309023", + "neuro-sys/9905001": "cond-mat/9905438", + "neuro-sys/9809001": "cs/9809125", + "neuro-dev/9710001": "cond-mat/9710352", +} + + +class InvalidIdentifier(ValueError): + """A value was encountered that is not a valid arXiv identifier.""" + class Identifier(str): """ An arXiv e-print identifier. - - Supports both old-style (``archive.category/YYMMNNN``) and new-style + + Supports both old-style (``archive.category/YYMMNNN``) and new-style (``YYMM.NNNNN``) identifiers. """ def __init__(self, value: str) -> None: - if identifier.STANDARD.match(value.__str__()): + """Initialize from a raw str value.""" + if value in NEURO_SYS_IDENTIFIERS: + value = NEURO_SYS_IDENTIFIERS[value] + if identifier.STANDARD.match(value.__str__()): # pylint: disable=no-member self.is_old_style = False - elif identifier.OLD_STYLE.match(value.__str__()): + elif identifier.OLD_STYLE.match(value.__str__()): # pylint: disable=no-member self.is_old_style = True else: - raise ValueError('Not a valid arXiv ID') + raise InvalidIdentifier(f'Not a valid arXiv ID: {value}') @classmethod def from_parts(cls, year: int, month: int, inc: int) -> 'Identifier': @@ -25,15 +58,46 @@ def from_parts(cls, year: int, month: int, inc: int) -> 'Identifier': prefix = f'{str(year)[-2:]}{str(month).zfill(2)}' return cls(f'{prefix}.{str(inc).zfill(5)}') + @property + def category_part(self) -> str: + """For old-style identifiers, conveys the primary category.""" + if not self.is_old_style: + raise ValueError('New identifiers have no category semantics') + return self.split('/')[0] + @property def incremental_part(self) -> int: 
"""The part of the identifier that is incremental.""" if self.is_old_style: - return int(self.split('/', 1)[1][4:]) + return int(self.numeric_part[4:]) return int(self.split('.', 1)[1]) + @property + def numeric_part(self) -> str: + """ + The entire numeric component of the identifier. + + For new-style identifiers, this is the entire identifier. + """ + if self.is_old_style: + return self.split('/')[1] + return str(self) + + @property + def yymm(self) -> str: + """Numeric part conveying the original announcement year and month.""" + if self.is_old_style: + numeric_part = self.split('/', 1)[1] + yy = numeric_part[0:2] + mm = numeric_part[2:4] + else: + yy = self[:2] + mm = self[2:4] + return f'{yy}{mm}' + @property def year(self) -> int: + """Year in which the first version of the e-print was announced.""" if self.is_old_style: yy = int(self.split('/', 1)[1][0:2]) else: @@ -44,16 +108,63 @@ def year(self) -> int: @property def month(self) -> int: + """Month in which the first version of the e-print was announced.""" if self.is_old_style: return int(self.split('/', 1)[1][2:4]) return int(self[2:4]) + def __gt__(self, other: Any) -> bool: + if not isinstance(other, Identifier): + raise ValueError(f'Cannot compare Identifier to {type(other)}') + if self.year < other.year: + return False + elif self.year > other.year: + return True + if self.month < other.month: + return False + elif self.month > other.month: + return True + return bool(self.incremental_part > other.incremental_part) + + def __lt__(self, other: Any) -> bool: + if not isinstance(other, Identifier): + raise ValueError(f'Cannot compare {self} to {type(other)}') + if self.year < other.year: + return True + elif self.year > other.year: + return False + if self.month < other.month: + return True + elif self.month > other.month: + return False + return bool(self.incremental_part < other.incremental_part) + + def __le__(self, other: Any) -> bool: + if not isinstance(other, Identifier): + raise 
ValueError(f'Cannot compare {self} to {type(other)}') + return self < other or self == other + + def __ge__(self, other: Any) -> bool: + if not isinstance(other, Identifier): + raise ValueError(f'Cannot compare {self} to {type(other)}') + return self > other or self == other + class VersionedIdentifier(str): + """ + An arXiv identifier for a specific :class:`.Version`. + + This is an :class:`.Identifier` with a version (``v{N}``) affix. + """ + def __init__(self, value: str) -> None: - id_part, version_part = self.split('v', 1) - self.arxiv_id = Identifier(id_part) - self.version = int(version_part) + """Initialize with a raw str value.""" + try: + id_part, version_part = self.split('v', 1) + self.arxiv_id = Identifier(id_part) + self.version = int(version_part) + except ValueError as e: + raise ValueError(f'Not a valid version identifier: {value}') from e @classmethod def from_parts(cls, arxiv_id: Identifier, version: int) \ @@ -61,15 +172,69 @@ def from_parts(cls, arxiv_id: Identifier, version: int) \ """Generate a new-style versioned identifier from its parts.""" return cls(f'{arxiv_id}v{version}') + @property + def category_part(self) -> str: + """For old-style identifiers, conveys the primary category.""" + return self.arxiv_id.category_part + + @property + def numeric_part(self) -> str: + """ + The entire numeric component of the identifier. + + For new-style identifiers, this is the entire identifier. 
+ """ + return self.arxiv_id.numeric_part + @property def incremental_part(self) -> int: """The part of the identifier that is incremental.""" return self.arxiv_id.incremental_part + @property + def is_old_style(self) -> int: + """Indicate whether this is an old-style identifier.""" + return self.arxiv_id.is_old_style + @property def year(self) -> int: + """Year in which the first version of the e-print was announced.""" return self.arxiv_id.year + @property + def yymm(self) -> str: + """Numeric part conveying the original announcement year and month.""" + return self.arxiv_id.yymm + @property def month(self) -> int: + """Month in which the first version of the e-print was announced.""" return self.arxiv_id.month + + def __gt__(self, other: Any) -> bool: + if not isinstance(other, VersionedIdentifier): + raise ValueError(f'Cannot compare {self} to {type(other)}') + if self.arxiv_id > other.arxiv_id: + return True + elif self.arxiv_id < other.arxiv_id: + return False + return self.version > other.version + + def __lt__(self, other: Any) -> bool: + if not isinstance(other, VersionedIdentifier): + raise ValueError(f'Cannot compare {self} to {type(other)}') + if self.arxiv_id > other.arxiv_id: + return False + elif self.arxiv_id < other.arxiv_id: + return True + return self.version < other.version + + def __le__(self, other: Any) -> bool: + if not isinstance(other, VersionedIdentifier): + raise ValueError(f'Cannot compare {self} to {type(other)}') + return self < other or self == other + + def __ge__(self, other: Any) -> bool: + if not isinstance(other, VersionedIdentifier): + raise ValueError(f'Cannot compare {self} to {type(other)}') + return self > other or self == other diff --git a/arxiv/canonical/domain/license.py b/arxiv/canonical/domain/license.py index 4b78f93..0a62adb 100644 --- a/arxiv/canonical/domain/license.py +++ b/arxiv/canonical/domain/license.py @@ -1,9 +1,24 @@ """Provide license-related domain concepts and logic.""" -from typing import NamedTuple 
+from typing import Any, Dict, Iterable +from .base import CanonicalBase -class License(NamedTuple): + +class License(CanonicalBase): """License under which the e-print was provided to arXiv.""" - href: str \ No newline at end of file + href: str + """URI of the license resource.""" + + def __init__(self, href: str) -> None: + self.href = href + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> 'License': + """Reconstitute from a native dict.""" + return cls(href=data['href']) + + def to_dict(self) -> Dict[str, Any]: + """Generate a native dict representation.""" + return {'href': self.href} diff --git a/arxiv/canonical/domain/listing.py b/arxiv/canonical/domain/listing.py index 7bff81c..02d80ef 100644 --- a/arxiv/canonical/domain/listing.py +++ b/arxiv/canonical/domain/listing.py @@ -1,21 +1,170 @@ -"""Provides domain concepts and logic for the listing.""" +"""Provides domain concepts and logic for event listings.""" -from typing import NamedTuple, List, Optional -from datetime import date +import datetime +from collections import defaultdict +from typing import NamedTuple, MutableSequence, Mapping, Tuple, Optional, \ + Any, Dict, Iterable, Callable +from .base import CanonicalBase from .eprint import EPrint, Identifier -from .event import Event +from .version import Event, EventType +Year = int +Month = int +YearMonth = Tuple[Year, Month] -class Listing(NamedTuple): + +class ListingIdentifier(str): + """ + Unique identifier for a :class:`.Listing`. + + Comprised of an ISO-8601 date and a name string. 
+ """ + + def __init__(self, value: str) -> None: + """Initialize from a raw str value.""" + date_part, self.name = self.split('::', 1) + self.date = datetime.datetime.strptime(date_part, '%Y-%m-%d').date() + + @classmethod + def from_parts(cls, date: datetime.date, name: str) -> 'ListingIdentifier': + """Generate from date and name parts.""" + if ':' in name: + raise ValueError('Name may not contains colons `:`') + """Generate a listing identifier from its parts.""" + return cls(date.strftime(f'%Y-%m-%d::{name}')) + + +class Listing(CanonicalBase): """A collection of announcement-related events on a particular day.""" - date: date - """Date on which the events occurred.""" - events: List[Event] + identifier: ListingIdentifier + """Unique identifier for this listing, based on the date and name.""" + + events: MutableSequence[Event] """Events in this listing.""" - def add_event(self, eprint: EPrint, event: Event) -> None: - assert eprint.arxiv_id == event.arxiv_id - assert eprint.version == event.version - self.events.append(event) \ No newline at end of file + def __init__(self, identifier: ListingIdentifier, + events: MutableSequence[Event]) -> None: + """Initialize with a set of events.""" + self.identifier = identifier + self.events = events + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> 'Listing': + """Reconstitute from a native dict.""" + return cls(identifier=ListingIdentifier(data['identifier']), + events=[Event.from_dict(e) + for e in data['events']]) + + @property + def date(self) -> datetime.date: + """The date of this listing.""" + return self.identifier.date + + @property + def end_datetime(self) -> datetime.datetime: + """Timestamp of the most recent event in this listing.""" + if not self.events: + return datetime.datetime.now() + return self.events[-1].event_date + + @property + def number_of_events(self) -> int: + """Total number of events in this listing.""" + return len(self.events) + + @property + def 
number_of_events_by_type(self) -> Dict[EventType, int]: + """Number of events in this listing by event type.""" + counts: Dict[EventType, int] = defaultdict(int) + for event in self.events: + counts[event.event_type] += 1 + return dict(counts.items()) + + @property + def number_of_versions(self) -> int: + """Total number of :class:`.Version`s represented in this listing.""" + return 0 + + @property + def start_datetime(self) -> datetime.datetime: + """Timestamp of the earliest event in this listing.""" + if not self.events: + return datetime.datetime.now() + return self.events[0].event_date + + def to_dict(self) -> Dict[str, Any]: + """Generate a native dict representation.""" + return { + 'identifier': str(self.identifier), + 'events': [e.to_dict() for e in self.events] + } + + +class ListingDay(CanonicalBase): + """Represents all of the listings for a particular day.""" + + date: datetime.date + """Date on which the events occurred.""" + + listings: Mapping[str, Listing] + """All of the listings on this date.""" + + def __init__(self, date: datetime.date, + listings: Mapping[str, Listing]) -> None: + self.date = date + self.listings = listings + + +class ListingMonth(CanonicalBase): + """A collection of listings over a month.""" + + name: YearMonth + """The year and month of this collection.""" + + listings: Mapping[datetime.date, ListingDay] + """All of the listings in this month.""" + + def __init__(self, name: YearMonth, + listings: Mapping[datetime.date, ListingDay]) -> None: + self.name = name + self.listings = listings + + @property + def year(self) -> Year: + """Year represented by this block.""" + return self.name[0] + + @property + def month(self) -> Month: + """Month represented by this block.""" + return self.name[1] + + +class ListingYear(CanonicalBase): + """A collection of listings over a year.""" + + year: int + """The year of this collection.""" + + months: Mapping[Tuple[int, int], ListingMonth] + """All of the listings in this year.""" + + def 
__init__(self, year: int, + months: Mapping[Tuple[int, int], ListingMonth]) -> None: + self.year = year + self.months = months + + +class AllListings(CanonicalBase): + """All listings in the canonical record.""" + + name: Optional[str] + + years: Mapping[int, ListingYear] + + def __init__(self, name: Optional[str], + years: Mapping[int, ListingYear]) -> None: + self.name = name + self.years = years diff --git a/arxiv/canonical/domain/person.py b/arxiv/canonical/domain/person.py index 3ad4627..fcb5e02 100644 --- a/arxiv/canonical/domain/person.py +++ b/arxiv/canonical/domain/person.py @@ -1,10 +1,12 @@ """Provide person-related domain concepts and logic.""" -from typing import NamedTuple, Optional, List +from typing import Any, Dict, Iterable, List, Optional +from .base import CanonicalBase -class Person(NamedTuple): - """An arXiv user.""" + +class Person(CanonicalBase): + """Represents a human person in the canonical record.""" full_name: str last_name: Optional[str] = None @@ -12,4 +14,44 @@ class Person(NamedTuple): suffix: Optional[str] = None orcid: Optional[str] = None author_id: Optional[str] = None - affiliation: Optional[List[str]] = None \ No newline at end of file + affiliation: Optional[List[str]] = None + + def __init__(self, full_name: str, + last_name: Optional[str] = None, + first_name: Optional[str] = None, + suffix: Optional[str] = None, + orcid: Optional[str] = None, + author_id: Optional[str] = None, + affiliation: Optional[List[str]] = None) -> None: + self.full_name = full_name + self.last_name = last_name + self.first_name = first_name + self.suffix = suffix + self.orcid = orcid + self.author_id = author_id + self.affiliation = affiliation + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> 'Person': + """Reconstitute from a native dict.""" + return cls( + full_name=data['full_name'], + last_name=data.get('last_name'), + first_name=data.get('first_name'), + suffix=data.get('suffix'), + orcid=data.get('orcid'), +
author_id=data.get('author_id'), + affiliation=data.get('affiliation', []), + ) + + def to_dict(self) -> Dict[str, Any]: + """Generate a native dict representation.""" + return { + 'full_name': self.full_name, + 'last_name': self.last_name, + 'first_name': self.first_name, + 'suffix': self.suffix, + 'orcid': self.orcid, + 'author_id': self.author_id, + 'affiliation': self.affiliation + } \ No newline at end of file diff --git a/arxiv/canonical/domain/preservation.py b/arxiv/canonical/domain/preservation.py new file mode 100644 index 0000000..647c7fe --- /dev/null +++ b/arxiv/canonical/domain/preservation.py @@ -0,0 +1,3 @@ +"""Core concepts and structs for the daily preservation record.""" + +# TODO: implement me! \ No newline at end of file diff --git a/arxiv/canonical/domain/record.py b/arxiv/canonical/domain/record.py deleted file mode 100644 index 17e7d6f..0000000 --- a/arxiv/canonical/domain/record.py +++ /dev/null @@ -1,125 +0,0 @@ -from typing import NamedTuple, Tuple, Mapping, List, Optional -from datetime import date, datetime - -from .eprint import EPrint -from .event import Event -from .block import MonthlyBlock -from .identifier import Identifier -from .util import now - -Year = int -Month = int - - -class CanonicalRecord(NamedTuple): - """The arXiv canonical record.""" - - blocks: Mapping[Tuple[Year, Month], MonthlyBlock] - """ - Blocks are monthly storage units. - - E-prints are grouped into blocks based on the month in which the first - version of the e-print was announced. - - This mapping must always produce a result, creating the appropriate - :class:`.MonthlyBlock` if necessary. - """ - - listings: Mapping[date, 'Listing'] - """Listings are daily streams of e-print events.""" - - @property - def current_block(self) -> MonthlyBlock: - """Get the current monthly block of announcements.""" - today = date.today() - return self.blocks[(today.year, today.month)] - - def announce_new(self, eprint: EPrint) -> EPrint: - """ - Announce a new e-print. 
- - This involves setting its identifier to the next available identifier, - version, and announcement date, and updating the appropriate - :class:`.MonthlyBlock` and :class:`.Listing`. - """ - if eprint.is_announced: - raise ValueError(f'E-print already announced: {eprint.arxiv_id}') - today = date.today() - version = 1 - eprint = eprint.as_announced(self.current_block.get_next_identifier(), - version, today) - self.current_block.add(eprint) - self._emit(eprint, Event.Type.NEW) - return eprint - - def announce_replacement(self, eprint: EPrint) -> EPrint: - """ - Announce a replacement. - - This involves incrementing the version, setting the announcement date, - and updating the appropriate :class:`.MonthlyBlock` and - :class:`.Listing`. - """ - if not self.current_block.can_announce(eprint): - raise ValueError('Cannot announce this e-print') - eprint = eprint.as_announced(eprint.arxiv_id, eprint.version + 1, - date.today()) - self._get_block_for_id(eprint.arxiv_id).add(eprint) - self._emit(eprint, Event.Type.REPLACED) - return eprint - - def announce_withdrawal(self, eprint: EPrint) -> EPrint: - """ - Announce a withdrawal. - - This involves incrementing the version, setting the withdrawal state - and announcement date, and updating the appropriate - :class:`.MonthlyBlock` and :class:`.Listing`. - """ - if not self.current_block.can_announce(eprint): - raise ValueError('Cannot announce this e-print') - eprint = eprint.as_withdrawn(eprint.version + 1, date.today()) - self._get_block_for_id(eprint.arxiv_id).add(eprint) - self._emit(eprint, Event.Type.WITHDRAWAL) - return eprint - - def announce_crosslist(self, eprint: EPrint) -> EPrint: - """ - Announce a cross-list. - - This involves updating the e-print and issuing a new event on the - appropriate :class:`.Listing`. - """ - self._get_block_for_id(eprint.arxiv_id).update(eprint) - self._emit(eprint, Event.Type.CROSSLIST) - return eprint - - def update(self, eprint: EPrint) -> EPrint: - """ - Update an e-print. 
- - This does not generate announcements, nor result in an incremented - version. Intended for correcting minor errors. - """ - self._get_block_for_id(eprint.arxiv_id).update(eprint) - self._emit(eprint, Event.Type.UPDATED) - return eprint - - def load_eprint(self, arxiv_id: Identifier, - version: Optional[int] = None) -> EPrint: - block = self._get_block_for_id(arxiv_id) - return block.load_eprint(arxiv_id, version) - - def _get_block_for_id(self, arxiv_id: Identifier) -> MonthlyBlock: - return self.blocks[(arxiv_id.year, arxiv_id.month)] - - def _make_event(self, eprint: EPrint, event_type: Event.Type, - timestamp: Optional[datetime] = None) -> Event: - if timestamp is None: - timestamp = now() - return Event(eprint.arxiv_id, timestamp, event_type, - eprint.all_categories, version=eprint.version) - - def _emit(self, eprint: EPrint, event_type: Event.Type) -> None: - listing = self.listings[date.today()] - listing.add_event(eprint, self._make_event(eprint, event_type)) \ No newline at end of file diff --git a/arxiv/canonical/domain/tests/test_block.py b/arxiv/canonical/domain/tests/test_block.py deleted file mode 100644 index 51c1984..0000000 --- a/arxiv/canonical/domain/tests/test_block.py +++ /dev/null @@ -1,160 +0,0 @@ -"""Tests for :class:`.MonthlyBlock`.""" - -from unittest import TestCase, mock -from datetime import date - -from arxiv.taxonomy import Category -from ..eprint import EPrint -from ..block import MonthlyBlock -from ..identifier import VersionedIdentifier, Identifier - - -class TestIsOpen(TestCase): - """Property :attr:`.is_open` indicates whether eprints can be added.""" - - def test_last_month(self): - """There is a block from a previous month.""" - block = MonthlyBlock(date.today().year, date.today().month - 1, {}) - self.assertFalse(block.is_open, - 'Only a block for the current month+year can be open') - self.assertTrue(block.is_closed, - 'Only a block for the current month+year can be open') - - def test_last_year(self): - """There is a 
block from last year.""" - block = MonthlyBlock(date.today().year - 1, date.today().month, {}) - self.assertFalse(block.is_open, - 'Only a block for the current month+year can be open') - self.assertTrue(block.is_closed, - 'Only a block for the current month+year can be open') - - def test_this_month(self): - """There is a block from this month.""" - block = MonthlyBlock(date.today().year, date.today().month, {}) - self.assertTrue(block.is_open, - 'Only a block for the current month+year can be open') - self.assertFalse(block.is_closed, - 'Only a block for the current month+year can be open') - - -class TestGetNextIdentifier(TestCase): - """The method :func:`.MonthlyBlock.get_next_identifer` gets identifiers.""" - - def test_no_eprints(self): - """There are no e-prints in the block.""" - year, month = 2043, 4 - block = MonthlyBlock(year, month, {}) - self.assertEqual(block.get_next_identifier(), '4304.00001', - 'Returns the first identifier in the series') - - def test_some_eprints(self): - """There are some e-prints in the block.""" - year, month = 2043, 4 - eprints = { - VersionedIdentifier('4304.00001v4'): mock.MagicMock(spec=EPrint), - VersionedIdentifier('4304.00005v1'): mock.MagicMock(spec=EPrint), - VersionedIdentifier('4304.00001v2'): mock.MagicMock(spec=EPrint), - VersionedIdentifier('4304.00001v3'): mock.MagicMock(spec=EPrint), - VersionedIdentifier('4304.00004v1'): mock.MagicMock(spec=EPrint), - VersionedIdentifier('4304.00003v1'): mock.MagicMock(spec=EPrint), - VersionedIdentifier('4304.00001v1'): mock.MagicMock(spec=EPrint), - VersionedIdentifier('4304.00002v1'): mock.MagicMock(spec=EPrint), - } - block = MonthlyBlock(year, month, eprints) - self.assertEqual(block.get_next_identifier(), '4304.00006', - 'Returns the first identifier in the series') - - - -class TestAddEPrints(TestCase): - """Add new eprints to the block.""" - - def test_add_eprint_to_empty_block(self): - """Add an eprint to an empty block.""" - year, month = date.today().year, 
date.today().month - prefix = f'{str(year)[-2:]}{str(month).zfill(2)}' - versioned_identifier = VersionedIdentifier(f'{prefix}.00001v4') - eprint = mock.MagicMock( - spec=EPrint, - arxiv_id=Identifier(f'{prefix}.00001'), - version=4, - versioned_identifier=versioned_identifier - ) - block = MonthlyBlock(year, month, {}) - block.add(eprint) - self.assertIn(versioned_identifier, block.eprints, - 'EPrint is added to the block') - self.assertEqual(block.get_next_identifier(), f'{prefix}.00002', - 'Next identifier comes after the added eprint') - - def test_add_duplicate_eprint(self): - """Add the same eprint to the block twice.""" - year, month = date.today().year, date.today().month - prefix = f'{str(year)[-2:]}{str(month).zfill(2)}' - versioned_identifier = VersionedIdentifier(f'{prefix}.00001v4') - eprint = mock.MagicMock( - spec=EPrint, - arxiv_id=Identifier(f'{prefix}.00001'), - version=4, - versioned_identifier=versioned_identifier, - ) - block = MonthlyBlock(year, month, {}) - block.add(eprint) - with self.assertRaises(ValueError): - block.add(eprint) - - def test_add_eprint_to_wrong_block(self): - """Add an eprint to an empty block.""" - year, month = date.today().year, date.today().month - prefix = f'{str(year - 1)[-2:]}{str(month).zfill(2)}' - versioned_identifier = VersionedIdentifier(f'{prefix}.00001v4') - eprint = mock.MagicMock( - spec=EPrint, - arxiv_id=Identifier(f'{prefix}.00001'), - version=4, - versioned_identifier=versioned_identifier - ) - block = MonthlyBlock(year, month, {}) - with self.assertRaises(ValueError): - block.add(eprint) - - -class TestUpdateEPrints(TestCase): - """Update existing eprints in the block.""" - - def test_update_eprint_not_in_block(self): - """Update an eprint that is not in the block.""" - year, month = date.today().year, date.today().month - prefix = f'{str(year)[-2:]}{str(month).zfill(2)}' - versioned_identifier = VersionedIdentifier(f'{prefix}.00001v4') - eprint = mock.MagicMock( - spec=EPrint, - 
arxiv_id=Identifier(f'{prefix}.00001'), - version=4, - versioned_identifier=versioned_identifier - ) - block = MonthlyBlock(year, month, {}) - with self.assertRaises(ValueError): - block.update(eprint) - - def test_update_eprint(self): - """Add the same eprint to the block twice.""" - year, month = date.today().year, date.today().month - prefix = f'{str(year)[-2:]}{str(month).zfill(2)}' - versioned_identifier = VersionedIdentifier(f'{prefix}.00001v4') - eprint = mock.MagicMock( - spec=EPrint, - arxiv_id=Identifier(f'{prefix}.00001'), - version=4, - versioned_identifier=versioned_identifier, - secondary_classification=[] - ) - block = MonthlyBlock(year, month, {}) - block.add(eprint) - eprint.secondary_classification.append(Category('foo.BR')) - block.update(eprint) - self.assertIn( - Category('foo.BR'), - block.eprints[eprint.versioned_identifier].secondary_classification - ) - \ No newline at end of file diff --git a/arxiv/canonical/domain/tests/test_content.py b/arxiv/canonical/domain/tests/test_content.py new file mode 100644 index 0000000..c8292ed --- /dev/null +++ b/arxiv/canonical/domain/tests/test_content.py @@ -0,0 +1,52 @@ +from unittest import TestCase + +from ..content import ContentType, SourceFileType, SourceType, \ + available_formats_by_ext + + +class TestAvailableFormatsFromFilename(TestCase): + + def test_formats_from_source_file_name(self): + """Test formats returned from file name.""" + self.assertListEqual(available_formats_by_ext('foo.pdf'), + [ContentType.pdf]) + self.assertListEqual(available_formats_by_ext('/bar.ps.gz'), + [ContentType.pdf, ContentType.ps]) + self.assertListEqual(available_formats_by_ext('abc.html.gz'), + [ContentType.html]) + + # This differs from the implementation in arxiv-browse. It's not clear + # why being gzipped or not should alter the way we handle an HTML + # source file. 
+ self.assertListEqual(available_formats_by_ext('baz.html'), + [ContentType.html]) + + self.assertIsNone(available_formats_by_ext('')) + + +class TestSourceType(TestCase): + """Tests for :class:`.SourceType`.""" + + def test_available_formats(self): + """Tests available formats based on source type.""" + self.assertListEqual(SourceType('I').available_formats, []) + self.assertIn(ContentType.pdf, SourceType('IS').available_formats) + self.assertIn(ContentType.ps, SourceType('IS').available_formats) + + self.assertIn(ContentType.pdf, SourceType('').available_formats) + self.assertIn(ContentType.ps, SourceType('').available_formats) + + self.assertListEqual(SourceType('P').available_formats, + [ContentType.pdf, ContentType.ps]) + self.assertListEqual(SourceType('D').available_formats, + [ContentType.pdf]) + self.assertListEqual(SourceType('F').available_formats, + [ContentType.pdf]) + self.assertListEqual(SourceType('H').available_formats, + [ContentType.html]) + self.assertListEqual(SourceType('X').available_formats, + [ContentType.pdf]) + + self.assertIn(ContentType.pdf, SourceType('').available_formats) + self.assertIn(ContentType.ps, SourceType('').available_formats) + self.assertIn(ContentType.dvi, SourceType('').available_formats) \ No newline at end of file diff --git a/arxiv/canonical/domain/tests/test_eprint.py b/arxiv/canonical/domain/tests/test_eprint.py deleted file mode 100644 index 788125c..0000000 --- a/arxiv/canonical/domain/tests/test_eprint.py +++ /dev/null @@ -1,121 +0,0 @@ -"""Tests for :mod:`.eprint.""" - -from unittest import TestCase -from datetime import date - -from arxiv.taxonomy import Category - -from ..eprint import EPrint - - -class TestCategories(TestCase): - """Test handling of categories on e-prints.""" - - def setUp(self): - """Instantiate an e-print.""" - self.eprint = EPrint( - arxiv_id='2004.00111', - version=1, - announced_date=date.today(), - legacy=False, - submitted_date=date.today(), - license='http://notalicense', - 
primary_classification=Category('cs.AR'), - title='The Title of Everything', - abstract='Very abstract. Too short to be a real abstract.', - authors='Ima N. Author (FSU)', - source_type='tex', - size_kilobytes=1, - previous_versions=[], - secondary_classification=[Category('cs.AI'), Category('cs.DL')], - history=[] - ) - - def test_all_categories(self): - """Get all categories on the e-print.""" - self.assertIn(self.eprint.primary_classification, - self.eprint.all_categories, - 'The primary category is included') - for category in self.eprint.secondary_classification: - self.assertIn(category, self.eprint.all_categories, - 'All secondary categories are included') - self.assertEqual(self.eprint.primary_classification, - self.eprint.all_categories[0], - 'The primary category comes first') - - def test_add_secondaries(self): - """Add secondary categories to an e-print.""" - self.eprint.add_secondaries(Category('foo.CT'), Category('ww.JD')) - self.assertIn('foo.CT', self.eprint.secondary_classification) - self.assertIn('ww.JD', self.eprint.secondary_classification) - - -class TestVersionedIdentifier(TestCase): - """Test :attr:`EPrint.versioned_identifier` property.""" - - def test_with_id_and_version_set(self): - """Get the versioned identifier for an announced eprint.""" - eprint = EPrint( - arxiv_id='2004.00111', - version=5, - announced_date=date.today(), - legacy=False, - submitted_date=date.today(), - license='http://notalicense', - primary_classification='cs.AR', - title='The Title of Everything', - abstract='Very abstract. Too short to be a real abstract.', - authors='Ima N. 
Author (FSU)', - source_type='tex', - size_kilobytes=1, - previous_versions=[], - secondary_classification=['cs.AI', 'cs.DL'], - history=[] - ) - self.assertEqual(eprint.versioned_identifier, '2004.00111v5', - 'The versioned identifier is a concatenation of the' - 'arXiv ID and the ersion number.') - - def test_without_version(self): - """Get the versioned identifier when version is missing""" - eprint = EPrint( - arxiv_id='2004.00111', - version=None, - announced_date=date.today(), - legacy=False, - submitted_date=date.today(), - license='http://notalicense', - primary_classification='cs.AR', - title='The Title of Everything', - abstract='Very abstract. Too short to be a real abstract.', - authors='Ima N. Author (FSU)', - source_type='tex', - size_kilobytes=1, - previous_versions=[], - secondary_classification=['cs.AI', 'cs.DL'], - history=[] - ) - with self.assertRaises(ValueError): - eprint.versioned_identifier - - def test_without_arxiv_id(self): - """Get the versioned identifier when arxiv_id is missing""" - eprint = EPrint( - arxiv_id=None, - version=1, - announced_date=date.today(), - legacy=False, - submitted_date=date.today(), - license='http://notalicense', - primary_classification='cs.AR', - title='The Title of Everything', - abstract='Very abstract. Too short to be a real abstract.', - authors='Ima N. 
Author (FSU)', - source_type='tex', - size_kilobytes=1, - previous_versions=[], - secondary_classification=['cs.AI', 'cs.DL'], - history=[] - ) - with self.assertRaises(ValueError): - eprint.versioned_identifier \ No newline at end of file diff --git a/arxiv/canonical/domain/tests/test_file.py b/arxiv/canonical/domain/tests/test_file.py new file mode 100644 index 0000000..c6ad39c --- /dev/null +++ b/arxiv/canonical/domain/tests/test_file.py @@ -0,0 +1,109 @@ +"""Tests for :mod:`arxiv.canonical.domain`.""" + +from datetime import datetime +from unittest import TestCase + +from ..file import URI, Key, CanonicalFile, ContentType + + +class TestURIForFile(TestCase): + """URI can refer to a local file.""" + + def test_file_uri(self): + """URI is initialized with an absolute path.""" + path = '/path/to/some/data' + uri = URI(path) + self.assertTrue(uri.is_file, 'Recognized as a file reference') + self.assertFalse(uri.is_http_url, 'Not an HTTP URI') + self.assertFalse(uri.is_canonical, 'Not a canonical URI') + self.assertEqual(uri.scheme, 'file') + self.assertEqual(uri.path, path, 'Original path is preserved') + + def test_file_uri_with_relative_path(self): + """URI is initialized with a relative path.""" + path = 'path/to/some/data' + with self.assertRaises(ValueError): + URI(path) + + +class TestCanonicalURI(TestCase): + """URI can refer to a canonical resource.""" + + def test_canonical_uri(self): + """URI is initialized with an arXiv canonical URI.""" + raw = 'arxiv:///path/to/a/resource' + uri = URI(raw) + self.assertFalse(uri.is_file, 'Not a local file reference') + self.assertFalse(uri.is_http_url, 'Not an HTTP URI') + self.assertTrue(uri.is_canonical, 'Recognized as a canonical URI') + self.assertEqual(uri.scheme, 'arxiv') + self.assertEqual(uri.path, '/path/to/a/resource') + + +class TestHTTPURI(TestCase): + """URI can refer to an HTTP URI.""" + + def test_valid_http_uri(self): + """URI is initialized with a valid HTTP URI.""" + raw = 'http://asdf.com' + uri = 
URI(raw) + self.assertFalse(uri.is_file, 'Not a local file reference') + self.assertTrue(uri.is_http_url, 'Recognized as an HTTP URI') + self.assertFalse(uri.is_canonical, 'Not a canonical URI') + self.assertEqual(uri.scheme, 'http') + + def test_valid_https_uri(self): + """URI is initialized with a valid HTTPS URI.""" + raw = 'https://asdf.com' + uri = URI(raw) + self.assertFalse(uri.is_file, 'Not a local file reference') + self.assertTrue(uri.is_http_url, 'Recognized as an HTTP URI') + self.assertFalse(uri.is_canonical, 'Not a canonical URI') + self.assertEqual(uri.scheme, 'https') + + def test_valid_ftp_uri(self): + """URI is initialized with a valid FTP URI.""" + raw = 'ftp://asdf.com/foo' + uri = URI(raw) + self.assertFalse(uri.is_file, 'Not a local file reference') + self.assertFalse(uri.is_http_url, 'Not an HTTP URI') + self.assertFalse(uri.is_canonical, 'Not a canonical URI') + self.assertEqual(uri.scheme, 'ftp') + + +class TestKey(TestCase): + """Key is a canonical URI.""" + + def test_with_absolute_path(self): + """Key is initialized with an absolute path.""" + raw = '/path/to/a/resource' + key = Key(raw) + self.assertTrue(key.is_canonical, 'Key is a canonical URI') + self.assertIsInstance(key, URI, 'Indeed, it is an URI') + self.assertEqual(key.scheme, 'arxiv') + self.assertEqual(str(key), f'arxiv://{raw}') + + +class TestCanonicalFile(TestCase): + def setUp(self): + """Given a canonical file.""" + self.canonical_file = CanonicalFile( + modified=datetime.now(), + size_bytes=5_324, + content_type=ContentType.json, + filename='foo.json', + ref=URI('arxiv:///key/for/foo.json') + ) + + def test_dict_transformation(self): + """Transformation of CanonicalFile to/from dict preserves state.""" + self.assertEqual( + self.canonical_file, + CanonicalFile.from_dict(self.canonical_file.to_dict()) + ) + + def test_mime_type(self): + """MIME type is accessible on the file itself.""" + self.assertEqual(self.canonical_file.mime_type, + ContentType.json.mime_type) + diff 
--git a/arxiv/canonical/domain/tests/test_identifier.py b/arxiv/canonical/domain/tests/test_identifier.py new file mode 100644 index 0000000..22ee693 --- /dev/null +++ b/arxiv/canonical/domain/tests/test_identifier.py @@ -0,0 +1,19 @@ +"""Tests for :class:`.Identifier`.""" + +from unittest import TestCase + +from .. import Identifier, VersionedIdentifier + + +class TestIdentifierComparisons(TestCase): + """Test comparisons between identifiers.""" + + def test_compare_oldstyle_identifiers(self): + self.assertLess(Identifier('hep-ex/9802024'), + Identifier('cond-mat/9805021')) + self.assertLessEqual(Identifier('hep-ex/9802024'), + Identifier('cond-mat/9805021')) + self.assertGreater(Identifier('cond-mat/9805021'), + Identifier('hep-ex/9802024')) + self.assertGreaterEqual(Identifier('cond-mat/9805021'), + Identifier('hep-ex/9802024')) \ No newline at end of file diff --git a/arxiv/canonical/domain/tests/test_record.py b/arxiv/canonical/domain/tests/test_record.py deleted file mode 100644 index 05c2d52..0000000 --- a/arxiv/canonical/domain/tests/test_record.py +++ /dev/null @@ -1,113 +0,0 @@ -"""Tests for :mod:`.domain.record`.""" - -from unittest import TestCase, mock -from datetime import date - -from arxiv.taxonomy import Category -from ..eprint import EPrint -from ..block import MonthlyBlock -from ..identifier import VersionedIdentifier, Identifier -from ..record import CanonicalRecord -from ..listing import Listing -from ..event import Event - - -class TestCurrentBlock(TestCase): - """Property :attr:`.current_block` is the current :class:`.MonthlyBlock.""" - - def test_current_block(self): - """There are several blocks on the record.""" - year, month = date.today().year, date.today().month - blocks = { - (year - 1, month): MonthlyBlock(year - 1, month, {}), - (year, month): MonthlyBlock(year, month, {}), - (year, month + 1): MonthlyBlock(year, month + 1, {}), - } - record = CanonicalRecord(blocks, {}) - self.assertIsInstance(record.current_block, MonthlyBlock) - 
self.assertEqual(record.current_block.year, year) - self.assertEqual(record.current_block.month, month) - self.assertTrue(record.current_block.is_open) - - -class TestAnnounceNew(TestCase): - """We have new e-prints that require announcement.""" - - def test_unannounced_eprint(self): - """We have an e-print that is not announced.""" - eprint = EPrint( - arxiv_id=None, - version=None, - announced_date=date.today(), - legacy=False, - submitted_date=date.today(), - license='http://notalicense', - primary_classification='cs.AR', - title='The Title of Everything', - abstract='Very abstract. Too short to be a real abstract.', - authors='Ima N. Author (FSU)', - source_type='tex', - size_kilobytes=1, - previous_versions=[], - secondary_classification=['cs.AI', 'cs.DL'], - history=[] - ) - self.assertFalse(eprint.is_announced, 'E-Print is not announced') - - year, month = date.today().year, date.today().month - blocks = { - (year - 1, month): MonthlyBlock(year - 1, month, {}), - (year, month): MonthlyBlock(year, month, {}), - (year, month + 1): MonthlyBlock(year, month + 1, {}), - } - today_listing = Listing(date.today(), []) - listings = {(date.today()): today_listing} - record = CanonicalRecord(blocks, listings) - eprint = record.announce_new(eprint) - - self.assertTrue(eprint.is_announced, 'E-Print is announced') - self.assertIn(eprint.versioned_identifier, - record.current_block.eprints, - 'E-Print is in the current block') - self.assertEqual(len(today_listing.events), 1, - 'An event is added to the listing') - self.assertIsInstance(today_listing.events[0], Event) - self.assertEqual(today_listing.events[0].arxiv_id, eprint.arxiv_id, - 'Event has the correct arXiv ID') - self.assertEqual(today_listing.events[0].version, eprint.version, - 'Event has the correct version') - - def test_previously_announced_eprint(self): - """We have an e-print that is already announced.""" - eprint = EPrint( - arxiv_id='1702.00123', - version=2, - announced_date=date(year=2017, month=2, 
day=5), - legacy=False, - submitted_date=date.today(), - license='http://notalicense', - primary_classification='cs.AR', - title='The Title of Everything', - abstract='Very abstract. Too short to be a real abstract.', - authors='Ima N. Author (FSU)', - source_type='tex', - size_kilobytes=1, - previous_versions=[], - secondary_classification=['cs.AI', 'cs.DL'], - history=[] - ) - self.assertTrue(eprint.is_announced, 'E-Print is not announced') - - year, month = date.today().year, date.today().month - blocks = { - (year - 1, month): MonthlyBlock(year - 1, month, {}), - (year, month): MonthlyBlock(year, month, {}), - (year, month + 1): MonthlyBlock(year, month + 1, {}), - } - listings = {(date.today()): Listing(date.today(), [])} - record = CanonicalRecord(blocks, listings) - - with self.assertRaises(ValueError): - record.announce_new(eprint) - - \ No newline at end of file diff --git a/arxiv/canonical/domain/version.py b/arxiv/canonical/domain/version.py new file mode 100644 index 0000000..8f8c191 --- /dev/null +++ b/arxiv/canonical/domain/version.py @@ -0,0 +1,588 @@ +"""Provides the core domain concept and logic for individual versions.""" + +import io +from base64 import urlsafe_b64decode, urlsafe_b64encode +from datetime import datetime, date +from enum import Enum +from typing import Any, Dict, Iterable, List, Mapping, MutableSequence, \ + NamedTuple, Optional, Tuple, Union +from uuid import UUID + +from backports.datetime_fromisoformat import MonkeyPatch +from typing_extensions import Literal + +from arxiv.taxonomy import Category # pylint: disable=no-name-in-module + +from .base import CanonicalBase +from .content import ContentType, SourceType +from .identifier import Identifier, VersionedIdentifier +from .person import Person +from .file import CanonicalFile +from .license import License + +MonkeyPatch.patch_fromisoformat() + + +class Metadata(CanonicalBase): + """Submitter-provided descriptive metadata for a version.""" + + primary_classification: Category 
+ secondary_classification: List[Category] + title: str + abstract: str + authors: str + license: License + comments: Optional[str] + journal_ref: Optional[str] + report_num: Optional[str] + doi: Optional[str] + msc_class: Optional[str] + acm_class: Optional[str] + + def __init__(self, primary_classification: Category, + secondary_classification: List[Category], + title: str, + abstract: str, + authors: str, + license: License, + comments: Optional[str] = None, + journal_ref: Optional[str] = None, + report_num: Optional[str] = None, + doi: Optional[str] = None, + msc_class: Optional[str] = None, + acm_class: Optional[str] = None) -> None: + self.primary_classification = primary_classification + self.secondary_classification = secondary_classification + self.title = title + self.abstract = abstract + self.authors = authors + self.license = license + self.comments = comments + self.journal_ref = journal_ref + self.report_num = report_num + self.doi = doi + self.msc_class = msc_class + self.acm_class = acm_class + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> 'Metadata': + """Reconstitute from a native dict.""" + return cls( + primary_classification=Category(data['primary_classification']), + secondary_classification=[ + Category(cat) for cat in data['secondary_classification'] + ], + title=data['title'], + abstract=data['abstract'], + authors=data['authors'], + license=License.from_dict(data['license']), + comments=data.get('comments'), + journal_ref=data.get('journal_ref'), + report_num=data.get('report_num'), + doi=data.get('doi'), + msc_class=data.get('msc_class'), + acm_class=data.get('acm_class'), + ) + + @property + def all_categories(self) -> List[Category]: + """All classification categories for this version, primary first.""" + return [self.primary_classification] + self.secondary_classification + + def add_secondaries(self, *new_secondaries: Category) -> None: + """Add cross-list categories for this version.""" + for category in new_secondaries: + if category
not in self.secondary_classification: + self.secondary_classification.append(category) + + def to_dict(self) -> Dict[str, Any]: + """Generate a native dict representation.""" + return { + 'primary_classification': str(self.primary_classification), + 'secondary_classification': [ + str(cat) for cat in self.secondary_classification + ], + 'title': self.title, + 'abstract': self.abstract, + 'authors': self.authors, + 'license': self.license.to_dict(), + 'comments': self.comments, + 'journal_ref': self.journal_ref, + 'report_num': self.report_num, + 'doi': self.doi, + 'msc_class': self.msc_class, + 'acm_class': self.acm_class + } + + +class VersionReference(CanonicalBase): + """An abridged reference to a particular :class:`Version`.""" + + identifier: VersionedIdentifier + """Identifier of the version.""" + + announced_date: date + """Date on which the version was announced.""" + + submitted_date: date + """Date on which the version was submitted.""" + + def __init__(self, identifier: VersionedIdentifier, + announced_date: date, + submitted_date: date) -> None: + self.identifier = identifier + self.announced_date = announced_date + self.submitted_date = submitted_date + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> 'VersionReference': + """Reconstitute from a native dict.""" + return cls( + identifier=VersionedIdentifier(data['identifier']), + announced_date=datetime.fromisoformat(data['announced_date']).date(), # type: ignore; pylint: disable=no-member + submitted_date=datetime.fromisoformat(data['submitted_date']).date(), # type: ignore; pylint: disable=no-member + ) + + def to_dict(self) -> Dict[str, Any]: + """Generate a native dict representation.""" + return { + 'identifier': str(self.identifier), + 'announced_date': self.announced_date.isoformat(), + 'submitted_date': self.submitted_date.isoformat(), + } + + +class Version(CanonicalBase): + """Represents a single version of an arXiv e-print in the record.""" + + identifier: VersionedIdentifier + 
"""Unique arXiv identifier for the version.""" + + announced_date: date + """Day on which this version was announced.""" + + announced_date_first: date + """Day on which the first version of the e-print was announced.""" + + submitted_date: datetime + """Timestamp when this version was submitted to arXiv.""" + + updated_date: datetime + """The last time the record for this version was changed.""" + + metadata: Metadata + """Submitter-provided descriptive metadata for the version.""" + + events: List['EventSummary'] + """Events that are specific to this version of the e-print.""" + + previous_versions: List[VersionReference] + """References to previous versions of the e-print.""" + + submitter: Optional[Person] + """Person responsible for submitting this version.""" + + proxy: Optional[str] + """The proxy that deposited the version on behalf of the submitter.""" + + is_announced: bool + """Indicate whether or not the version is announced.""" + + is_withdrawn: bool + """Indicate whether or not the version is withdrawn.""" + + reason_for_withdrawal: Optional[str] + """The reason for the withdrawal of the e-print.""" + + is_legacy: bool + """Indicate whether this record was populated from the legacy system.""" + + source: CanonicalFile + """The original user-submitted source package.""" + + render: Optional[CanonicalFile] + """ + Human-readable representation of the e-print. + + Usually a PDF generated from the source, but may also be a user-provided + PDF. 
def __init__(self, identifier: VersionedIdentifier,
             announced_date: date,
             announced_date_first: date,
             submitted_date: datetime,
             updated_date: datetime,
             metadata: Metadata,
             source: CanonicalFile,
             events: Optional[List['EventSummary']] = None,
             previous_versions: Optional[List[VersionReference]] = None,
             submitter: Optional[Person] = None,
             proxy: Optional[str] = None,
             is_announced: bool = False,
             is_withdrawn: bool = False,
             is_legacy: bool = False,
             reason_for_withdrawal: Optional[str] = None,
             source_type: Optional[SourceType] = None,
             render: Optional[CanonicalFile] = None,
             formats: Optional[Dict[ContentType, CanonicalFile]] = None) \
        -> None:
    """
    Initialize a version with its identifying, temporal, and content data.

    ``events`` and ``previous_versions`` default to new empty lists when
    omitted (or falsy). ``formats`` defaults to a new empty dict per
    instance when omitted; a dict passed by the caller is stored as-is.
    """
    self.identifier = identifier
    self.announced_date = announced_date
    self.announced_date_first = announced_date_first
    self.submitted_date = submitted_date
    self.updated_date = updated_date
    self.metadata = metadata
    self.events = events or []
    self.previous_versions = previous_versions or []
    self.submitter = submitter
    self.proxy = proxy
    self.is_announced = is_announced
    self.is_withdrawn = is_withdrawn
    self.reason_for_withdrawal = reason_for_withdrawal
    self.is_legacy = is_legacy
    self.render = render
    self.source = source
    self.source_type = source_type
    # A ``{}`` default in the signature would be created once and shared by
    # every instance that omits ``formats``; build a fresh dict instead.
    self.formats = {} if formats is None else formats
@property
def number_of_events(self) -> Literal[0]:
    """Number of events described by this object (always 0)."""
    return 0
@property
def size_kilobytes(self) -> int:
    """
    Size of the source package in kilobytes.

    Computed from ``self.source.size_bytes`` using 1,024 bytes per kilobyte
    and rounded to the nearest integer with the built-in :func:`round`.
    """
    assert self.source is not None
    # The divisor was previously 1_028, which under-reports the size.
    return int(round(self.source.size_bytes / 1_024))
class EventType(Enum):
    """Supported event types."""

    NEW = 'new'
    UPDATED = 'update'
    UPDATED_METADATA = 'update_metadata'
    REPLACED = 'replace'
    CROSSLIST = 'cross'
    JREF = 'jref'    # Deprecated.
    WITHDRAWN = 'withdraw'
    MIGRATE = 'migrate'
    MIGRATE_METADATA = 'migrate_metadata'

    @property
    def is_new_version(self) -> bool:
        """
        Indicate whether or not this event type results in a new version.

        True for ``NEW``, ``REPLACED``, and ``WITHDRAWN``; False otherwise.
        """
        # Members are looked up on the class rather than through ``self``:
        # member-to-member access (``self.NEW``) is deprecated on some Python
        # versions and emits a DeprecationWarning there.
        return self in (EventType.NEW, EventType.REPLACED,
                        EventType.WITHDRAWN)
def __init__(self, identifier: VersionedIdentifier,
             event_date: datetime,
             event_type: EventType,
             categories: Optional[List[Category]] = None,
             description: str = '',
             is_legacy: bool = False,
             event_agent: Optional[str] = None) -> None:
    """
    Set the core attributes shared by events and event summaries.

    When ``categories`` is ``None`` a new empty list is used; any list that
    is passed in (including an empty one) is stored as-is.
    """
    self.identifier = identifier
    self.event_date = event_date
    self.event_type = event_type
    self.categories = [] if categories is None else categories
    self.description = description
    self.is_legacy = is_legacy
    self.event_agent = event_agent
def to_dict(self) -> Dict[str, Any]:
    """
    Serialize this event, including its version, to a native dict.

    The identifier and categories are stringified, the event date is written
    in ISO-8601 form, and the event type is reduced to its enum value.
    """
    serialized: Dict[str, Any] = {}
    serialized['identifier'] = str(self.identifier)
    serialized['event_date'] = self.event_date.isoformat()
    serialized['event_type'] = self.event_type.value
    serialized['categories'] = [str(cat) for cat in self.categories]
    serialized['version'] = self.version.to_dict()
    serialized['description'] = self.description
    serialized['is_legacy'] = self.is_legacy
    serialized['event_agent'] = self.event_agent
    serialized['event_id'] = self.event_id
    return serialized
def to_dict(self) -> Dict[str, Any]:
    """
    Serialize this summary to a native dict.

    Mirrors the event representation minus the version state: identifier
    and categories are stringified, the date is ISO-8601, and the event type
    is written as its enum value.
    """
    category_names = [str(cat) for cat in self.categories]
    return dict([
        ('identifier', str(self.identifier)),
        ('event_date', self.event_date.isoformat()),
        ('event_type', self.event_type.value),
        ('categories', category_names),
        ('description', self.description),
        ('is_legacy', self.is_legacy),
        ('event_agent', self.event_agent),
        ('event_id', self.event_id),
    ])
The classes herein generate +and validate checksums, and generate manifests. + +In order to efficiently verify the completeness and integrity of the record (or +a replica of the record), and to identify the source of inconsistencies, +consistency checks are performed at several levels of granularity (e.g. entry, +day, month, year, global). The completeness and integrity of all or a part of +the arXiv collection can be verified by comparing the checksum values at the +corresponding level of granularity. + +The way in which checksum values are calculated for each level is described +below. This is inspired by the strategy for checksum validation of large +chunked uploads to Amazon S3. All checksum values are md5 hashes, stored and +transmitted as URL-safe base64-encoded strings. + ++---------+-------------------------+------------------+----------------------+ +| Level | Contents | Completeness | Integrity | ++=========+=========================+==================+======================+ +| File | Binary data. | Presence/absence | Hash of binary file | +| | | of descriptor. | content. | ++---------+-------------------------+------------------+----------------------+ +| Version | Collection of metadata, | Presence | Hash of concatenated | +| | source, and render | of files. | (sorted by name) | +| | files. | | file hashes. | ++---------+-------------------------+------------------+----------------------+ +| E-Print | One or more sequential | Presence of | Hash of concatenated | +| | versions | version records. | (sorted) version | +| | | | hashes. | ++---------+-------------------------+------------------+----------------------+ +| Day | All e-prints the first | Presence of | Hash of concatenated | +| | version of which was | e-print records. | (sorted) e-print | +| | announced on this day. | | hashes. 
| ++---------+-------------------------+------------------+----------------------+ +| Month | All e-prints the first | Presence of day | Hash of concatenated | +| | version of which was | records. | (sorted) day hashes. | +| | announced in this | | | +| | month. | | | ++---------+-------------------------+------------------+----------------------+ +| Year | All e-prints the first | Presence of | Hash of concatenated | +| | version of which was | month records. | (sorted) month | +| | announced in this | | hashes. | +| | year. | | | ++---------+-------------------------+------------------+----------------------+ +| All | All e-prints. | Presence of year | Hash of concatenated | +| | | records. | (sorted) year | +| | | | hashes. | ++---------+-------------------------+------------------+----------------------+ + +The same hierarchy is used for listing files, where the terminal bitstream +is the binary serialized manifest. + +A global integrity collection, :class:`.Integrity` draws together the +e-print and listing hierarchies into a final, composite level. 
+""" + +from typing import Union + +from ..util import GenericMonoDict + +from .checksum import calculate_checksum +from .core import (IntegrityBase, IntegrityEntry, IntegrityEntryBase, + IntegrityEntryMembers, R) +from .listing import (IntegrityListing, IntegrityListingDay, + IntegrityListingMonth, IntegrityListingYear, + IntegrityListings) +from .metadata import IntegrityMetadata +from .version import (IntegrityVersion, IntegrityEPrint, IntegrityDay, + IntegrityMonth, IntegrityYear, IntegrityEPrints) + + +__all__ = ( + 'Integrity', + 'IntegrityBase', + 'IntegrityDay', + 'IntegrityEntryBase', + 'IntegrityEntryMembers', + 'IntegrityEPrint', + 'IntegrityEPrints', + 'IntegrityListing', + 'IntegrityListingDay', + 'IntegrityListingMonth', + 'IntegrityListingYear', + 'IntegrityListings', + 'IntegrityMetadata', + 'IntegrityMonth', + 'IntegrityVersion', + 'IntegrityYear', +) + + +TopLevelCollection = Union[IntegrityEPrints, IntegrityListings] + + +class TopLevelMembers(GenericMonoDict[str, TopLevelCollection]): + """ + A dict that returns only top level members. + + Consistent with + ``Mapping[str, Union[IntegrityEPrints, IntegrityListings]]``. 
def checksum_io(content: IO[bytes]) -> str:
    """
    Generate an URL-safe base64-encoded md5 hash of an IO.

    The stream is read in 4096-byte chunks until ``read`` returns ``b""``.
    When the stream reports itself as seekable it is rewound to the start
    before hashing and again afterwards, so that subsequent readers see the
    full content. Unseekable streams are hashed from their current position
    and left exhausted.
    """
    # ``seekable`` is a method; testing the bare attribute is always truthy
    # and would attempt to seek on streams that cannot.
    can_seek = content.seekable()
    if can_seek:
        content.seek(0)     # Make sure that we are at the start of the stream.
    hash_md5 = md5()
    for chunk in iter(lambda: content.read(4096), b""):
        hash_md5.update(chunk)
    if can_seek:
        content.seek(0)     # Be a good neighbor for subsequent users.
    return urlsafe_b64encode(hash_md5.digest()).decode('utf-8')
To learn more about TypeVars and Generics, see +# https://mypy.readthedocs.io/en/latest/generics.html +_Name = TypeVar('_Name') +_Record = TypeVar('_Record', bound=Union[R.RecordBase, R.RecordEntry]) +_MemberName = TypeVar('_MemberName') +_Member = TypeVar('_Member', bound=Optional['IntegrityBase']) +_Self = TypeVar('_Self', bound='IntegrityBase') + + +class IntegrityEntryMembers(GenericMonoDict[str, 'IntegrityEntry']): + """ + A dict that returns only :class: `.IntegrityEntry` instances. + + Consistent with ``Mapping[str, IntegrityEntry]``. + """ + + def __getitem__(self, key: str) -> 'IntegrityEntry': + value = dict.__getitem__(self, key) + assert isinstance(value, IntegrityEntry) + return value + + +class IntegrityBase(Generic[_Name, _Record, _MemberName, _Member]): + """ + Generic base class for all integrity collections. + + Provides a uniform protocol for integrity collections, while allowing + the name, record, member name, and member types to vary from subclass + to subclass. + """ + + member_type: Type[_Member] + """The type of members contained by an instance of a register class.""" + + def __init__(self, name: _Name, + record: Optional[_Record] = None, + members: Optional[Mapping[_MemberName, _Member]] = None, + manifest: Optional[Manifest] = None, + checksum: Optional[str] = None) -> None: + self._manifest = manifest + self._checksum = checksum + self._members = members + self._record = record + self.name = name + + @classmethod + def from_record(cls: Type[_Self], record: _Record, + checksum: Optional[str] = None, + calculate_new_checksum: bool = True) -> _Self: + members = { + key: cls.member_type.from_record(member_record, + calculate_new_checksum=True) + for key, member_record in record.members.items() + } + manifest = cls.make_manifest(members) + if calculate_new_checksum: + checksum = calculate_checksum(manifest) + return cls(record.name, record=record, members=members, + manifest=manifest, checksum=checksum) + + @classmethod + def 
@property
def checksum(self) -> str:
    """
    The stored checksum of this integrity collection.

    Raises :class:`RuntimeError` if no checksum has been set.
    """
    value = self._checksum
    if value is not None:
        return value
    raise RuntimeError(f'Missing checksum for {self}')
def extend_manifest(self, member: _Member) -> None:
    """
    Append a manifest entry for ``member`` and refresh the checksum.

    The entry is produced by ``make_manifest_entry``; its version and event
    counts are added to the manifest totals, and the per-event-type counts
    are added for every type already tracked by the manifest. Finally the
    checksum of this collection is recalculated.
    """
    entry = self.make_manifest_entry(member)
    manifest = self.manifest
    manifest['entries'].append(entry)
    manifest['number_of_versions'] += entry['number_of_versions']
    manifest['number_of_events'] += entry['number_of_events']
    totals_by_type = manifest['number_of_events_by_type']
    entry_by_type = entry['number_of_events_by_type']
    for key in totals_by_type:
        # An entry may not report every event type; count a missing type as
        # zero, as ``make_manifest`` does, rather than raising KeyError.
        totals_by_type[key] += entry_by_type.get(key, 0)
    self.update_checksum()
+ self.extend_manifest(member) + + +class IntegrityEntryBase(IntegrityBase[str, _Record, None, None]): + record_type: Type[_Record] + + +class IntegrityEntry(IntegrityEntryBase[R.RecordEntry]): + """Integrity concept for a single entry in the record.""" + + record_type = R.RecordEntry + + @classmethod + def from_record(cls: Type[_Self], record: R.RecordEntry, + checksum: Optional[str] = None, + calculate_new_checksum: bool = True) -> _Self: + """Generate an :class:`.IntegrityEntry` from a :class:`.RecordEntry.""" + if calculate_new_checksum: + checksum = calculate_checksum(record.stream) + return cls(name=record.key, record=record, checksum=checksum) + + # This is redefined since the entry has no manifest; the record entry is + # used instead. + def calculate_checksum(self) -> str: + return calculate_checksum(self.record.stream) \ No newline at end of file diff --git a/arxiv/canonical/integrity/exceptions.py b/arxiv/canonical/integrity/exceptions.py new file mode 100644 index 0000000..71bff13 --- /dev/null +++ b/arxiv/canonical/integrity/exceptions.py @@ -0,0 +1,8 @@ + + +class ValidationError(Exception): + """A data consistency problem was encountered.""" + + +class ChecksumError(ValidationError): + """An unexpected checksum value was encountered.""" diff --git a/arxiv/canonical/integrity/listing.py b/arxiv/canonical/integrity/listing.py new file mode 100644 index 0000000..acc63ca --- /dev/null +++ b/arxiv/canonical/integrity/listing.py @@ -0,0 +1,114 @@ + +from datetime import date +from typing import Optional, Type + +from ..manifest import ManifestEntry, Manifest + +from .core import (IntegrityBase, IntegrityEntryBase, D, R, _Self, + Year, Month, YearMonth, calculate_checksum) + + + +class IntegrityListing(IntegrityEntryBase[R.RecordListing]): + + record_type = R.RecordListing + + @classmethod + def from_record(cls: Type[_Self], record: R.RecordListing, + checksum: Optional[str] = None, + calculate_new_checksum: bool = True) -> _Self: + """Make an 
class IntegrityListingDay(IntegrityBase[date,
                                        R.RecordListingDay,
                                        D.ListingIdentifier,
                                        IntegrityListing]):
    """Integrity collection of listings for a single day."""

    @classmethod
    def make_manifest_entry(cls, member: IntegrityListing) -> ManifestEntry:
        """
        Build the manifest entry for a single listing.

        Besides the key and checksum, the entry records the size and MIME
        type of the listing stream and the event counts of the listing. A
        listing contributes no versions.
        """
        assert isinstance(member.record.domain, D.Listing)
        stream = member.record.stream
        listing = member.record.domain
        return ManifestEntry(
            key=member.manifest_name,
            checksum=member.checksum,
            size_bytes=stream.size_bytes,
            mime_type=stream.content_type.mime_type,
            number_of_versions=0,
            number_of_events=len(listing.events),
            number_of_events_by_type=listing.number_of_events_by_type
        )

    @property
    def manifest_name(self) -> str:
        """The name to use for this record in a parent manifest."""
        return self.name.isoformat()

    @classmethod
    def from_record(cls: Type[_Self], record: R.RecordListingDay,
                    checksum: Optional[str] = None,
                    calculate_new_checksum: bool = True) -> _Self:
        """
        Make an :class:`.IntegrityListingDay` from a
        :class:`.RecordListingDay`.

        Each member of the record becomes an :class:`.IntegrityListing`. A
        manifest is built from those members and, unless
        ``calculate_new_checksum`` is false, the checksum is calculated from
        that manifest (otherwise the ``checksum`` passed in is used).
        """
        members = {name: IntegrityListing.from_record(record.members[name])
                   for name in record.members}
        manifest = cls.make_manifest(members)
        if calculate_new_checksum:
            checksum = calculate_checksum(manifest)
        assert not isinstance(checksum, bool)
        # Keep a reference to the record, as ``IntegrityBase.from_record``
        # does, so that ``self.record`` is available on the result.
        return cls(record.name, record=record, members=members,
                   manifest=manifest, checksum=checksum)
R.RecordMetadata, + checksum: Optional[str] = None, + calculate_new_checksum: bool = True) -> _Self: + if calculate_new_checksum: + checksum = calculate_checksum(record.stream) + return cls(name=record.key, record=record, checksum=checksum) + + # This is redefined since the entry has no manifest; the record entry is + # used instead. + def calculate_checksum(self) -> str: + return calculate_checksum(self.record.stream) \ No newline at end of file diff --git a/arxiv/canonical/integrity/preservation.py b/arxiv/canonical/integrity/preservation.py new file mode 100644 index 0000000..cf7d661 --- /dev/null +++ b/arxiv/canonical/integrity/preservation.py @@ -0,0 +1,3 @@ +"""Checksums and manifests for the daily preservation record.""" + +# TODO: implement me! \ No newline at end of file diff --git a/arxiv/canonical/integrity/tests/__init__.py b/arxiv/canonical/integrity/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/arxiv/canonical/integrity/tests/test_version.py b/arxiv/canonical/integrity/tests/test_version.py new file mode 100644 index 0000000..8cca523 --- /dev/null +++ b/arxiv/canonical/integrity/tests/test_version.py @@ -0,0 +1,190 @@ +import io +import json +import os +import tempfile +from datetime import datetime +from pprint import pprint +from pytz import UTC +from typing import IO + +from unittest import TestCase, mock + +from ..version import IntegrityVersion, IntegrityEPrint, R, D + + +def fake_dereferencer(uri: D.URI) -> IO[bytes]: + """Simulates a dereferencer for canonical URIs.""" + return io.BytesIO(b'fake content for ' + uri.encode('utf-8')) + + +class TestIntegrityVersion(TestCase): + def setUp(self): + """We have a RecordVersion...""" + self.identifier = D.VersionedIdentifier('2901.00345v1') + created = datetime(2029, 1, 29, 20, 4, 23, tzinfo=UTC) + self.version = D.Version( + identifier=self.identifier, + announced_date=created.date(), + announced_date_first=created.date(), + submitted_date=created, + updated_date=created, 
+ is_announced=True, + events=[], + previous_versions=[], + metadata=D.Metadata( + primary_classification=D.Category('cs.DL'), + secondary_classification=[D.Category('cs.IR')], + title='Foo title', + abstract='It is abstract', + authors='Ima N. Author (FSU)', + license=D.License(href="http://some.license") + ), + source=D.CanonicalFile( + filename='2901.00345v1.tar', + modified=created, + size_bytes=4_304, + content_type=D.ContentType.tar, + ref=D.URI('/fake/path.tar'), + is_gzipped=False, + ), + render=D.CanonicalFile( + filename='2901.00345v1.pdf', + modified=created, + size_bytes=404, + content_type=D.ContentType.pdf, + ref=D.URI('/fake/path.pdf') + ) + ) + self.record = R.RecordVersion.from_domain(self.version, + fake_dereferencer) + + def test_manifest(self): + """IntegrityVersion makes a manifest from an IntegrityRecord.""" + integrity = IntegrityVersion.from_record(self.record) + expected_entries = sorted([ + {'key': + 'arxiv:///e-prints/2029/01/2901.00345/v1/2901.00345v1.json', + 'checksum': 'xLOiGxEmoytrXeB7Nw3lHw==', + 'size_bytes': 1187, + 'mime_type': 'application/json'}, + {'key': 'arxiv:///e-prints/2029/01/2901.00345/v1/2901.00345v1.pdf', + 'checksum': '7OdqCRhN09_flc5fVUZ1Tg==', + 'size_bytes': 404, + 'mime_type': 'application/pdf'}, + {'key': + 'arxiv:///e-prints/2029/01/2901.00345/v1/2901.00345v1.tar', + 'checksum': '1GR0xuZYavi6N04v3-1wIw==', + 'size_bytes': 4304, + 'mime_type': 'application/x-tar'} + ], key=lambda e: e['key']) + + manifest_entries = sorted(integrity.manifest['entries'], + key=lambda e: e['key']) + + self.assertListEqual( + [e['key'] for e in manifest_entries], + [e['key'] for e in expected_entries], + 'Manifest contains the expected keys' + ) + self.assertListEqual( # Exclude the abs file. 
+ [e['size_bytes'] for e in manifest_entries + if not e['key'].endswith('2901.00345v1.json')], + [e['size_bytes'] for e in expected_entries + if not e['key'].endswith('2901.00345v1.json')], + 'Manifest contains the expected sizes' + ) + self.assertListEqual( + [e['mime_type'] for e in manifest_entries], + [e['mime_type'] for e in expected_entries], + 'Manifest contains the mime types.' + ) + self.assertEqual(integrity.manifest['number_of_versions'], 1, + 'One version is included in the manifest') + + def test_checksum(self): + """A checksum is calculated for the whole Version.""" + integrity = IntegrityVersion.from_record(self.record) + self.assertIsNotNone(integrity.checksum) + + +class TestIntegrityEPrint(TestCase): + def setUp(self): + """We have a RecordEPrint...""" + self.identifier = D.VersionedIdentifier('2901.00345v1') + created = datetime(2029, 1, 29, 20, 4, 23, tzinfo=UTC) + self.version = D.Version( + identifier=self.identifier, + announced_date=created.date(), + announced_date_first=created.date(), + submitted_date=created, + updated_date=created, + is_announced=True, + events=[], + previous_versions=[], + metadata=D.Metadata( + primary_classification=D.Category('cs.DL'), + secondary_classification=[D.Category('cs.IR')], + title='Foo title', + abstract='It is abstract', + authors='Ima N. 
Author (FSU)', + license=D.License(href="http://some.license") + ), + source=D.CanonicalFile( + filename='2901.00345v1.tar', + modified=created, + size_bytes=4_304, + content_type=D.ContentType.tar, + ref=D.URI('/fake/path.tar'), + is_gzipped=True + ), + render=D.CanonicalFile( + filename='2901.00345v1.pdf', + modified=created, + size_bytes=404, + content_type=D.ContentType.pdf, + ref=D.URI('/fake/path.pdf') + ) + ) + self.eprint = D.EPrint(self.identifier.arxiv_id, + versions={self.identifier: self.version}) + self.record = R.RecordEPrint( + self.identifier.arxiv_id, + members={ + self.identifier: + R.RecordVersion.from_domain(self.version, + fake_dereferencer) + }, + domain=self.eprint + ) + + def test_checksum(self): + """A checksum is calculated for the whole EPrint.""" + integrity = IntegrityEPrint.from_record(self.record) + self.assertIsNotNone(integrity.checksum) + + def test_manifest(self): + """A manifest is generated for the EPrint.""" + integrity = IntegrityEPrint.from_record(self.record) + expected_entries = [ + {'key': '2901.00345v1', + 'checksum': 'Nodg72IZ_8yIBJ9p6Y5DcQ==', + 'number_of_versions': 1, + 'number_of_events': 0, + 'number_of_events_by_type': {}} + ] + self.assertEqual(len(integrity.manifest['entries']), + len(expected_entries), + 'There is one entry') + self.assertEqual(integrity.manifest['entries'][0]['key'], + expected_entries[0]['key'], + 'Expected key is present') + self.assertEqual( + integrity.manifest['entries'][0]['number_of_versions'], + expected_entries[0]['number_of_versions'], + 'Number of versions is correct' + ) + + def test_members(self): + integrity = IntegrityEPrint.from_record(self.record) + self.assertEqual(integrity.members[self.identifier].record.domain, + self.version, 'Original Version is preserved') \ No newline at end of file diff --git a/arxiv/canonical/integrity/version.py b/arxiv/canonical/integrity/version.py new file mode 100644 index 0000000..89315b1 --- /dev/null +++ b/arxiv/canonical/integrity/version.py 
@@ -0,0 +1,235 @@ + +from datetime import date +from typing import Dict, Mapping, Optional, Type, Union + +from ..manifest import ManifestEntry, Manifest, checksum_from_manifest + +from .core import (IntegrityBase, IntegrityEntryBase, IntegrityEntryMembers, + IntegrityEntry, D, R, _Self, Year, Month, YearMonth, + calculate_checksum, GenericMonoDict) +from .metadata import IntegrityMetadata + +_VersionMember = Union[IntegrityEntry, IntegrityMetadata] + + +class IntegrityVersionMembers(GenericMonoDict[str, _VersionMember]): + """Member mapping that supports IntegrityEntry and IntegrityMetadata.""" + + def __getitem__(self, key: str) -> _VersionMember: + value = dict.__getitem__(self, key) + assert isinstance(value, (IntegrityEntry, IntegrityMetadata)) + return value + + +class IntegrityVersion(IntegrityBase[D.VersionedIdentifier, + R.RecordVersion, + str, + _VersionMember]): + """Integrity collection for an e-print version.""" + + @classmethod + def from_record(cls: Type[_Self], version: R.RecordVersion, + checksum: Optional[str] = None, + calculate_new_checksum: bool = True, + manifest: Optional[Manifest] = None) -> _Self: + """ + Get an :class:`.IntegrityVersion` from a :class:`.RecordVersion`. + + Parameters + ---------- + version : :class:`.RecordVersion` + The record for which this integrity object is to be generated. + checksum : str or None + manifest : dict + If provided, checksum values for member files will be retrieved + from this manifest. Otherwise they will be calculated from the + file content. + calculate_new_checksum : bool + If ``True``, a new checksum will be calculated from the manifest. 
+ + Returns + ------- + :class:`.IntegrityVersion` + + """ + calculate_new_checksum_for_members = bool(manifest is None) + render_checksum: Optional[str] = None + source_checksum: Optional[str] = None + format_checksums: Dict[D.ContentType, Optional[str]] = {} + if manifest: + source_checksum = checksum_from_manifest( + manifest, + R.RecordVersion.make_key( + version.identifier, + version.source.domain.filename + ) + ) + format_checksums = { + fmt: checksum_from_manifest( + manifest, + R.RecordVersion.make_key(version.identifier, + cf.domain.filename) + ) for fmt, cf in version.formats.items() + } + if version.render: + render_checksum = checksum_from_manifest( + manifest, + R.RecordVersion.make_key( + version.identifier, + version.render.domain.filename + ) + ) + formats = { + fmt.value: IntegrityEntry.from_record( + cf, + checksum=format_checksums.get(fmt), + calculate_new_checksum=calculate_new_checksum_for_members + ) for fmt, cf in version.formats.items() + } + if version.render: + formats['render'] = IntegrityEntry.from_record( + version.render, + checksum=render_checksum, + calculate_new_checksum=calculate_new_checksum_for_members + ) + members = IntegrityVersionMembers( + metadata=IntegrityMetadata.from_record(version.metadata), + source=IntegrityEntry.from_record( + version.source, + checksum=source_checksum, + calculate_new_checksum=calculate_new_checksum_for_members + ), + **formats + ) + manifest = cls.make_manifest(members) + if calculate_new_checksum: + checksum = calculate_checksum(manifest) + return cls(version.identifier, record=version, members=members, + manifest=manifest, checksum=checksum) + + @classmethod + def make_manifest(cls, members: Mapping[str, _VersionMember]) -> Manifest: + """Make a :class:`.Manifest` for this integrity collection.""" + return Manifest( + entries=[cls.make_manifest_entry(members[n]) for n in members], + number_of_events=0, + number_of_events_by_type={}, + number_of_versions=1 + ) + + @classmethod + def 
make_manifest_entry(cls, member: _VersionMember) -> ManifestEntry: + return ManifestEntry( + key=member.manifest_name, + checksum=member.checksum, + size_bytes=member.record.stream.size_bytes, + mime_type=member.record.stream.content_type.mime_type + ) + + @property + def metadata(self) -> IntegrityMetadata: + assert isinstance(self.members['metadata'], IntegrityMetadata) + return self.members['metadata'] + + @property + def render(self) -> Optional[IntegrityEntry]: + if 'render' in self.members: + assert isinstance(self.members['render'], IntegrityEntry) + return self.members['render'] + return None + + @property + def source(self) -> IntegrityEntry: + assert isinstance(self.members['source'], IntegrityEntry) + return self.members['source'] + + @property + def formats(self) -> Dict[D.ContentType, IntegrityEntry]: + return {D.ContentType(fmt): cf for fmt, cf in self.members.items() + if fmt not in ['metadata', 'source', 'render'] + and isinstance(cf, IntegrityEntry)} + + +class IntegrityEPrint(IntegrityBase[D.Identifier, + R.RecordEPrint, + D.VersionedIdentifier, + IntegrityVersion]): + """Integrity collection for an :class:`.EPrint`.""" + + member_type = IntegrityVersion + + @classmethod + def make_manifest_entry(cls, member: IntegrityVersion) -> ManifestEntry: + return ManifestEntry(key=member.manifest_name, + checksum=member.checksum, + number_of_versions=1, + number_of_events=0, + number_of_events_by_type={}) + + +class IntegrityDay(IntegrityBase[date, + R.RecordDay, + D.Identifier, + IntegrityEPrint]): + """ + Integrity collection for e-prints associated with a single day. + + Specifically, this includes all versions of e-prints the first version of + which was announced on this day. 
+ """ + + @property + def day(self) -> date: + """The numeric day represented by this collection.""" + return self.name + + +class IntegrityMonth(IntegrityBase[YearMonth, + R.RecordMonth, + date, + IntegrityDay]): + """ + Integrity collection for e-prints associated with a single month. + + Specifically, this includes all versions of e-prints the first version of + which was announced in this month. + """ + + @property + def manifest_name(self) -> str: + """The name to use for this record in a parent manifest.""" + return f'{self.year}-{str(self.month).zfill(2)}' + + @property + def month(self) -> Month: + """The numeric month represented by this collection.""" + return self.name[1] + + @property + def year(self) -> Year: + """The numeric year represented by this collection.""" + return self.name[0] + + +class IntegrityYear(IntegrityBase[Year, + R.RecordYear, + YearMonth, + IntegrityMonth]): + """ + Integrity collection for e-prints associated with a single year. + + Specifically, this includes all versions of e-prints the first version of + which was announced in this year. 
+ """ + + @property + def year(self) -> Year: + """The numeric year represented by this collection.""" + return self.name + + +class IntegrityEPrints(IntegrityBase[str, + R.RecordEPrints, + Year, + IntegrityYear]): + """Integrity collection for all e-prints in the canonical record.""" \ No newline at end of file diff --git a/arxiv/canonical/log/__init__.py b/arxiv/canonical/log/__init__.py new file mode 100644 index 0000000..465f46e --- /dev/null +++ b/arxiv/canonical/log/__init__.py @@ -0,0 +1,3 @@ +"""Provides the write log for the canonical record.""" + +from .log import Log, LogEntry, WRITE, READ \ No newline at end of file diff --git a/arxiv/canonical/log/log.py b/arxiv/canonical/log/log.py new file mode 100644 index 0000000..04a8081 --- /dev/null +++ b/arxiv/canonical/log/log.py @@ -0,0 +1,148 @@ +"""Provides the write log for the canonical record.""" + +import json +import os +from datetime import datetime +from typing import Iterable, Optional + +from backports.datetime_fromisoformat import MonkeyPatch +from pytz import timezone + +from .. 
import domain as D + +MonkeyPatch.patch_fromisoformat() + +Action = str +Outcome = str + +SUCCEEDED: Outcome = 'SUCCESS' +FAILED: Outcome = 'FAILED' + +DEREFERENCE: Action = 'DEREF' +READ: Action = 'READ' +WRITE: Action = 'WRITE' + +ET = timezone('US/Eastern') + + +class LogEntry: + timestamp: datetime + """The time of the log entry.""" + + event_id: D.EventIdentifier + """Identifier of the event being handled.""" + + # key: D.Key + # """Specific key being handled.""" + + action: Action + """Action being performed by the agent.""" + + state: Outcome + """Outcome of the action.""" + + message: str + """Additional unstructured information about the action.""" + + def __init__(self, timestamp: datetime, + event_id: D.EventIdentifier, + # key: D.Key, + action: Action, + state: Outcome, + message: str) -> None: + self.timestamp = timestamp + self.event_id = event_id + self.action = action + self.state = state + self.message = message + + @classmethod + def from_repr(cls, repr: str) -> 'LogEntry': + data = json.loads(repr) + return cls( + timestamp=datetime.fromisoformat(data['timestamp']), # type: ignore ; pylint: disable=no-member + event_id=D.EventIdentifier(data['event_id']), + # key=D.Key(data['key']), + action=data['action'], + state=data['state'], + message=data.get('message', '') + ) + + def __repr__(self) -> str: + return json.dumps({ + 'timestamp': self.timestamp.isoformat(), + 'event_id': self.event_id, + 'action': self.action, + 'state': self.state, + 'message': self.message + }) + + +class Log: + """Action log for a canonical agent.""" + + def __init__(self, path: str) -> None: + """Initialize with a reader and writer.""" + self.path = os.path.abspath(path) + if not os.path.exists(self.path): + raise RuntimeError(f'No such path: {self.path}') + try: + self._writer = open(self.current_log_path, 'a') + self._reader = open(self.current_log_path, 'r') + except Exception as e: + raise RuntimeError(f'Could not open {self.path} for writing') + + @property + def 
current_log_path(self) -> str: + """The path to the current log file.""" + return f'{self.path}/.{datetime.now(ET).date().isoformat()}.log' + + def write(self, + event_id: D.EventIdentifier, + action: Action, + state: Outcome, + message: str) -> LogEntry: + """Write a log entry.""" + entry = LogEntry(datetime.now(ET), + event_id, + action, + state, + message) + self._writer.write(f'{entry}\n') + self._writer.flush() # So that the reader can see what's up. + return entry + + def log_success(self, + event_id: D.EventIdentifier, + # key: D.Key, + action: Action, + message: str = '') -> LogEntry: + """Log a successful action.""" + return self.write(event_id, action, SUCCEEDED, message) + + def log_failure(self, + event_id: D.EventIdentifier, + # key: D.Key, + action: Action, + message: str = '') -> LogEntry: + """Log a failed action.""" + return self.write(event_id, action, FAILED, message) + + def read_last_entry(self) -> LogEntry: + """Read the last entry in the log.""" + entry = LogEntry.from_repr(self._reader.readlines()[-1]) + self._reader.seek(0) + return entry + + def read_last_succeeded(self) -> Optional[LogEntry]: + """Read the last SUCCEEDED entry in the log.""" + lines = self._reader.readlines() + for i in range(1, len(lines)): + entry = LogEntry.from_repr(lines[-i]) + if entry.state == SUCCEEDED: + return entry + return None + + def read_all(self) -> Iterable[LogEntry]: + for line in self._reader.readlines(): + yield LogEntry.from_repr(line) diff --git a/arxiv/canonical/log/tests.py b/arxiv/canonical/log/tests.py new file mode 100644 index 0000000..5ee1449 --- /dev/null +++ b/arxiv/canonical/log/tests.py @@ -0,0 +1,36 @@ +from datetime import datetime +import tempfile +from unittest import TestCase +from pytz import timezone +from . 
import log +from .log import D + +ET = timezone('US/Eastern') + + +class TestLog(TestCase): + def setUp(self): + """Create a new log.""" + self.path = tempfile.mkdtemp() + self.log = log.Log(self.path) + + def test_path(self): + """Log paths based on the root path and current date.""" + self.assertEqual( + self.log.current_log_path, + f'{self.path}/.{datetime.now(ET).date().isoformat()}.log' + ) + + def test_deref_success(self): + """Log a successful dereference action.""" + vid = D.VersionedIdentifier('1902.00123v3') + event_id = D.EventIdentifier.from_parts(vid, datetime.now(ET), 'foo') + key = D.Key('file:///foo/baz/bat.tar.gz') + + entry = self.log.log_success(event_id, key, log.DEREFERENCE) + + self.assertEqual(self.log.read_last_entry().__dict__, + entry.__dict__, + "Logged success is the last entry") + + diff --git a/arxiv/canonical/manifest.py b/arxiv/canonical/manifest.py new file mode 100644 index 0000000..bb29b6b --- /dev/null +++ b/arxiv/canonical/manifest.py @@ -0,0 +1,82 @@ +"""Defines the structure of manifest records, used to store integrity info.""" + +import json +from enum import Enum +from typing import Optional, List, Dict, Any +from mypy_extensions import TypedDict + +from .domain.version import EventType + + +class ManifestEntry(TypedDict, total=False): + """Structure of a single entry in a manifest.""" + + key: str + checksum: Optional[str] + size_bytes: int + mime_type: str + number_of_events: int + number_of_events_by_type: Dict[EventType, int] + number_of_versions: int + + +class Manifest(TypedDict): + """Structure of a manifest record.""" + + entries: List[ManifestEntry] + number_of_events: int + number_of_events_by_type: Dict[EventType, int] + number_of_versions: int + + +class ManifestEncoder(json.JSONEncoder): + """JSON encoder for manifests.""" + + def unpack(self, obj: Any) -> Any: + """Convert manifests and their members to native Python types.""" + if isinstance(obj, Enum): + return obj.value + elif isinstance(obj, dict): + return 
{self.unpack(key): self.unpack(value) + for key, value in obj.items()} + elif isinstance(obj, list): + return [self.unpack(item) for item in obj] + return obj + + def encode(self, obj: Any) -> Any: + """Serialize manifest objects.""" + return super(ManifestEncoder, self).encode(self.unpack(obj)) + + +class ManifestDecoder(json.JSONDecoder): + """JSON decoder for manifests.""" + + def __init__(self, *args: Any, **kwargs: Any) -> None: + """Pass :func:`object_hook` to the base constructor.""" + kwargs['object_hook'] = kwargs.get('object_hook', self.object_hook) + super(ManifestDecoder, self).__init__(*args, **kwargs) + + def object_hook(self, obj: dict, **extra: Any) -> Any: # pylint: disable=method-hidden + """Decode the manifest to domain types.""" + if 'number_of_events_by_type' in obj: + obj['number_of_events_by_type'] = { + EventType(key): value + for key, value in obj['number_of_events_by_type'].items() + } + return obj + + +def make_empty_manifest() -> Manifest: + """Generate a new empty manifest.""" + return Manifest(entries=[], + number_of_events=0, + number_of_versions=0, + number_of_events_by_type={}) + + +def checksum_from_manifest(manifest: Manifest, key: str) -> Optional[str]: + """Retrieve a checksum for a key from a manifest.""" + for entry in manifest['entries']: + if entry['key'] == key: + return entry['checksum'] + raise KeyError(f'Not found: {key}') \ No newline at end of file diff --git a/arxiv/canonical/preservation.py b/arxiv/canonical/preservation.py new file mode 100644 index 0000000..277434a --- /dev/null +++ b/arxiv/canonical/preservation.py @@ -0,0 +1,7 @@ +"""Provides the high-level API for the daily preservation record.""" + +from .core import IPreservationAPI + + +class PreservationAPI(IPreservationAPI): + """Implementation of the high-level API for the preservation record.""" \ No newline at end of file diff --git a/arxiv/canonical/record/__init__.py b/arxiv/canonical/record/__init__.py new file mode 100644 index 0000000..33e3d94 --- 
/dev/null +++ b/arxiv/canonical/record/__init__.py @@ -0,0 +1,72 @@ +""" +Defines how the canonical record is represented in a key-binary system. + +The bulk of this module is concerned with how keys for records and record +manifests are generated. + +Classes in this module are largely isomorphic to those in :mod:`.domain`. +:class:`.RecordEntry` represents content at the bitstream level, e.g. a file +containing a listings document or a render PDF. Collections of entries are +based on :class:`RecordBase`, and are composed hierarchically with the apex +at :class:`.Record`. +""" + +from typing import Union +from .core import RecordBase, RecordEntry, RecordStream, D +from .file import RecordFile +from .listing import (RecordListing, RecordListingDay, RecordListingMonth, + RecordListingYear, RecordListings) +from .metadata import RecordMetadata +from .version import (RecordVersion, RecordEPrint, RecordDay, RecordMonth, + RecordYear, RecordEPrints) + +__all__ = ( + 'Record', + 'RecordBase', + 'RecordDay', + 'RecordEntry', + 'RecordEPrint', + 'RecordEPrints', + 'RecordFile', + 'RecordListing', + 'RecordListingDay', + 'RecordListingMonth', + 'RecordListingYear', + 'RecordListings', + 'RecordMetadata', + 'RecordMonth', + 'RecordStream', + 'RecordVersion', + 'RecordYear', +) + + +class Record(RecordBase[str, + str, + Union[RecordEPrints, RecordListings], + D.Canon]): + """The apex container for the canonical record.""" + + @classmethod + def make_manifest_key(cls, _: str) -> D.Key: + """ + Make a key for global manifest. 
+ + Returns + ------- + str + + """ + return D.Key(f'global.manifest.json') + + @property + def eprints(self) -> RecordEPrints: + assert 'eprints' in self.members + assert isinstance(self.members['eprints'], RecordEPrints) + return self.members['eprints'] + + @property + def listings(self) -> RecordListings: + assert 'listings' in self.members + assert isinstance(self.members['listings'], RecordListings) + return self.members['listings'] \ No newline at end of file diff --git a/arxiv/canonical/record/core.py b/arxiv/canonical/record/core.py new file mode 100644 index 0000000..b9c4826 --- /dev/null +++ b/arxiv/canonical/record/core.py @@ -0,0 +1,116 @@ +"""Base classes and core concepts for :mod:`arxiv.canonical.record`.""" + +import datetime +import os +from abc import ABC +from io import BytesIO +from json import dumps, load +from typing import NamedTuple, List, IO, Iterator, Tuple, Optional, Dict, \ + Callable, Iterable, MutableMapping, Mapping, Generic, Type, TypeVar, \ + Union, Any + +from ..serialize.decoder import CanonicalDecoder +from ..serialize.encoder import CanonicalEncoder + +from .. import domain as D +from ..util import GenericMonoDict + +Year = int +Month = int +YearMonth = Tuple[Year, Month] + + +class RecordStream(NamedTuple): + """A single bitstream in the record.""" + + domain: D.CanonicalFile + + content: Optional[IO[bytes]] + """Raw content of the entry.""" + + content_type: D.ContentType + """MIME-type of the content.""" + + size_bytes: int + """Size of ``content`` in bytes.""" + + +class RecordEntryMembers(GenericMonoDict[str, 'RecordEntry']): + """ + A dict that returns only :class: `.RecordEntry` instances. + + Consistent with ``Mapping[str, RecordEntry]``. 
+ """ + def __getitem__(self, key: str) -> 'RecordEntry': + value = dict.__getitem__(self, key) + assert isinstance(value, RecordEntry) + return value + + +_EDomain = TypeVar('_EDomain', bound=D.CanonicalBase) +_Self = TypeVar('_Self', bound='RecordEntry') + + +class RecordEntry(Generic[_EDomain]): + """ + An entry in the canonical record. + + Comprised of a :class:`.RecordStream` and a domain representation of the + entry (i.e. the application-level interpretation of the stream). + """ + + key: D.Key + """Full key (path) at which the entry is stored.""" + domain: _EDomain + stream: RecordStream + + def __init__(self, key: D.Key, stream: RecordStream, domain: _EDomain) \ + -> None: + self.key = key + self.domain = domain + self.stream = stream + + @property + def name(self) -> str: + fname = os.path.split(self.key)[1] + return os.path.splitext(fname)[0] + + @classmethod + def from_domain(cls: Type[_Self], d: _EDomain) -> _Self: + raise NotImplementedError("Must be implemented by child class") + + @classmethod + def to_domain(cls, stream: RecordStream) -> _EDomain: + raise NotImplementedError("Must be implemented by child class") + + +# These TypeVars are used as placeholders in the generic RecordBase class, +# below. To learn more about TypeVars and Generics, see +# https://mypy.readthedocs.io/en/latest/generics.html +Name = TypeVar('Name') +MemberName = TypeVar('MemberName') +Member = TypeVar('Member', bound=Union['RecordBase', RecordEntry]) +Domain = TypeVar('Domain') + + +class RecordBase(Generic[Name, MemberName, Member, Domain]): + """ + Generic base class for record collections in this module. + + This produces a uniform protocol for record collections, while allowing + name, member, and member name types to vary across collection subclasses. 
+ """ + + def __init__(self, name: Name, + members: Mapping[MemberName, Member], + domain: Domain) -> None: + """Register the name and members of this record instance.""" + self.name = name + self.members = members + self.domain = domain + + @classmethod + def make_manifest_key(cls, name: Name) -> D.Key: # pylint: disable=unused-argument + """Generate a full key that can be used to store a manifest.""" + ... # pylint: disable=pointless-statement ; this is a stub. + diff --git a/arxiv/canonical/record/file.py b/arxiv/canonical/record/file.py new file mode 100644 index 0000000..be19d99 --- /dev/null +++ b/arxiv/canonical/record/file.py @@ -0,0 +1,5 @@ +from .core import RecordEntry, D + + +class RecordFile(RecordEntry[D.CanonicalFile]): + """An entry that is handled as an otherwise-uninterpreted file.""" \ No newline at end of file diff --git a/arxiv/canonical/record/listing.py b/arxiv/canonical/record/listing.py new file mode 100644 index 0000000..a09eca8 --- /dev/null +++ b/arxiv/canonical/record/listing.py @@ -0,0 +1,134 @@ + +import datetime +from io import BytesIO +from json import dumps, load +from typing import Type, IO, Iterable, Tuple + +from .core import RecordBase, RecordEntry, RecordStream, D, _Self, \ + Year, YearMonth + + +class RecordListing(RecordEntry[D.Listing]): + """A listing entry.""" + + @classmethod + def from_domain(cls: Type[_Self], listing: D.Listing) -> _Self: + """Serialize a :class:`.Listing`.""" + content, size_bytes = RecordListing._encode(listing) + key = RecordListing.make_key(listing.identifier) + return cls( + key=key, + stream=RecordStream( + domain=D.CanonicalFile( + modified=listing.end_datetime, + size_bytes=size_bytes, + content_type=D.ContentType.json, + filename=key.filename, + ref=key + ), + content=content, + content_type=D.ContentType.json, + size_bytes=size_bytes + ), + domain=listing + ) + + @classmethod + def from_stream(cls, key: D.Key, stream: RecordStream) -> 'RecordListing': + return cls(key=key, stream=stream, + 
domain=cls.to_domain(stream)) + + @classmethod + def make_key(cls, identifier: D.ListingIdentifier) -> D.Key: + prefix = cls.make_prefix(identifier.date) + value: str = identifier.date.strftime( + f'{prefix}/%Y-%m-%d-{identifier.name}.json' + ) + return D.Key(value) + + @classmethod + def make_prefix(cls, date: datetime.date) -> str: + return date.strftime(f'announcement/%Y/%m/%d') + + @classmethod + def to_domain(cls, stream: RecordStream) -> D.Listing: + assert stream.content is not None + listing = D.Listing.from_dict(load(stream.content), + ) + if stream.content.seekable: + stream.content.seek(0) + return listing + + @classmethod + def _encode(cls, listing: D.Listing) -> Tuple[IO[bytes], int]: + content = dumps(listing.to_dict()).encode('utf-8') + return BytesIO(content), len(content) + + @property + def created(self) -> datetime.datetime: + return self.domain.start_datetime + + @property + def name(self) -> str: + return 'listing' + + +class RecordListingDay(RecordBase[datetime.date, + D.ListingIdentifier, + RecordListing, + D.ListingDay]): + + @classmethod + def make_manifest_key(cls, date: datetime.date) -> D.Key: + return D.Key(date.strftime('announcement/%Y/%m/%Y-%m-%d.manifest.json')) + + +class RecordListingMonth(RecordBase[YearMonth, + datetime.date, + RecordListing, + D.ListingMonth]): + @classmethod + def make_manifest_key(cls, year_and_month: YearMonth) -> D.Key: + """ + Make a key for a monthly listing manifest. + + Returns + ------- + str + + """ + yr, month = year_and_month + return D.Key(f'announcement/{yr}/{yr}-{str(month).zfill(2)}.manifest.json') + + +class RecordListingYear(RecordBase[Year, + YearMonth, + RecordListingMonth, + D.ListingYear]): + + @classmethod + def make_manifest_key(cls, year: Year) -> D.Key: + """ + Make a key for a yearly listing manifest. 
+ + Returns + ------- + str + + """ + return D.Key(f'announcement/{year}.manifest.json') + + +class RecordListings(RecordBase[str, Year, RecordListingYear, D.AllListings]): + + @classmethod + def make_manifest_key(cls, _: str) -> D.Key: + """ + Make a key for a root listing manifest. + + Returns + ------- + str + + """ + return D.Key('announcement.manifest.json') \ No newline at end of file diff --git a/arxiv/canonical/record/metadata.py b/arxiv/canonical/record/metadata.py new file mode 100644 index 0000000..2feffa2 --- /dev/null +++ b/arxiv/canonical/record/metadata.py @@ -0,0 +1,81 @@ +import datetime +from io import BytesIO +from json import dumps, load +from typing import Type, IO, Iterable, Tuple + +from .core import RecordEntry, RecordStream, D, _Self + + +class RecordMetadata(RecordEntry[D.Version]): + """An entry for version metadata.""" + + @classmethod + def make_key(cls, identifier: D.VersionedIdentifier) -> D.Key: + if identifier.is_old_style: + filename = f'{identifier.numeric_part}v{identifier.version}.json' + else: + filename = f'{identifier}.json' + return D.Key(f'{cls.make_prefix(identifier)}/{filename}') + + @classmethod + def make_prefix(cls, ident: D.VersionedIdentifier) -> str: + """ + Make a key prefix for an e-print record. + + Parameters + ---------- + date : datetime.date + The day on which the first version of the e-print was announced. 
+ ident : str + arXiv identifier + + Returns + ------- + str + + """ + date_part = f'e-prints/{ident.year}/{str(ident.month).zfill(2)}' + if ident.is_old_style: + return f'{date_part}/{ident.category_part}/{ident.numeric_part}/v{ident.version}' + return f'{date_part}/{ident.arxiv_id}/v{ident.version}' + + @classmethod + def from_domain(cls: Type[_Self], version: D.Version) -> _Self: + content, size_bytes = RecordMetadata._encode(version, + ) + content_type = D.ContentType.json + key = RecordMetadata.make_key(version.identifier) + return cls( + key=key, + stream=RecordStream( + domain=D.CanonicalFile( + modified=version.updated_date, + size_bytes=size_bytes, + content_type=content_type, + ref=key, + filename=key.filename + ), + content=content, + content_type=D.ContentType.json, + size_bytes=size_bytes + ), + domain=version + ) + + @classmethod + def _encode(cls, version: D.Version) -> Tuple[IO[bytes], int]: + content = dumps(version.to_dict(), indent=2).encode('utf-8') + return BytesIO(content), len(content) + + @classmethod + def to_domain(cls, stream: RecordStream) -> D.Version: + assert stream.content is not None + version = D.Version.from_dict(load(stream.content), + ) + if stream.content.seekable: + stream.content.seek(0) + return version # RecordVersion.post_to_domain(version, load_content) + + @classmethod + def from_stream(cls, key: D.Key, stream: RecordStream) -> 'RecordMetadata': + return cls(key=key, stream=stream, domain=cls.to_domain(stream)) \ No newline at end of file diff --git a/arxiv/canonical/record/preservation.py b/arxiv/canonical/record/preservation.py new file mode 100644 index 0000000..c9cfe37 --- /dev/null +++ b/arxiv/canonical/record/preservation.py @@ -0,0 +1,3 @@ +"""Key generation for elements of the daily preservation record.""" + +# TODO: implement me! 
\ No newline at end of file diff --git a/arxiv/canonical/record/tests/__init__.py b/arxiv/canonical/record/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/arxiv/canonical/record/tests/test_listing.py b/arxiv/canonical/record/tests/test_listing.py new file mode 100644 index 0000000..22c5ed7 --- /dev/null +++ b/arxiv/canonical/record/tests/test_listing.py @@ -0,0 +1,105 @@ +import io +import json +import os +import tempfile +from datetime import datetime +from pytz import UTC +from typing import IO +from unittest import TestCase, mock + +import jsonschema + +from ..core import RecordEntry +from ..listing import RecordListing +from ..version import RecordVersion, D + + +def fake_dereferencer(uri: D.URI) -> IO[bytes]: + """Simulates a dereferencer for canonical URIs.""" + return io.BytesIO(b'fake content for ' + uri.encode('utf-8')) + + +class TestRecordListing(TestCase): + """RecordListing provides keys and serialization for Listings.""" + + SCHEMA_PATH = os.path.abspath('schema/resources/Listing.json') + + def setUp(self): + """We have a Listing...""" + with open(self.SCHEMA_PATH) as f: + self.schema = json.load(f) + + self.resolver = jsonschema.RefResolver( + 'file://%s/' % os.path.abspath(os.path.dirname(self.SCHEMA_PATH)), + None + ) + + self.identifier = D.VersionedIdentifier('2901.00345v1') + self.created = datetime(2029, 1, 29, 20, 4, 23, tzinfo=UTC) + self.listing_id = D.ListingIdentifier.from_parts(self.created.date(), + 'foo') + + self.version = D.Version( + identifier=self.identifier, + announced_date=self.created.date(), + announced_date_first=self.created.date(), + submitted_date=self.created, + updated_date=self.created, + is_announced=True, + events=[], + previous_versions=[], + metadata=D.Metadata( + primary_classification=D.Category('cs.DL'), + secondary_classification=[D.Category('cs.IR')], + title='Foo title', + abstract='It is abstract', + authors='Ima N. 
Author (FSU)', + license=D.License(href="http://some.license") + ), + source=D.CanonicalFile( + filename='2901.00345v1.tar', + modified=self.created, + size_bytes=4_304, + content_type=D.ContentType.tar, + ref=D.URI('/fake/path.tar'), + is_gzipped=False, + ), + render=D.CanonicalFile( + filename='2901.00345v1.pdf', + modified=self.created, + size_bytes=404, + content_type=D.ContentType.pdf, + ref=D.URI('/fake/path.pdf') + ) + ) + self.event = D.Event( + identifier=self.identifier, + event_date=self.created, + event_type=D.EventType.NEW, + categories=[D.Category('cs.DL')], + version=self.version + ) + self.listing = D.Listing(self.listing_id, [self.event]) + + def test_from_domain(self): + """Can load a RecordListing from a Listing domain object.""" + record = RecordListing.from_domain(self.listing) + self.assertEqual( + record.key, + 'arxiv:///announcement/2029/01/29/2029-01-29-foo.json', + 'Key for listing file is generated correctly' + ) + self.assertEqual(record.key, record.stream.domain.ref) + self.assertEqual(record.stream.content_type, D.ContentType.json, + 'Correctly identified as a JSON resource') + + def test_schema(self): + """Serialized record is schema compliant.""" + record = RecordListing.from_domain(self.listing) + raw = json.load(record.stream.content) + jsonschema.validate(raw, self.schema, resolver=self.resolver) + + def test_to_domain(self): + """Re-casting to domain should preserve state.""" + record = RecordListing.from_domain(self.listing) + self.assertEqual(RecordListing.to_domain(record.stream), self.listing) diff --git a/arxiv/canonical/record/tests/test_version.py b/arxiv/canonical/record/tests/test_version.py new file mode 100644 index 0000000..7de9442 --- /dev/null +++ b/arxiv/canonical/record/tests/test_version.py @@ -0,0 +1,124 @@ + +import io +import json +import os +import tempfile +from datetime import datetime +from pprint import pprint +from pytz import UTC +from typing import IO +from unittest import TestCase, mock + +import 
jsonschema + +from ..core import RecordEntry +from ..metadata import RecordMetadata +from ..version import RecordVersion, D + + +def fake_dereferencer(uri: D.URI) -> IO[bytes]: + """Simulates a dereferencer for canonical URIs.""" + return io.BytesIO(b'fake content for ' + uri.encode('utf-8')) + + +class TestRecordVersion(TestCase): + """RecordVersion provides keys and serialization for Versions.""" + + SCHEMA_PATH = os.path.abspath('schema/resources/Version.json') + + def setUp(self): + """We have a Version...""" + with open(self.SCHEMA_PATH) as f: + self.schema = json.load(f) + + self.resolver = jsonschema.RefResolver( + 'file://%s/' % os.path.abspath(os.path.dirname(self.SCHEMA_PATH)), + None + ) + + self.identifier = D.VersionedIdentifier('2901.00345v1') + created = datetime(2029, 1, 29, 20, 4, 23, tzinfo=UTC) + self.version = D.Version( + identifier=self.identifier, + announced_date=created.date(), + announced_date_first=created.date(), + submitted_date=created, + updated_date=created, + is_announced=True, + events=[], + previous_versions=[], + metadata=D.Metadata( + primary_classification=D.Category('cs.DL'), + secondary_classification=[D.Category('cs.IR')], + title='Foo title', + abstract='It is abstract', + authors='Ima N. 
Author (FSU)', + license=D.License(href="http://some.license") + ), + source=D.CanonicalFile( + filename='2901.00345v1.tar', + modified=created, + size_bytes=4_304, + content_type=D.ContentType.tar, + ref=D.URI('/fake/path.tar'), + is_gzipped=False, + ), + render=D.CanonicalFile( + filename='2901.00345v1.pdf', + modified=created, + size_bytes=404, + content_type=D.ContentType.pdf, + ref=D.URI('/fake/path.pdf') + ) + ) + + def test_identifier(self): + """RecordVersion exposes the identifier of the domain object.""" + record = RecordVersion.from_domain(self.version, fake_dereferencer) + self.assertEqual(record.identifier, self.identifier, + 'Version identifier is accessible') + + def test_from_domain(self): + """Can load a RecordVersion from a Version domain object.""" + record = RecordVersion.from_domain(self.version, fake_dereferencer) + self.assertTrue(record.metadata.key.is_canonical) + self.assertEqual( + record.metadata.key, + 'arxiv:///e-prints/2029/01/2901.00345/v1/2901.00345v1.json', + 'Key for metadadata record is generated correctly' + ) + + self.assertTrue(record.render.key.is_canonical) + self.assertEqual( + record.render.key, + 'arxiv:///e-prints/2029/01/2901.00345/v1/2901.00345v1.pdf', + 'Key for render is generated correctly' + ) + self.assertEqual(record.render.stream.content.read(), + b'fake content for file:///fake/path.pdf', + 'Render resource is dereferenced correctly') + + self.assertTrue(record.source.key.is_canonical) + self.assertEqual( + record.source.key, + 'arxiv:///e-prints/2029/01/2901.00345/v1/2901.00345v1.tar', + 'Key for source package is generated correctly' + ) + self.assertEqual(record.source.stream.content.read(), + b'fake content for file:///fake/path.tar', + 'Source resource is dereferenced correctly') + + def test_schema(self): + """Serialized record is schema compliant.""" + record = RecordVersion.from_domain(self.version, fake_dereferencer) + raw = json.load(record.metadata.stream.content) + jsonschema.validate(raw, 
self.schema, resolver=self.resolver) + + def test_to_domain(self): + """Re-casting to domain should preserve state.""" + record = RecordVersion.from_domain(self.version, fake_dereferencer) + cast_version = record.instance_to_domain() + for key in self.version.__dict__.keys(): + self.assertEqual(getattr(cast_version, key), + getattr(self.version, key), + f'{key} should match') diff --git a/arxiv/canonical/record/version.py b/arxiv/canonical/record/version.py new file mode 100644 index 0000000..75f4417 --- /dev/null +++ b/arxiv/canonical/record/version.py @@ -0,0 +1,292 @@ +import datetime +from typing import Callable, Dict, IO, Iterable, Optional + +from .core import RecordBase, RecordEntry, RecordEntryMembers, RecordStream, \ + D, Year, YearMonth +from .file import RecordFile +from .metadata import RecordMetadata + + +class RecordVersion(RecordBase[D.VersionedIdentifier, + str, + RecordEntry, + D.Version]): + """ + A collection of serialized components that make up a version record. + + A version record is comprised of (1) a metadata record, (2) a source + package, containing the original content provided by the submitter, and (3) + a canonical rendering of the version (e.g. in PDF format). + + The key prefix structure for a version record is: + + ``` + e-prints/{YYYY}/{MM}/{arxiv_id}/v{version}/ + ``` + + Where ``YYYY`` is the year and ``MM`` the month during which the first + version of the e-print was announced. 
+ + Sub-keys are: + + - Metadata record: ``{arxiv_id}v{version}.json`` + - Source package: ``{arxiv_id}v{version}.tar`` + - PDF: ``{arxiv_id}v{version}.render`` + - Manifest: ``{arxiv_id}v{version}.manifest.json`` + + """ + + @classmethod + def from_domain(cls, version: D.Version, + dereferencer: Callable[[D.URI], IO[bytes]], + metadata: Optional[RecordMetadata] = None) -> 'RecordVersion': + """Serialize an :class:`.Version` to an :class:`.RecordVersion`.""" + if version.source is None: + raise ValueError('Source is missing') + if version.announced_date_first is None: + raise ValueError('First announcement date not set') + + # Dereference the bitstreams, wherever they happen to live. + source_content = dereferencer(version.source.ref) + format_content = {fmt: dereferencer(cf.ref) + for fmt, cf in version.formats.items()} + + source_key = RecordVersion.make_key(version.identifier, + version.source.filename) + format_keys = { + fmt: RecordVersion.make_key(version.identifier, cf.filename) + for fmt, cf in version.formats.items() + } + + source = RecordFile( + key=source_key, + stream=RecordStream( + domain=version.source, + content=source_content, + content_type=version.source.content_type, + size_bytes=version.source.size_bytes, + ), + domain=version.source + ) + + formats = { + fmt.value: RecordFile( + key=format_keys[fmt], + stream=RecordStream( + domain=cf, + content=format_content[fmt], + content_type=cf.content_type, + size_bytes=cf.size_bytes, + ), + domain=cf + ) for fmt, cf in version.formats.items() + } + if version.render: + render_content = dereferencer(version.render.ref) + render_key = RecordVersion.make_key(version.identifier, + version.render.filename) + version.render.ref = render_key + formats['render'] = RecordFile( + key=render_key, + stream=RecordStream( + domain=version.render, + content=render_content, + content_type=version.render.content_type, + size_bytes=version.render.size_bytes, + ), + domain=version.render + ) + + if metadata is None: + metadata = RecordMetadata.from_domain(version) + + # From now on we refer to 
bitstreams with canonical URIs. + version.source.ref = source_key + for fmt, cf in version.formats.items(): + cf.ref = format_keys[fmt] + + return RecordVersion( + version.identifier, + members=RecordEntryMembers( + metadata=metadata, + source=source, + **formats + ), + domain=version + ) + + @classmethod + def make_key(cls, identifier: D.VersionedIdentifier, + filename: Optional[str] = None) -> D.Key: + if filename is None: + return RecordMetadata.make_key(identifier) + return D.Key(f'{cls.make_prefix(identifier)}/{filename}') + + @classmethod + def make_manifest_key(cls, ident: D.VersionedIdentifier) -> D.Key: + date_part = f'e-prints/{ident.year}/{str(ident.month).zfill(2)}' + if ident.is_old_style: + return D.Key(f'{date_part}/{ident.category_part}/{ident.numeric_part}/{ident.numeric_part}.manifest.json') + return D.Key(f'{date_part}/{ident.arxiv_id}/{ident}.manifest.json') + + @classmethod + def make_prefix(cls, ident: D.VersionedIdentifier) -> str: + """ + Make a key prefix for an e-print record. + + Parameters + ---------- + date : datetime.date + The day on which the first version of the e-print was announced. 
+ ident : str + arXiv identifier + + Returns + ------- + str + + """ + date_part = f'e-prints/{ident.year}/{str(ident.month).zfill(2)}' + if ident.is_old_style: + return (f'{date_part}/{ident.category_part}/{ident.numeric_part}/' + f'v{ident.version}') + return f'{date_part}/{ident.arxiv_id}/v{ident.version}' + + @property + def identifier(self) -> D.VersionedIdentifier: + return self.name + + @property + def metadata(self) -> RecordMetadata: + """JSON document containing canonical e-print metadata.""" + assert 'metadata' in self.members + member = self.members['metadata'] + assert isinstance(member, RecordMetadata) + return member + + @property + def render(self) -> Optional[RecordEntry]: + """Canonical PDF for the e-print.""" + if 'render' in self.members: + return self.members['render'] + return None + + @property + def formats(self) -> Dict[D.ContentType, RecordEntry]: + return {D.ContentType(fmt): entry + for fmt, entry in self.members.items() + if fmt not in ['metadata', 'source', 'render']} + + @property + def source(self) -> RecordEntry: + """Gzipped tarball containing the e-print source.""" + assert 'source' in self.members + return self.members['source'] + + def instance_to_domain(self) -> D.Version: + """Deserialize an :class:`.RecordVersion` to an :class:`.Version`.""" + version = self.metadata.to_domain(self.metadata.stream) + if version.source is None or version.render is None: + raise ValueError('Failed to to_domain source or render metadata') + return version + + +class RecordEPrint(RecordBase[D.Identifier, + D.VersionedIdentifier, + RecordVersion, + D.EPrint]): + @classmethod + def make_key(cls, idn: D.Identifier) -> D.Key: + """ + Make a key prefix for an e-print record. 
+ + Parameters + ---------- + idn : str + arXiv identifier + + Returns + ------- + str + + """ + return D.Key(f'e-prints/{idn.year}/{str(idn.month).zfill(2)}/{idn}') + + @classmethod + def make_manifest_key(cls, ident: D.Identifier) -> D.Key: + """ + Make a key for an e-print manifest. + + Returns + ------- + str + + """ + return D.Key(f'{cls.make_key(ident)}.manifest.json') + + +class RecordDay(RecordBase[datetime.date, + D.Identifier, + RecordEPrint, + D.EPrintDay]): + @classmethod + def make_manifest_key(cls, date: datetime.date) -> D.Key: + """ + Make a key for a daily e-print manifest. + + Returns + ------- + str + + """ + return D.Key(date.strftime('e-prints/%Y/%m/%Y-%m-%d.manifest.json')) + + +class RecordMonth(RecordBase[YearMonth, + datetime.date, + RecordDay, + D.EPrintMonth]): + @classmethod + def make_manifest_key(cls, year_and_month: YearMonth) -> D.Key: + """ + Make a key for a monthly e-print manifest. + + Returns + ------- + str + + """ + y, m = year_and_month + return D.Key(f'e-prints/{y}/{y}-{str(m).zfill(2)}.manifest.json') + + +class RecordYear(RecordBase[Year, + YearMonth, + RecordMonth, + D.EPrintYear]): + + @classmethod + def make_manifest_key(cls, year: Year) -> D.Key: + """ + Make a key for a yearly e-print manifest. + + Returns + ------- + str + + """ + return D.Key(f'e-prints/{year}.manifest.json') + + +class RecordEPrints(RecordBase[str, Year, RecordYear, D.AllEPrints]): + @classmethod + def make_manifest_key(cls, _: str) -> D.Key: + """ + Make a key for all e-print manifest. + + Returns + ------- + str + + """ + return D.Key(f'e-prints.manifest.json') \ No newline at end of file diff --git a/arxiv/canonical/register/__init__.py b/arxiv/canonical/register/__init__.py new file mode 100644 index 0000000..1dce3a8 --- /dev/null +++ b/arxiv/canonical/register/__init__.py @@ -0,0 +1,40 @@ +""" +Register for the canonical record. + +This module implements the high-level API for the arXiv canonical record. 
It +orchestrates the classes in :mod:`arxiv.canonical.domain`, +:mod:`arxiv.canonical.record`, and :mod:`arxiv.canonical.integrity` to +implement reading from and writing to the record. +""" + +from .api import (RegisterAPI, IRegisterAPI, ICanonicalStorage, + ICanonicalSource, IStorableEntry, + Base, RegisterDay, RegisterEPrint, + RegisterEPrints, RegisterListing, RegisterListings, + RegisterListingDay, RegisterListingMonth, + RegisterListingYear, RegisterMetadata, RegisterMonth, + RegisterVersion, RegisterYear, NoSuchResource, + ConsistencyError) + +__all__ = ( + 'Base', + 'ConsistencyError', + 'IRegisterAPI', + 'ICanonicalStorage', + 'ICanonicalSource', + 'IStorableEntry', + 'NoSuchResource', + 'RegisterAPI', + 'RegisterDay', + 'RegisterEPrint', + 'RegisterEPrints', + 'RegisterListing', + 'RegisterListings', + 'RegisterListingDay', + 'RegisterListingMonth', + 'RegisterListingYear', + 'RegisterMetadata', + 'RegisterMonth', + 'RegisterVersion', + 'RegisterYear' +) \ No newline at end of file diff --git a/arxiv/canonical/register/api.py b/arxiv/canonical/register/api.py new file mode 100644 index 0000000..01f347e --- /dev/null +++ b/arxiv/canonical/register/api.py @@ -0,0 +1,249 @@ +""" +Provides the main public API for the canonical register. + +See :class:`.RegisterAPI`. 
+""" + +import datetime +from collections import abc +from typing import (Any, Optional, IO, Iterable, Iterator, Union, Sequence, + Tuple, overload) + +from typing_extensions import Protocol, Literal + +from ..manifest import Manifest +from .core import (D, R, I, ICanonicalStorage, ICanonicalSource, Base, + Year, Month, YearMonth, IStorableEntry, Selector, + IRegisterAPI) +from .eprint import (RegisterEPrint, RegisterDay, RegisterMonth, RegisterYear, + RegisterEPrints) +from .exceptions import NoSuchResource, ConsistencyError +from .listing import (RegisterListing, RegisterListingDay, + RegisterListingMonth, RegisterListingYear, + RegisterListings) +from .metadata import RegisterMetadata +from .version import RegisterVersion + +_ID = Union[D.VersionedIdentifier, D.Identifier] + + +class RegisterAPI(IRegisterAPI): + """The main public API for the register.""" + + def __init__(self, storage: ICanonicalStorage, + sources: Sequence[ICanonicalSource], + name: str = 'all') -> None: + """Initialize the API with a storage backend.""" + self._storage = storage + self._sources = sources + self._register = Register.load(self._storage, sources, name) + + def add_events(self, *events: D.Event) -> None: + """Add new events to the register.""" + self._register.add_events(self._storage, self._sources, *events) + self._register.save(self._storage) + + def load_eprint(self, identifier: D.Identifier) -> D.EPrint: + """Load an :class:`.EPrint` from the record.""" + eprint = RegisterEPrint.load(self._storage, self._sources, identifier) + if len(eprint.domain.versions) == 0: + raise NoSuchResource(f'No versions exist for {identifier}') + return eprint.domain + + def load_event(self, identifier: str) -> D.Event: + """Load an :class:`.Event` by identifier.""" + return RegisterListingDay.load_event(self._storage, self._sources, + D.EventIdentifier(identifier)) + + def load_events(self, selector: Selector) -> Tuple[Iterable[D.Event], int]: + """ + Load all :class:`.Event`s for a day, month, 
or year. + + Returns an :class:`.Event` generator that loads event data lazily from + the underlying storage, so that in general we are loading only the data + that we are actually consuming. Events are generated in order. + + **But be warned!** Evaluating the entire generator all at once (e.g. by + coercing it to a ``list``) may load a considerable amount of data into + memory (and use a lot of i/o), especially if events for an entire year + are loaded. + + Parameters + ---------- + selector : int, tuple, or :class:`datetime.date` + Indicates the year (int), month (Tuple[int, int]), or day for which + events should be loaded. + + Returns + ------- + generator + Yields :class:`.Event` instances in chronological order. + int + An estimate of the number of events that will be generated. Note + that the actual number may change (especially for large selections) + because the record may be updated while the generator is being + consumed. + + """ + if isinstance(selector, datetime.date): + return self._load_events_date(selector) + if isinstance(selector, tuple): + return self._load_events_month(selector) + if isinstance(selector, Year): + return self._load_events_year(selector) + raise ValueError(f'Cannot load events for {selector}; invalid type') + + def load_history(self, identifier: _ID) -> Iterable[D.EventSummary]: + """Load the event history of an :class:`.EPrint`.""" + if isinstance(identifier, D.Identifier): + epr = RegisterEPrint.load(self._storage, self._sources, identifier) + if len(epr.domain.versions) == 0: + raise NoSuchResource(f'No versions exist for {identifier}') + + return (summary + for version in epr.domain.versions + for summary in epr.domain.versions[version].events) + if isinstance(identifier, D.VersionedIdentifier): + return (summary + for summary in self.load_version(identifier).events) + raise ValueError(f'Cannot load event history for {identifier};' + ' invalid type') + + def load_listing(self, date: datetime.date, + shard: str = 
D.Event.get_default_shard()) -> D.Listing: # pylint: disable=no-member + """Load a :class:`.Listing` for a particular date.""" + identifier = D.ListingIdentifier.from_parts(date, shard) + lst = RegisterListing.load(self._storage, self._sources, identifier) + return lst.domain + + def load_render(self, identifier: D.VersionedIdentifier) \ + -> Tuple[D.CanonicalFile, IO[bytes]]: + version = self._load_version(identifier) + if version.record.render is None \ + or version.record.render.stream.content is None: + raise NoSuchResource(f'Cannot load render for {identifier}') + assert version.domain.render is not None + return version.domain.render, version.record.render.stream.content + + def load_source(self, identifier: D.VersionedIdentifier) \ + -> Tuple[D.CanonicalFile, IO[bytes]]: + version = self._load_version(identifier) + if version.record.source.stream.content is None: + raise NoSuchResource(f'Cannot load source for {identifier}') + return version.domain.source, version.record.source.stream.content + + def load_version(self, identifier: D.VersionedIdentifier) -> D.Version: + """Load an e-print :class:`.Version` from the record.""" + return self._load_version(identifier).domain + + def _load_events_date(self, selector: datetime.date) \ + -> Tuple[Iterable[D.Event], int]: + listing = self.load_listing(selector) + return ((event for event in listing.events), len(listing.events)) + + def _load_events_month(self, selector: YearMonth) \ + -> Tuple[Iterable[D.Event], int]: + assert len(selector) == 2 + assert isinstance(selector[0], int), isinstance(selector[1], int) + listing_month = RegisterListingMonth.load(self._storage, self._sources, + selector) + return ( + (event + for listing_day in listing_month.iter_members() + for listing in listing_day.iter_members() + for event in listing.record.domain.events), + listing_month.number_of_events + ) + + def _load_events_year(self, selector: Year) \ + -> Tuple[Iterable[D.Event], int]: + listing_year = 
RegisterListingYear.load(self._storage, self._sources, + selector) + return ( + (event + for listing_month in listing_year.iter_members() + for listing_day in listing_month.iter_members() + for listing in listing_day.iter_members() + for event in listing.record.domain.events), + listing_year.number_of_events + ) + + def _load_version(self, identifier: D.VersionedIdentifier) \ + -> RegisterVersion: + try: + return RegisterVersion.load(self._storage, self._sources, + identifier) + except Exception as e: # TODO: make this more specific. + raise NoSuchResource(f'No such version: {identifier}') from e + + +listings_key = Literal['listings'] +eprints_key = Literal['eprints'] +_TopLevelNames = Union[listings_key, eprints_key] +_TopLevelMembers = Union[RegisterListings, RegisterEPrints] + + +class Register(Base[str, + D.Canon, + R.Record, + I.Integrity, + _TopLevelNames, + _TopLevelMembers]): + domain_type = D.Canon + record_type = R.Record + integrity_type = I.Integrity + member_type = _TopLevelMembers # type: ignore + + @classmethod + def _member_name(cls, _: D.Event) -> Iterable[_TopLevelNames]: + return ['listings', 'eprints'] + + @classmethod + def _get_members(cls, s: ICanonicalStorage, + sources: Sequence[ICanonicalSource], + _: Manifest) -> '_TopMapping': + return _TopMapping(RegisterListings.load(s, sources, 'listings'), + RegisterEPrints.load(s, sources, 'eprints')) + + def add_events(self, s: ICanonicalStorage, + sources: Sequence[ICanonicalSource], + *events: D.Event) -> None: + """Add events to this register.""" + for event in events: + event.version.events.append(event.summary) + super(Register, self).add_events(s, sources, *events) + + +class _TopMapping(abc.MutableMapping): + def __init__(self, listings: RegisterListings, + eprints: RegisterEPrints) -> None: + """Initialize with listings and eprints registers.""" + self.eprints = eprints + self.listings = listings + + @overload + def __getitem__(self, obj: listings_key) -> RegisterListings: ... 
+ @overload + def __getitem__(self, obj: eprints_key) -> RegisterEPrints: ... # pylint: disable=function-redefined + def __getitem__(self, obj: Any) -> Any: # pylint: disable=function-redefined + if obj == 'eprints': + return self.eprints + if obj == 'listings': + return self.listings + raise KeyError('No such resource') + + def __delitem__(self, obj: Any) -> None: + raise NotImplementedError('Does not support deletion') + + def __setitem__(self, obj: Any, value: Any) -> None: + if obj == 'eprints' and isinstance(value, RegisterEPrints): + self.eprints = value + if obj == 'listings' and isinstance(value, RegisterListings): + self.listings = value + raise ValueError('Not supported') + + def __iter__(self) -> Iterator[Any]: + return iter([self.eprints, self.listings]) + + def __len__(self) -> int: + return 2 \ No newline at end of file diff --git a/arxiv/canonical/register/core.py b/arxiv/canonical/register/core.py new file mode 100644 index 0000000..40aaf76 --- /dev/null +++ b/arxiv/canonical/register/core.py @@ -0,0 +1,233 @@ +"""Core structures of the canonical register.""" + +import io +import os +from collections import abc, defaultdict +from datetime import date, datetime +from functools import partial +from itertools import groupby +from operator import attrgetter +from typing import (Dict, Tuple, Iterator, List, + Generic, TypeVar, MutableMapping, Optional, Iterable, + Callable, Any, Type, Union, Set, IO, Sequence, cast) + +from typing_extensions import Literal, Protocol + +from ..core import ICanonicalSource, ICanonicalStorage, IManifestStorage, \ + IStorableEntry, dereference, IRegisterAPI, Year, Month, YearMonth, Selector +from .. import domain as D +from .. import record as R +from .. 
import integrity as I +from ..manifest import Manifest, ManifestEntry, make_empty_manifest + +from .util import LazyMap, LazyMapView + + +_Name = TypeVar('_Name') +_Domain = TypeVar('_Domain') +_Record = TypeVar('_Record', bound=Union[R.RecordBase, R.RecordEntry]) +_Integrity = TypeVar('_Integrity', bound=I.IntegrityBase) +_Member = TypeVar('_Member', bound=Optional['Base']) +_MemberName = TypeVar('_MemberName') +_Self = TypeVar('_Self', bound='Base') + + +class Base(Generic[_Name, _Domain, _Record, _Integrity, _MemberName, _Member]): + """ + Generic base class for all register classes. + + This defines the abstract structure of a register class. It specifies that + instances of a register class are composed of a domain object, a record + object, an integrity object, and a set of members. This allows us to + define register classes that align domain, record, and integrity classes + at a specific level of the record hierarchy. + """ + + domain: _Domain + """The domain object on a register instance.""" + + domain_type: Type[_Domain] + """The type of the domain object on a register instance.""" + + record: _Record + """The record object on a register instance.""" + + record_type: Type[_Record] + """The type of the record object on a register instance.""" + + integrity: _Integrity + """The integrity object on a register instance.""" + + integrity_type: Type[_Integrity] + """The type of the integrity object on a register instance.""" + + member_type: Type[_Member] + """The type of members contained by an instance of a register class.""" + + def __init__( + self, + name: _Name, + domain: _Domain, + record: _Record, + integrity: _Integrity, + members: Optional[MutableMapping[_MemberName, _Member]] = None + ) -> None: + """Set public and private attributes.""" + self.domain = domain + self.record = record + self.integrity = integrity + self.name = name + self._members = members + + @classmethod + def _member_name(cls, event: D.Event) -> 
Iterable[_MemberName]: + """ + Get the name of a member that contains an event. + + This method routes events to members when new events are added to the + record. + """ + raise NotImplementedError(f'Must be implemented by child ({cls}),' + ' if supported') + + @classmethod + def _manifest_to_member_name(cls, key: str) -> _MemberName: + """Get the name of the member corresponding to a manifest key.""" + raise NotImplementedError('Must be implemented by a child class') + + @classmethod + def _get_members(cls, s: ICanonicalStorage, + sources: Sequence[ICanonicalSource], + manifest: Manifest) \ + -> MutableMapping[_MemberName, _Member]: + """Generate a member mapping from a :class:`Manifest`.""" + return LazyMap([cls._manifest_to_member_name(entry['key']) + for entry in manifest['entries']], + partial(cls.member_type.load, s, sources)) + + @classmethod + def load(cls: Type[_Self], s: ICanonicalStorage, + sources: Sequence[ICanonicalSource], name: _Name, + checksum: Optional[str] = None) -> _Self: + """Load an instance of the register class from storage.""" + manifest_key = cls.record_type.make_manifest_key(name) + try: + manifest = s.load_manifest(manifest_key) + except Exception: # TODO: need a storage exception here. 
+ manifest = make_empty_manifest() + + members = cls._get_members(s, sources, manifest) + d = cls.domain_type(name, LazyMapView(members, _get_domain)) + r = cls.record_type(name, LazyMapView(members, _get_record), d) + i = cls.integrity_type( + name, + record=r, + members=LazyMapView(members, _get_integrity), + manifest=manifest, + checksum=checksum + ) + return cls(name, domain=d, record=r, integrity=i, members=members) + + @classmethod + def _load_content(cls: Type[_Self], s: ICanonicalStorage, + sources: Sequence[ICanonicalSource], + identifier: D.VersionedIdentifier, + filename: str) -> IO[bytes]: + return dereference(sources, + R.RecordVersion.make_key(identifier, filename)) + + @property + def members(self) -> MutableMapping[_MemberName, _Member]: + """Accessor for the members of a register instance.""" + assert self._members is not None + return self._members + + @property + def number_of_events(self) -> int: + """Number of events contained within a register instance.""" + return self.integrity.manifest.get('number_of_events', -1) + + @property + def number_of_versions(self) -> int: + """Number of e-print versions contained within a register instance.""" + return self.integrity.manifest.get('number_of_versions', -1) + + def add_events(self, s: ICanonicalStorage, + sources: Sequence[ICanonicalSource], + *events: D.Event) -> None: + """Add events to this register.""" + self.save_members( + s, + self._add_events(s, sources, events, self._member_name) + ) + assert self.integrity.manifest is not None + self.integrity.update_checksum() + + def iter_members(self) -> Iterable[_Member]: + """Get an iterator over members in this register.""" + assert self.members is not None + return (self.members[name] for name in self.members) + + def save(self, s: ICanonicalStorage) -> str: + """Store changes to the integrity manifest for this register.""" + s.store_manifest(self.record.make_manifest_key(self.name), + self.integrity.manifest) + return self.integrity.checksum + + 
def save_members(self, s: ICanonicalStorage, + members: Iterable[_Member]) -> None: + """Save members that have changed, and update our manifest.""" + for member in members: + checksum = member.save(s) + assert checksum is not None + self.integrity.update_or_extend_manifest(member.integrity, + checksum) + + def _add_events(self, s: ICanonicalStorage, + sources: Sequence[ICanonicalSource], + events: Iterable[D.Event], + fkey: Callable[[D.Event], Iterable[_MemberName]]) \ + -> Iterable[_Member]: + assert self.members is not None + altered = set() + grouped: Dict[_MemberName, List[D.Event]] = defaultdict(list) + for event in events: + for name in fkey(event): + grouped[name].append(event) + for name, m_events in grouped.items(): + member = self.members[name] + member.add_events(s, sources, *m_events) + altered.add(member) + return iter(altered) + + +def _get_domain(register: Base[_Name, + _Domain, + _Record, + _Integrity, + _MemberName, + _Member]) -> _Domain: + return register.domain + + +def _get_record(register: Base[_Name, + _Domain, + _Record, + _Integrity, + _MemberName, + _Member]) -> _Record: + return register.record + + +def _get_integrity(register: Base[_Name, + _Domain, + _Record, + _Integrity, + _MemberName, + _Member]) -> _Integrity: + return register.integrity + + + + + diff --git a/arxiv/canonical/register/eprint.py b/arxiv/canonical/register/eprint.py new file mode 100644 index 0000000..f0fef99 --- /dev/null +++ b/arxiv/canonical/register/eprint.py @@ -0,0 +1,223 @@ +""" +Provides structs for organizing e-print metadata and content in the register. + +The classes in this module extend :class:`.Base` with methods for naming +themselves and manifests. 
+""" + +from datetime import date, datetime +from typing import Any, Callable, Iterable, List, Optional, Set, Sequence, Type + +from .core import (Base, D, R, I, ICanonicalStorage, ICanonicalSource, _Self, + Year, Month, YearMonth, dereference) +from .exceptions import ConsistencyError +from .file import RegisterFile +from .metadata import RegisterMetadata +from .version import RegisterVersion + + +class RegisterEPrint(Base[D.Identifier, + D.EPrint, + R.RecordEPrint, + I.IntegrityEPrint, + D.VersionedIdentifier, + RegisterVersion]): + """ + Representation of an e-print in the canonical register. + + Organizes a series of one or more :class:`.RegisterVersion`s. + """ + + domain_type = D.EPrint + record_type = R.RecordEPrint + integrity_type = I.IntegrityEPrint + member_type = RegisterVersion + + @classmethod + def _member_name(cls, event: D.Event) -> Iterable[D.VersionedIdentifier]: + return [event.version.identifier] + + @classmethod + def _manifest_to_member_name(cls, key: str) -> D.VersionedIdentifier: + return D.VersionedIdentifier(key) + + # Single-dispatch based on the event type, using the ``add_event_`` methods + # defined below. 
+ def _add_events(self, s: ICanonicalStorage, + sources: Sequence[ICanonicalSource], + events: Iterable[D.Event], + _: Callable) -> Iterable[RegisterVersion]: + added: Set[RegisterVersion] = set() + for event in events: + adder = getattr(self, f'add_event_{event.event_type.value}', None) + assert adder is not None + added |= set(adder(s, sources, event)) + return added + + def _add_versions(self, s: ICanonicalStorage, + sources: Sequence[ICanonicalSource], + versions: Iterable[D.Version], + fkey: Callable[[D.Version], Any]) \ + -> Iterable[RegisterVersion]: + assert self.members is not None + altered = set() + for version in versions: + key = fkey(version) + if key in self.members: + raise ConsistencyError('Version already exists') + member = self.member_type.create(s, sources, version) + self.members[key] = member + altered.add(member) + return iter(altered) + + def add_event_new(self, s: ICanonicalStorage, + sources: Sequence[ICanonicalSource], + event: D.Event) -> List[RegisterVersion]: + """Add an event that results in a new version.""" + assert self.members is not None + altered: List[RegisterVersion] = [] + for key in self._member_name(event): + if key in self.members: + raise ConsistencyError(f'Version already exists: {key}') + self.members[key] \ + = self.member_type.create(s, sources, event.version) + altered.append(self.members[key]) + return altered + + def add_event_update(self, s: ICanonicalStorage, + sources: Sequence[ICanonicalSource], + event: D.Event) -> List[RegisterVersion]: + """Add an event that results in an update to a version.""" + assert self.members is not None + altered: List[RegisterVersion] = [] + for key in self._member_name(event): + if key not in self.members: + raise ConsistencyError(f'No such version: {event.identifier}') + self.members[key].update(s, sources, event.version) + altered.append(self.members[key]) + return altered + + def add_event_update_metadata(self, s: ICanonicalStorage, + sources: Sequence[ICanonicalSource], + 
event: D.Event) -> List[RegisterVersion]: + """Add an event that results in an update to metadata of a version.""" + return self.add_event_update(s, sources, event) + + def add_event_replace(self, s: ICanonicalStorage, + sources: Sequence[ICanonicalSource], + event: D.Event) -> List[RegisterVersion]: + """Add an event that generates a replacement version.""" + return self.add_event_new(s, sources, event) + + def add_event_cross(self, s: ICanonicalStorage, + sources: Sequence[ICanonicalSource], + event: D.Event) -> List[RegisterVersion]: + """Add a cross-list event.""" + return self.add_event_update_metadata(s, sources, event) + + def add_event_migrate(self, s: ICanonicalStorage, + sources: Sequence[ICanonicalSource], + event: D.Event) -> List[RegisterVersion]: + """Add a data-migration event.""" + return self.add_event_update(s, sources, event) + + def add_event_migrate_metadata(self, s: ICanonicalStorage, + sources: Sequence[ICanonicalSource], + event: D.Event) -> List[RegisterVersion]: + """Add a metadata-migration event.""" + return self.add_event_update_metadata(s, sources, event) + + def add_event_withdraw(self, s: ICanonicalStorage, + sources: Sequence[ICanonicalSource], + event: D.Event) -> List[RegisterVersion]: + """Add an event that withdraws an e-print.""" + return self.add_event_new(s, sources, event) + + +class RegisterDay(Base[date, + D.EPrintDay, + R.RecordDay, + I.IntegrityDay, + D.Identifier, + RegisterEPrint]): + """Representation of a day-block of e-prints in the canonical register.""" + + domain_type = D.EPrintDay + record_type = R.RecordDay + integrity_type = I.IntegrityDay + member_type = RegisterEPrint + + @classmethod + def _member_name(cls, event: D.Event) -> Iterable[D.Identifier]: + return [event.version.identifier.arxiv_id] + + @classmethod + def _manifest_to_member_name(cls, key: str) -> D.Identifier: + return D.Identifier(key) + + +class RegisterMonth(Base[YearMonth, + D.EPrintMonth, + R.RecordMonth, + I.IntegrityMonth, + date, + 
RegisterDay]): + """Representation of a month-block in the canonical register.""" + + domain_type = D.EPrintMonth + record_type = R.RecordMonth + integrity_type = I.IntegrityMonth + member_type = RegisterDay + + @classmethod + def _member_name(cls, event: D.Event) -> Iterable[date]: + return [event.version.announced_date_first] + + @classmethod + def _manifest_to_member_name(cls, key: str) -> date: + return datetime.strptime(key[:10], '%Y-%m-%d').date() + + +class RegisterYear(Base[Year, + D.EPrintYear, + R.RecordYear, + I.IntegrityYear, + YearMonth, + RegisterMonth]): + """Representation of a year-block in the canonical register.""" + + domain_type = D.EPrintYear + record_type = R.RecordYear + integrity_type = I.IntegrityYear + member_type = RegisterMonth + + @classmethod + def _member_name(cls, event: D.Event) -> Iterable[YearMonth]: + return [(event.version.identifier.year, + event.version.identifier.month)] + + @classmethod + def _manifest_to_member_name(cls, key: str) -> YearMonth: + year_part, month_part = key.split('-', 1) + return int(year_part), int(month_part) + + +class RegisterEPrints(Base[str, + D.AllEPrints, + R.RecordEPrints, + I.IntegrityEPrints, + Year, + RegisterYear]): + """Representation of the complete set of e-prints in the register.""" + domain_type = D.AllEPrints + record_type = R.RecordEPrints + integrity_type = I.IntegrityEPrints + member_type = RegisterYear + + @classmethod + def _member_name(cls, event: D.Event) -> Iterable[Year]: + return [event.version.identifier.year] + + @classmethod + def _manifest_to_member_name(cls, key: str) -> Year: + return int(key) \ No newline at end of file diff --git a/arxiv/canonical/register/exceptions.py b/arxiv/canonical/register/exceptions.py new file mode 100644 index 0000000..326c088 --- /dev/null +++ b/arxiv/canonical/register/exceptions.py @@ -0,0 +1,6 @@ +class ConsistencyError(Exception): + """Operation was attempted that would violate consistency of the record.""" + + +class 
NoSuchResource(Exception): + """Operation was attempted on a non-existent resource.""" \ No newline at end of file diff --git a/arxiv/canonical/register/file.py b/arxiv/canonical/register/file.py new file mode 100644 index 0000000..e2c5a6b --- /dev/null +++ b/arxiv/canonical/register/file.py @@ -0,0 +1,30 @@ + + +from .core import (Base, D, R, I, ICanonicalStorage, ICanonicalSource, _Self) + + +class RegisterFile(Base[str, + D.CanonicalFile, + R.RecordFile, + I.IntegrityEntry, + None, + None]): + + domain_type = D.CanonicalFile + record_type = R.RecordFile + integrity_type = I.IntegrityEntry + member_type = type(None) + + def save(self, s: ICanonicalStorage) -> str: + """ + Save this file. + + Overrides the base method since this is a terminal record, not a + collection. + """ + s.store_entry(self.integrity) + self.integrity.update_checksum() + return self.integrity.checksum + + def delete(self, s: ICanonicalStorage) -> None: + raise NotImplementedError('not yet; do this please') \ No newline at end of file diff --git a/arxiv/canonical/register/listing.py b/arxiv/canonical/register/listing.py new file mode 100644 index 0000000..b6960d5 --- /dev/null +++ b/arxiv/canonical/register/listing.py @@ -0,0 +1,196 @@ +import os +from datetime import date, datetime +from functools import partial +from typing import Any, Callable, Iterable, List, Optional, Set, Sequence, Type + +from .core import (Base, D, R, I, ICanonicalStorage, ICanonicalSource, _Self, + Year, Month, YearMonth, dereference) +from .exceptions import NoSuchResource + + +class RegisterListing(Base[D.ListingIdentifier, + D.Listing, + R.RecordListing, + I.IntegrityListing, + None, + None]): + + domain_type = D.Listing + record_type = R.RecordListing + integrity_type = I.IntegrityListing + member_type = type(None) + + @classmethod + def create(cls, s: ICanonicalStorage, sources: Sequence[ICanonicalSource], + d: D.Listing) -> 'RegisterListing': + r = R.RecordListing.from_domain(d) + i = 
I.IntegrityListing.from_record(r) + s.store_entry(i) + return cls(d.identifier, domain=d, record=r, integrity=i) + + @classmethod + def load(cls: Type[_Self], s: ICanonicalStorage, + sources: Sequence[ICanonicalSource], + identifier: D.ListingIdentifier, + checksum: Optional[str] = None) -> _Self: + + try: + key = R.RecordListing.make_key(identifier) + stream, _checksum = s.load_entry(key) + + d = R.RecordListing.to_domain(stream) + r = R.RecordListing(key=key, stream=stream, domain=d) + if checksum is not None: + assert checksum == _checksum + i = I.IntegrityListing.from_record(r, checksum=_checksum, + calculate_new_checksum=False) + except Exception: + d = D.Listing(identifier, events=[]) + r = R.RecordListing.from_domain(d) + i = I.IntegrityListing.from_record(r) + return cls(identifier, domain=d, integrity=i, record=r) + + @property + def number_of_events(self) -> int: + return self.domain.number_of_events + + @property + def number_of_versions(self) -> int: + return self.domain.number_of_versions + + def add_events(self, _: ICanonicalStorage, + sources: Sequence[ICanonicalSource], + *events: D.Event) -> None: + """ + Add events to the terminal listing R. + + Overrides the base method since this is a terminal record, not a + collection. + """ + N = len(events) + for i, event in enumerate(events): + self.domain.events.insert(N + i, event) + self.record = R.RecordListing.from_domain(self.domain) + self.integrity = I.IntegrityListing.from_record(self.record) + + def save(self, s: ICanonicalStorage) -> str: + """ + Save this file. + + Overrides the base method since this is a terminal record, not a + collection. 
+ """ + s.store_entry(self.integrity) + self.integrity.update_checksum() + return self.integrity.checksum + + def delete(self, s: ICanonicalStorage) -> None: + raise NotImplementedError('not yet; do this please') + + +class RegisterListingDay(Base[date, + D.ListingDay, + R.RecordListingDay, + I.IntegrityListingDay, + D.ListingIdentifier, + RegisterListing]): + domain_type = D.ListingDay + record_type = R.RecordListingDay + integrity_type = I.IntegrityListingDay + member_type = RegisterListing + + @classmethod + def _member_name(cls, event: D.Event) \ + -> Iterable[D.ListingIdentifier]: + return [D.ListingIdentifier.from_parts(event.event_date.date(), + event.event_id.shard)] + + @classmethod + def _manifest_to_member_name(cls, key: str) -> D.ListingIdentifier: + # return ListingIdentifier(key) + base, term = os.path.split(key) + term, _ = os.path.splitext(term) + y, m, d, shrd = term.split('-', 3) + return D.ListingIdentifier.from_parts(date(int(y), int(m), + int(d)), shrd) + + @classmethod + def load_event(cls, s: ICanonicalStorage, + sources: Sequence[ICanonicalSource], + identifier: D.EventIdentifier) -> D.Event: + listing = cls.load(s, sources, identifier.event_date) + for member in listing.members: + for event in listing.members[member].domain.events: + if event.event_id == identifier: + return event + raise NoSuchResource(f'No such event: {identifier}') + + def add_listing(self, s: ICanonicalStorage, + sources: Sequence[ICanonicalSource], + d: D.Listing) -> None: + assert self.members is not None + member = RegisterListing.create(s, sources, d) + self.members[member.domain.identifier] = member + self.integrity.extend_manifest(member.integrity) + + +class RegisterListingMonth(Base[YearMonth, + D.ListingMonth, + R.RecordListingMonth, + I.IntegrityListingMonth, + date, + RegisterListingDay]): + + domain_type = D.ListingMonth + record_type = R.RecordListingMonth + integrity_type = I.IntegrityListingMonth + member_type = RegisterListingDay + + @classmethod + def 
_member_name(cls, event: D.Event) -> Iterable[date]: + return [event.event_date.date()] + + @classmethod + def _manifest_to_member_name(cls, key: str) -> date: + return datetime.strptime(key, '%Y-%m-%d').date() + + +class RegisterListingYear(Base[Year, + D.ListingYear, + R.RecordListingYear, + I.IntegrityListingYear, + YearMonth, + RegisterListingMonth]): + domain_type = D.ListingYear + record_type = R.RecordListingYear + integrity_type = I.IntegrityListingYear + member_type = RegisterListingMonth + + @classmethod + def _member_name(cls, event: D.Event) -> Iterable[YearMonth]: + return [(event.event_date.year, event.event_date.month)] + + @classmethod + def _manifest_to_member_name(cls, key: str) -> YearMonth: + year_part, month_part = key.split('-', 1) + return int(year_part), int(month_part) + + +class RegisterListings(Base[str, + D.AllListings, + R.RecordListings, + I.IntegrityListings, + Year, + RegisterListingYear]): + domain_type = D.AllListings + record_type = R.RecordListings + integrity_type = I.IntegrityListings + member_type = RegisterListingYear + + @classmethod + def _member_name(cls, event: D.Event) -> Iterable[Year]: + return [event.event_date.year] + + @classmethod + def _manifest_to_member_name(cls, key: str) -> Year: + return int(key) \ No newline at end of file diff --git a/arxiv/canonical/register/metadata.py b/arxiv/canonical/register/metadata.py new file mode 100644 index 0000000..39ab1fb --- /dev/null +++ b/arxiv/canonical/register/metadata.py @@ -0,0 +1,28 @@ +from .core import Base, D, R, I, ICanonicalStorage + + +class RegisterMetadata(Base[str, + D.Version, + R.RecordMetadata, + I.IntegrityMetadata, + None, + None]): + + domain_type = D.Version + record_type = R.RecordMetadata + integrity_type = I.IntegrityMetadata + member_type = type(None) + + def save(self, s: ICanonicalStorage) -> str: + """ + Save this file. + + Overrides the base method since this is a terminal record, not a + collection. 
+ """ + s.store_entry(self.integrity) + self.integrity.update_checksum() + return self.integrity.checksum + + def delete(self, s: ICanonicalStorage) -> None: + raise NotImplementedError('not yet; do this please') \ No newline at end of file diff --git a/arxiv/canonical/register/methods.py b/arxiv/canonical/register/methods.py new file mode 100644 index 0000000..57e23ea --- /dev/null +++ b/arxiv/canonical/register/methods.py @@ -0,0 +1,123 @@ +# import collections +# import datetime +# from functools import partial +# from operator import attrgetter +# from typing import Tuple, List, Optional, TypeVar, Callable, Dict, \ +# MutableMapping, Iterable, cast, Any, Type, Iterator, Mapping, Sequence, \ +# Generic + +# from backports.datetime_fromisoformat import MonkeyPatch + +# from .base import ICanonicalStorage, RegisterVersion, RegisterListing, \ +# RegisterEPrint, RegisterMonth, RegisterDay, \ +# RegisterListingMonth, get_domain, get_integrity, get_record, \ +# Domain, Record, Integrity, Manifest + +# from .util import LazyMap, LazyMapView, LazyMap, LazyMapView + +# from ..domain import Identifier, VersionedIdentifier, Version, Listing, \ +# EPrint, EPrintMonth, EPrintDay, ListingMonth +# from ..serialize.record import VersionSerializer, ListingSerializer, \ +# MetadataSerializer +# from ..record import RecordVersion, RecordListing, \ +# RecordEPrint, RecordDay, RecordMonth +# from ..integrity import IntegrityVersion, IntegrityListing, IntegrityEPrint, \ +# ValidationError, IntegrityDay, IntegrityMonth, IntegrityListingMonth + +# MonkeyPatch.patch_fromisoformat() + + +# def store_version(storage: ICanonicalStorage, version: Version) -> None: +# ie = IntegrityVersion.from_record(VersionSerializer.serialize(version)) +# map(storage.store_entry, ie.iter_entries()) + + + +# # def mapping_type(key_type: KeyType, value_type: ValueType) \ +# # -> Type[MutableMapping[KeyType, ValueType]]: + + + +# # def load_record(storage: ICanonicalStorage) -> CanonicalRecord: +# # """ +# # 
Initialize and return the :class:`.CanonicalRecord`. +# # """ +# # raise NotImplementedError('Implement me!') + + +# # def load_block(storage: ICanonicalStorage, year: int, month: int) -> Month: +# # """ +# # Load a :class:`.Month`. + +# # Parameters +# # ---------- +# # year : int +# # month : int + +# # Returns +# # ------- +# # :class:`.Month` + +# # """ + + + +# # def store_listing(storage: ICanonicalStorage, listing: Listing) -> None: +# # """ +# # Store a :class:`.Listing`. + +# # Should complain loudly if ``self.read_only`` is ``True``. +# # """ +# # if self.read_only: +# # raise RuntimeError('This is a read-only session') +# # record = listing_serializer.serialize(listing) +# # self._write_key(record.key, record.content, record.checksum) + +# # def load_listing(storage: ICanonicalStorage, listing_date: date) -> Listing: +# # """ +# # Load a :class:`.Listing`. + +# # If ``self.read_only`` is ``False``, the ``events`` member of the +# # listing must be a subclass of ``list``, and implement an +# # ``append(event: Event) -> None`` method that, when called, writes the +# # current state of the listing to S3. + +# # Parameters +# # ---------- +# # listing_date : datetime +# # Date for selecting listing events. + +# # Returns +# # ------- +# # :class:`.Listing` + +# # """ +# # record = listing_serializer.load(listing_date, self._loader) +# # return listing_serializer.deserialize(record, self) + +# # def store_eprint(storage: ICanonicalStorage, e_print: EPrint) -> None: +# # """ +# # Store an :class:`.EPrint`. + +# # If the :attr:`.EPrint.source` or :attr:`.EPrint.pdf` content +# # has changed, those should also be stored. + +# # Should complain loudly if ``self.read_only`` is ``True``. 
+# # """ +# # if self.read_only: +# # raise RuntimeError('This is a read-only session') +# # for key, entry in eprint_serializer.serialize(e_print).iter_members(): +# # self._write_key(key, entry.content, entry.checksum) + +# # def load_eprint(storage: ICanonicalStorage, identifier: VersionedIdentifier) -> EPrint: +# # """ +# # Load an :class:`.EPrint`. + +# # The content of the :attr:`.EPrint.source` and +# # :attr:`.EPrint.pdf.content` should provide a ``read()`` method that, +# # when called, retrieves the content of the corresponding resource from +# # storage. +# # """ +# # record = eprint_serializer.load(identifier.arxiv_id, +# # identifier.version, self._loader) +# # return eprint_serializer.deserialize(record) diff --git a/arxiv/canonical/register/tests/__init__.py b/arxiv/canonical/register/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/arxiv/canonical/register/tests/test_api.py b/arxiv/canonical/register/tests/test_api.py new file mode 100644 index 0000000..4df2b54 --- /dev/null +++ b/arxiv/canonical/register/tests/test_api.py @@ -0,0 +1,206 @@ +import io +from datetime import datetime +from pytz import UTC +from typing import Callable, Tuple, Dict, List, IO +from unittest import TestCase, mock + +from ...services.store import InMemoryStorage +from ..api import (RegisterAPI, ICanonicalSource, ICanonicalStorage, + IStorableEntry, Manifest, NoSuchResource, D, R) + + +class TestAPI(TestCase): + """RegisterAPI provides a high-level API for the register.""" + + def setUp(self): + self.mock_source = mock.MagicMock(spec=ICanonicalSource) + self.mock_source.can_resolve.return_value = True + + self.mock_source.load = \ + lambda *a, **k: io.BytesIO(b'foocontent') + self.storage = InMemoryStorage() + self.api = RegisterAPI(self.storage, [self.storage, self.mock_source]) + + identifier = D.VersionedIdentifier('2901.00345v1') + created = datetime(2029, 1, 29, 20, 4, 23, tzinfo=UTC) + listing_id = 
D.ListingIdentifier.from_parts(created.date(), 'foo') + + version = D.Version( + identifier=identifier, + announced_date=created.date(), + announced_date_first=created.date(), + submitted_date=created, + updated_date=created, + is_announced=True, + events=[], + previous_versions=[], + metadata=D.Metadata( + primary_classification=D.Category('cs.DL'), + secondary_classification=[D.Category('cs.IR')], + title='Foo title', + abstract='It is abstract', + authors='Ima N. Author (FSU)', + license=D.License(href="http://some.license") + ), + source=D.CanonicalFile( + filename='2901.00345v1.tar', + modified=created, + size_bytes=4_304, + content_type=D.ContentType.tar, + ref=D.URI('/fake/path.tar'), + is_gzipped=False, + ), + render=D.CanonicalFile( + filename='2901.00345v1.pdf', + modified=created, + size_bytes=404, + content_type=D.ContentType.pdf, + ref=D.URI('/fake/path.pdf') + ) + ) + self.event = D.Event( + identifier=identifier, + event_date=created, + event_type=D.EventType.NEW, + categories=[D.Category('cs.DL')], + version=version + ) + + def test_add_load_event(self): + """Can add and load an event.""" + self.api.add_events(self.event) + self.assertEqual(self.api.load_event(self.event.event_id), self.event, + 'Added event can be loaded again') + + def test_add_load_events_by_date(self): + """Can add events and load them using date selector.""" + events, N = self.api.load_events(self.event.event_date.date()) + self.assertEqual(N, 0, 'There are no events') + self.assertEqual(len(list(events)), N, 'There are truly no events') + + self.api.add_events(self.event) + + events, N = self.api.load_events(self.event.event_date.date()) + self.assertEqual(N, 1, 'There is now one event') + event_list = list(events) + self.assertEqual(len(event_list), N, 'There is truly 1 event') + self.assertEqual(event_list[0], self.event, + 'And that event is the one that we just added') + + events, N = self.api.load_events(datetime.now().date()) + self.assertEqual(N, 0, 'But there are no 
events from today') + self.assertEqual(len(list(events)), N, 'Indeed, no events') + + def test_add_load_events_by_month(self): + """Can add events and load them using month selector.""" + events, N = self.api.load_events((self.event.event_date.year, + self.event.event_date.month)) + self.assertEqual(N, 0, 'There are no events') + self.assertEqual(len(list(events)), N, 'There are truly no events') + + self.api.add_events(self.event) + + events, N = self.api.load_events((self.event.event_date.year, + self.event.event_date.month)) + self.assertEqual(N, 1, 'There is now one event') + event_list = list(events) + self.assertEqual(len(event_list), N, 'There is truly 1 event') + self.assertEqual(event_list[0], self.event, + 'And that event is the one that we just added') + + events, N = self.api.load_events((datetime.now().year, + datetime.now().month)) + self.assertEqual(N, 0, 'But there are no events from this month') + self.assertEqual(len(list(events)), N, 'Indeed, no events') + + def test_add_load_events_by_year(self): + """Can add events and load them using year selector.""" + events, N = self.api.load_events(self.event.event_date.year) + self.assertEqual(N, 0, 'There are no events') + self.assertEqual(len(list(events)), N, 'There are truly no events') + + self.api.add_events(self.event) + + events, N = self.api.load_events(self.event.event_date.year) + self.assertEqual(N, 1, 'There is now one event') + event_list = list(events) + self.assertEqual(len(event_list), N, 'There is truly 1 event') + self.assertEqual(event_list[0], self.event, + 'And that event is the one that we just added') + + events, N = self.api.load_events(datetime.now().year) + self.assertEqual(N, 0, 'But there are no events from this year') + self.assertEqual(len(list(events)), N, 'Indeed, no events') + + def test_can_load_listing(self): + """Can load listings.""" + listing = self.api.load_listing(self.event.event_date.date()) + self.assertEqual(len(listing.events), 0, 'Listing has no events') + + 
self.api.add_events(self.event) + + listing = self.api.load_listing(self.event.event_date.date()) + self.assertEqual(len(listing.events), 1, + 'But it has one after we add an event') + self.assertEqual(listing.events[0], self.event, + 'And it is the event that we added') + + def test_can_load_version(self): + """Can load a Version that was created via an event.""" + with self.assertRaises(Exception): + self.api.load_version(self.event.identifier) + + self.api.add_events(self.event) + + version = self.api.load_version(self.event.identifier) + self.assertEqual(version, self.event.version, + 'Can load the Version that we just added') + + def test_can_load_eprint(self): + """Can load an EPrint that was created via an event.""" + with self.assertRaises(Exception): + self.api.load_eprint(self.event.identifier.arxiv_id) + + self.api.add_events(self.event) + + eprint = self.api.load_eprint(self.event.identifier.arxiv_id) + self.assertEqual(eprint.versions[self.event.identifier], + self.event.version, + 'Can load the Version that we just added') + + def test_can_load_history(self): + """Can load the event history of a Version or EPrint.""" + with self.assertRaises(NoSuchResource): + self.api.load_history(self.event.identifier.arxiv_id) + with self.assertRaises(NoSuchResource): + self.api.load_history(self.event.identifier) + + self.api.add_events(self.event) + + summary = next(self.api.load_history(self.event.identifier.arxiv_id)) + self.assertEqual(summary, self.event.summary, + 'History includes a summary of our event') + + summary = next(self.api.load_history(self.event.identifier)) + self.assertEqual(summary, self.event.summary, + 'History includes a summary of our event') + + def test_can_load_render(self): + """Can load the render and source of a Version added via an event.""" + with self.assertRaises(NoSuchResource): + self.api.load_render(self.event.identifier) + + self.api.add_events(self.event) + + cf, content = self.api.load_render(self.event.identifier) + self.assertEqual(cf, 
self.event.version.render, + 'Loads the render file') + self.assertEqual(content.read(), b'foocontent', 'Loads render content') + + cf, content = self.api.load_source(self.event.identifier) + self.assertEqual(cf, self.event.version.source, + 'Loads the source file') + self.assertEqual(content.read(), b'foocontent', 'Loads source content') + + + diff --git a/arxiv/canonical/register/util.py b/arxiv/canonical/register/util.py new file mode 100644 index 0000000..b75c6eb --- /dev/null +++ b/arxiv/canonical/register/util.py @@ -0,0 +1,62 @@ + +import collections +from uuid import uuid4 +from typing import Callable, MutableMapping, Any, Dict, Iterator, List, \ + Sequence + + +class LazyMapView(collections.abc.MutableMapping): + def __init__(self, mapping: MutableMapping[Any, Any], + getter: Callable[[Any], Any]) -> None: + self._mapping = mapping + self._getter = getter + + def __len__(self) -> int: + return len(self._mapping) + + def __getitem__(self, key: Any) -> Any: + return self._getter(self._mapping[key]) + + def __iter__(self) -> Iterator[Any]: + return iter(self._mapping) + + def __delitem__(self, key: Any) -> None: + raise NotImplementedError('Deletion is not allowed') + + def __setitem__(self, key: Any, value: Any) -> None: + raise NotImplementedError('not yet') + + +class LazyMap(collections.abc.MutableMapping): + def __init__(self, keys: List[Any], load: Callable[[Any], Any], + strict: bool = False) -> None: + self._keys = keys + self._load = load + self._data: Dict[Any, Any] = {} + self._strict = strict + + def __getitem__(self, key: Any) -> Any: + if self._strict and key not in self._keys: + raise KeyError(f'No such key: {key}') + try: + if key not in self._data: + self._data[key] = self._load(key) + return self._data[key] + except Exception as e: + raise KeyError(f'{key} not found or not supported') from e + + def __len__(self) -> int: + return len(self._keys) + + def __iter__(self) -> Iterator[Any]: + return iter(self._keys) + + def __contains__(self, 
key: Any) -> bool: + return bool(key in self._keys) + + def __delitem__(self, key: Any) -> None: + raise NotImplementedError('Deletion is not allowed') + + def __setitem__(self, key: Any, value: Any) -> None: + self._data[key] = value + self._keys.append(key) \ No newline at end of file diff --git a/arxiv/canonical/register/version.py b/arxiv/canonical/register/version.py new file mode 100644 index 0000000..127a763 --- /dev/null +++ b/arxiv/canonical/register/version.py @@ -0,0 +1,188 @@ +from datetime import date +from functools import partial +from typing import Dict, Iterable, Optional, Sequence, Set, Type, Union + +from .core import (Base, D, R, I, ICanonicalStorage, ICanonicalSource, _Self, + dereference) +from .file import RegisterFile +from .metadata import RegisterMetadata + + +class RegisterVersion(Base[D.VersionedIdentifier, + D.Version, + R.RecordVersion, + I.IntegrityVersion, + str, + Union[RegisterFile, RegisterMetadata]]): + domain_type = D.Version + record_type = R.RecordVersion + integrity_type = I.IntegrityVersion + member_type = RegisterFile + + @classmethod + def create(cls, s: ICanonicalStorage, sources: Sequence[ICanonicalSource], + d: D.Version, save_members: bool = True) -> 'RegisterVersion': + r = R.RecordVersion.from_domain(d, partial(dereference, sources)) + i = I.IntegrityVersion.from_record(r, calculate_new_checksum=True) + members = RegisterVersion._get_v_members(s, i, save_members) + return cls(r.name, domain=d, record=r, integrity=i, members=members) + + @classmethod + def load(cls: Type[_Self], s: ICanonicalStorage, + sources: Sequence[ICanonicalSource], + identifier: D.VersionedIdentifier, + checksum: Optional[str] = None) -> _Self: + """ + Load an e-print :class:`.Version` from s. + + This method is overridden since it uses a different member mapping + struct than higher-level collection types. + """ + # Most of the data needed to reconstitute the Version is in the + # metadata record. 
+ key = R.RecordMetadata.make_key(identifier) + stream, _ = s.load_entry(key) + d = R.RecordMetadata.to_domain(stream) # self.load + _r = R.RecordMetadata(key=key, stream=stream, domain=d) + + # The manifest provides pre-calculated checksums for version members + # (source, render, other formats, etc). + manifest \ + = s.load_manifest(R.RecordVersion.make_manifest_key(identifier)) + r = R.RecordVersion.from_domain(d, partial(dereference, sources), + metadata=_r) + i = I.IntegrityVersion.from_record( + r, + checksum=checksum, + calculate_new_checksum=bool(checksum is None), + manifest=manifest + ) + # This just makes references to the members based on what is already + # loaded in the IntegrityVersion. + members = RegisterVersion._get_v_members(s, i, False) + + return cls(r.name, domain=d, record=r, integrity=i, + members=members) + + @classmethod + def _get_v_members(cls, s: ICanonicalStorage, + integrity: I.IntegrityVersion, + save_members: bool = True) \ + -> Dict[str, Union[RegisterFile, RegisterMetadata]]: + """ + Describe members of this version. + + This is a little different from the base ``_get_members()`` method, + in that we are working from an Integrity object rather than a manifest + alone. + """ + members: Dict[str, Union[RegisterFile, RegisterMetadata]] = {} + member: Union[RegisterFile, RegisterMetadata] + meta: Optional[I.IntegrityMetadata] = None + for i_member in integrity.iter_members(): + if isinstance(i_member.record, R.RecordFile): + assert isinstance(i_member, I.IntegrityEntry) + assert isinstance(i_member.record.domain, D.CanonicalFile) + member = RegisterFile(i_member.name, + domain=i_member.record.domain, + record=i_member.record, + integrity=i_member) + elif isinstance(i_member.record, R.RecordMetadata): + assert isinstance(i_member.record.domain, D.Version) + assert isinstance(i_member, I.IntegrityMetadata) + # Defer handling the metadata member until the end (see below). 
+ meta = i_member + continue + if save_members: + member.save(s) + members[member.name] = member + + # We have deferred handling the metadata until the end, since (if we + # are saving members, especially for the first time) it is possible + # that some of the other members will have changed during the storage + # process due to gzip decompression. + if meta is None: + raise RuntimeError('No IntegrityMetadata member') + meta_record = meta.record + # If we are currently saving, we need to rebuild the metadata record + # that will be stored. + if save_members: + meta_record = R.RecordMetadata.from_domain(meta.record.domain) + meta.set_record(meta_record) + member = RegisterMetadata(meta.name, + domain=meta.record.domain, + record=meta_record, + integrity=meta) + if save_members: + member.save(s) + members[member.name] = member + return members + + @property + def member_names(self) -> Set[str]: + assert self.members is not None + return set([name for name in self.members]) + + @property + def number_of_events(self) -> int: + return 0 + + @property + def number_of_versions(self) -> int: + return 1 + + def update(self, s: ICanonicalStorage, sources: Sequence[ICanonicalSource], + version: D.Version) -> None: + """ + Update a version in place. + + Removes any members (files) not in the passed ``Version``, and retains + and ignores members without any content (assumes that this is a partial + update). Saves any new/changed members, and updates the manifest. + """ + new_version = self.create(s, sources, version, save_members=False) + # assert self.members is not None and new_version.members is not None + to_remove = self.member_names - new_version.member_names + + to_add = [name for name in new_version.members + # Ignore any members without content, as this may be a + # partial update only. + if new_version.members[name].domain is not None + # Select members not already present, or... + and (name not in self.members + # ...that appear to have changed. 
+ or self.members[name].integrity.checksum + != new_version.members[name].integrity.checksum)] + for name in to_remove: + self.members[name].delete(s) + del self.members[name] + altered = set() + for name in to_add: + self.members[name] = new_version.members[name] + altered.add(self.members[name]) + self.save_members(s, altered) # Updates our manifest. + + def save_members(self, s: ICanonicalStorage, + members: Iterable[Union[RegisterFile, RegisterMetadata]]) -> None: + """Save members that have changed, and update our manifest.""" + meta: Optional[RegisterMetadata] = None + for member in members: + if isinstance(member, RegisterMetadata): + meta = member + checksum = member.save(s) + assert checksum is not None + self.integrity.update_or_extend_manifest(member.integrity, + checksum) + + # We have deferred handling the metadata until the end, since it is + # possible that some of the other members will have changed during the + # storage process due to gzip decompression. + if meta is None: + raise RuntimeError('No RegisterMetadata member') + meta.record = R.RecordMetadata.from_domain(meta.record.domain) + meta.integrity.set_record(meta.record) + checksum = meta.save(s) + assert checksum is not None + self.integrity.update_or_extend_manifest(meta.integrity, checksum) + + diff --git a/arxiv/canonical/role/__init__.py b/arxiv/canonical/role/__init__.py new file mode 100644 index 0000000..3e6315e --- /dev/null +++ b/arxiv/canonical/role/__init__.py @@ -0,0 +1,3 @@ +"""Provides high-level APIs for components of the announcement subsystem.""" + +from .role import Primary, Replicant, Observer, Repository diff --git a/arxiv/canonical/role/proxy.py b/arxiv/canonical/role/proxy.py new file mode 100644 index 0000000..531bba5 --- /dev/null +++ b/arxiv/canonical/role/proxy.py @@ -0,0 +1,28 @@ +from typing import Any, Generic, List, Type, TypeVar + +from ..core import IEventStream, IRegisterAPI + + +_Inner = TypeVar('_Inner') + + +class _BaseProxy(Generic[_Inner]): + def 
__init__(self, inner: _Inner, supported: List[str]) -> None: + self._inner = inner + self._supported = supported + + def __getattribute__(self, key: str) -> Any: + if not key.startswith('_'): + if key in self._supported: + return getattr(self._inner, key) + elif hasattr(self._inner, key): + raise AttributeError(f'{key} is not supported by this proxy') + return object.__getattribute__(self, key) + + +class RegisterAPIProxy(_BaseProxy[IRegisterAPI]): + pass + + +class EventStreamProxy(_BaseProxy[IEventStream]): + pass \ No newline at end of file diff --git a/arxiv/canonical/role/register.py b/arxiv/canonical/role/register.py new file mode 100644 index 0000000..9ca62d7 --- /dev/null +++ b/arxiv/canonical/role/register.py @@ -0,0 +1,44 @@ + +from abc import ABC +from typing import Any, List, Optional, Sequence + +from ..register import ICanonicalStorage, RegisterAPI, IRegisterAPI, \ + ICanonicalSource +from .proxy import RegisterAPIProxy + + +class RegisterRole(ABC): + register_supported: List[str] = [] + + @property + def register(self) -> IRegisterAPI: + assert self._register is not None + return self._register + + def set_register(self, storage: Any, sources: Sequence[ICanonicalSource], + name: str = 'all') -> None: + self._register = RegisterAPIProxy(RegisterAPI(storage, sources, name), + self.register_supported) + + +class NoRegister(RegisterRole, ABC): + pass + + +class Reader(RegisterRole, ABC): + register_supported = [ + 'load_listing', + 'load_version', + 'load_eprint', + 'load_history', + 'load_event', + 'load_events', + 'load_source', + 'load_render' + ] + + +class Writer(Reader, ABC): + register_supported = Reader.register_supported + [ + 'add_events', + ] \ No newline at end of file diff --git a/arxiv/canonical/role/role.py b/arxiv/canonical/role/role.py new file mode 100644 index 0000000..2963d84 --- /dev/null +++ b/arxiv/canonical/role/role.py @@ -0,0 +1,75 @@ + +from abc import ABC +from typing import Any, Sequence + +from .. 
import domain as D +from ..core import IEventStream, ICanonicalStorage, IRegisterAPI, \ + ICanonicalSource + +from .register import Reader, Writer, NoRegister +from .stream import Listener, Emitter, NoStream + + +class Role(ABC): + def __init__(self, storage: ICanonicalStorage, + sources: Sequence[ICanonicalSource], + stream: IEventStream, name: str = 'all') -> None: + self.set_register(storage, sources, name) + self.set_stream(stream, sources, name) + + @property + def register(self) -> IRegisterAPI: + raise NotImplementedError('Must be implemented by child role') + + @property + def stream(self) -> IEventStream: + raise NotImplementedError('Must be implemented by child role') + + def set_register(self, storage: ICanonicalStorage, + sources: Sequence[ICanonicalSource], + name: str = 'all') -> None: + raise NotImplementedError('Must be implemented by child role') + + def set_stream(self, stream: IEventStream, + sources: Sequence[ICanonicalSource], + name: str = 'all') -> None: + raise NotImplementedError('Must be implemented by child role') + + +class Primary(Writer, Emitter, Role): + """ + The primary canonical record. + + All events are first written to and emitted from this authoritative + record. + """ + pass + + +class Replicant(Writer, Listener, Role): + """ + A system that transcribes events to a secondary record. + + The primary use-case is for mirror sites. + """ + + def on_event(self, event: D.Event) -> None: + self.register.add_events(event) + + +class Repository(Reader, NoStream, Role): + """A read-only API onto the canonical record.""" + pass + + +class Observer(NoRegister, Listener, Role): + """ + A system that processes canonical e-print events. + + Such a system might perform operations in response to canonical events that + fall outside of the maintenance of the canonical record. For example, it + might update a secondary index with a subset of data in the event stream. 
+ """ + def on_event(self, event: D.Event) -> None: + raise NotImplementedError('Must be implemented by a child class') + diff --git a/arxiv/canonical/role/stream.py b/arxiv/canonical/role/stream.py new file mode 100644 index 0000000..9f32cea --- /dev/null +++ b/arxiv/canonical/role/stream.py @@ -0,0 +1,37 @@ + +from abc import ABC +from typing import Any, List, Optional, Sequence + +from .. import domain as D +from ..core import IEventStream, ICanonicalSource + +from .proxy import EventStreamProxy + + +class StreamRole(ABC): + event_supported: List[str] = [] + + @property + def stream(self) -> IEventStream: + assert self._stream is not None + return self._stream + + def set_stream(self, stream: IEventStream, + sources: Sequence[ICanonicalSource], + name: str = 'all') -> None: + self._stream = EventStreamProxy(stream, self.event_supported) + + +class NoStream(StreamRole, ABC): + pass + + +class Listener(StreamRole, ABC): + event_supported = ['listen'] + + def on_event(self, event: D.Event) -> None: + raise NotImplementedError('Must be implemented by a child class') + + +class Emitter(StreamRole, ABC): + event_supported = ['emit'] \ No newline at end of file diff --git a/arxiv/canonical/role/tests/__init__.py b/arxiv/canonical/role/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/arxiv/canonical/role/tests/test_register.py b/arxiv/canonical/role/tests/test_register.py new file mode 100644 index 0000000..0b2b84f --- /dev/null +++ b/arxiv/canonical/role/tests/test_register.py @@ -0,0 +1,376 @@ +import io +from datetime import datetime +from pytz import UTC +from typing import Callable, Tuple, Dict, List, IO +from unittest import TestCase, mock + +from ...register.api import (RegisterAPI, ICanonicalSource, ICanonicalStorage, + IStorableEntry, Manifest, NoSuchResource, D, R) +from ...services.store import InMemoryStorage +from ..register import Reader, Writer +from ..role import Primary, Repository + + +class RegisterTestCase(TestCase): + def 
setUp(self): + self.mock_source = mock.MagicMock(spec=ICanonicalSource) + self.mock_source.can_resolve.return_value = True + + self.mock_source.load = \ + lambda *a, **k: io.BytesIO(b'foocontent') + self.storage = InMemoryStorage() + self.primary = Primary(self.storage, [self.storage, self.mock_source], mock.MagicMock()) + self.repository = Repository(self.storage, [self.storage, self.mock_source], mock.MagicMock()) + + identifier = D.VersionedIdentifier('2901.00345v1') + created = datetime(2029, 1, 29, 20, 4, 23, tzinfo=UTC) + listing_id = D.ListingIdentifier.from_parts(created.date(), 'foo') + + version = D.Version( + identifier=identifier, + announced_date=created.date(), + announced_date_first=created.date(), + submitted_date=created, + updated_date=created, + is_announced=True, + events=[], + previous_versions=[], + metadata=D.Metadata( + primary_classification=D.Category('cs.DL'), + secondary_classification=[D.Category('cs.IR')], + title='Foo title', + abstract='It is abstract', + authors='Ima N. 
Author (FSU)', + license=D.License(href="http://some.license") + ), + source=D.CanonicalFile( + filename='2901.00345v1.tar', + modified=created, + size_bytes=4_304, + content_type=D.ContentType.tar, + ref=D.URI('/fake/path.tar'), + is_gzipped=False, + ), + render=D.CanonicalFile( + filename='2901.00345v1.pdf', + modified=created, + size_bytes=404, + content_type=D.ContentType.pdf, + ref=D.URI('/fake/path.pdf') + ) + ) + self.event = D.Event( + identifier=identifier, + event_date=created, + event_type=D.EventType.NEW, + categories=[D.Category('cs.DL')], + version=version + ) + self.timestamp = created + self.event_date = self.timestamp.date() + + +class TestPrimary(RegisterTestCase): + """Primary has access to the read and write API for the register.""" + + def test_add_load_event(self): + """Primary can add + load an event.""" + self.primary.register.add_events(self.event) + + event_id = self.event.event_id + self.assertEqual(self.primary.register.load_event(event_id), + self.event, + 'Added event can be loaded again') + self.assertEqual(self.repository.register.load_event(event_id), + self.event, + 'Added event can be loaded again') + + def test_add_load_events_by_date(self): + """Can add events and load them using date selector.""" + events, N = self.primary.register.load_events(self.event_date) + self.assertEqual(N, 0, 'There are no events') + self.assertEqual(len(list(events)), N, 'There are truly no events') + + self.primary.register.add_events(self.event) + + events, N = self.primary.register.load_events(self.event_date) + self.assertEqual(N, 1, 'There is now one event') + event_list = list(events) + self.assertEqual(len(event_list), N, 'There is truly 1 event') + self.assertEqual(event_list[0], self.event, + 'And that event is the one that we just added') + + events, N = self.primary.register.load_events(datetime.now().date()) + self.assertEqual(N, 0, 'But there are no events from today') + self.assertEqual(len(list(events)), N, 'Indeed, no events') + + + 
def test_add_load_events_by_month(self): + """Can add events and load them using month selector.""" + events, N = self.primary.register.load_events((self.event_date.year, + self.event_date.month)) + self.assertEqual(N, 0, 'There are no events') + self.assertEqual(len(list(events)), N, 'There are truly no events') + + self.primary.register.add_events(self.event) + + events, N = self.primary.register.load_events((self.event_date.year, + self.event_date.month)) + self.assertEqual(N, 1, 'There is now one event') + event_list = list(events) + self.assertEqual(len(event_list), N, 'There is truly 1 event') + self.assertEqual(event_list[0], self.event, + 'And that event is the one that we just added') + + events, N = self.primary.register.load_events((datetime.now().year, + datetime.now().month)) + self.assertEqual(N, 0, 'But there are no events from this month') + self.assertEqual(len(list(events)), N, 'Indeed, no events') + + def test_add_load_events_by_year(self): + """Can add events and load them using year selector.""" + events, N = self.primary.register.load_events(self.event_date.year) + self.assertEqual(N, 0, 'There are no events') + self.assertEqual(len(list(events)), N, 'There are truly no events') + + self.primary.register.add_events(self.event) + + events, N = self.primary.register.load_events(self.event_date.year) + self.assertEqual(N, 1, 'There is now one event') + event_list = list(events) + self.assertEqual(len(event_list), N, 'There is truly 1 event') + self.assertEqual(event_list[0], self.event, + 'And that event is the one that we just added') + + events, N = self.primary.register.load_events(datetime.now().year) + self.assertEqual(N, 0, 'But there are no events from this year') + self.assertEqual(len(list(events)), N, 'Indeed, no events') + + def test_can_load_listing(self): + """Can load listings.""" + listing = self.primary.register.load_listing(self.event_date) + self.assertEqual(len(listing.events), 0, 'Listing has no events') + + 
self.primary.register.add_events(self.event) + + listing = self.primary.register.load_listing(self.event_date) + self.assertEqual(len(listing.events), 1, + 'But it has one after we add an event') + self.assertEqual(listing.events[0], self.event, + 'And it is the event that we added') + + def test_can_load_version(self): + """Can load a Version that was created via an event.""" + with self.assertRaises(Exception): + self.primary.register.load_version(self.event.identifier) + + self.primary.register.add_events(self.event) + + version = self.primary.register.load_version(self.event.identifier) + self.assertEqual(version, self.event.version, + 'Can load the Version that we just added') + + def test_can_load_eprint(self): + """Can load an EPrint that was created via an event.""" + with self.assertRaises(Exception): + self.primary.register.load_eprint(self.event.identifier.arxiv_id) + + self.primary.register.add_events(self.event) + + eprint = self.primary.register.load_eprint(self.event.identifier.arxiv_id) + self.assertEqual(eprint.versions[self.event.identifier], + self.event.version, + 'Can load the Version that we just added') + + def test_can_load_history(self): + """Can load the event history of a Version or EPrint.""" + with self.assertRaises(NoSuchResource): + self.primary.register.load_history(self.event.identifier.arxiv_id) + with self.assertRaises(NoSuchResource): + self.primary.register.load_history(self.event.identifier) + + self.primary.register.add_events(self.event) + + summary = next( + self.primary.register.load_history(self.event.identifier.arxiv_id) + ) + self.assertEqual(summary, self.event.summary, + 'History includes a summary of our event') + + summary = next( + self.primary.register.load_history(self.event.identifier) + ) + self.assertEqual(summary, self.event.summary, + 'History includes a summary of our event') + + def test_can_load_render(self): + """Can load an EPrint that was created via an event.""" + with 
self.assertRaises(NoSuchResource): + self.primary.register.load_render(self.event.identifier) + + self.primary.register.add_events(self.event) + + cf, content = self.primary.register.load_render(self.event.identifier) + self.assertEqual(cf, self.event.version.render, + 'Loads the render file') + self.assertEqual(content.read(), b'foocontent', 'Loads render content') + + cf, content = self.primary.register.load_source(self.event.identifier) + self.assertEqual(cf, self.event.version.source, + 'Loads the source file') + self.assertEqual(content.read(), b'foocontent', 'Loads source content') + + +class TestRepository(RegisterTestCase): + """Repository is a read-only role.""" + + def test_cannot_add_events(self): + """Repository cannot add events.""" + with self.assertRaises(AttributeError): + self.repository.register.add_events(self.event) + + def test_load_events_by_date(self): + """Can load events using date selector.""" + events, N = self.repository.register.load_events(self.event_date) + self.assertEqual(N, 0, 'There are no events') + self.assertEqual(len(list(events)), N, 'There are truly no events') + + self.primary.register.add_events(self.event) + + events, N = self.repository.register.load_events(self.event_date) + self.assertEqual(N, 1, 'There is now one event') + event_list = list(events) + self.assertEqual(len(event_list), N, 'There is truly 1 event') + self.assertEqual(event_list[0], self.event, + 'And that event is the one that we just added') + + events, N = self.repository.register.load_events(datetime.now().date()) + self.assertEqual(N, 0, 'But there are no events from today') + self.assertEqual(len(list(events)), N, 'Indeed, no events') + + def test_add_load_events_by_month(self): + """Can add events and load them using month selector.""" + selector = (self.event_date.year, self.event_date.month) + events, N = self.repository.register.load_events(selector) + self.assertEqual(N, 0, 'There are no events') + self.assertEqual(len(list(events)), N, 'There 
are truly no events') + + self.primary.register.add_events(self.event) + + events, N = self.repository.register.load_events(selector) + self.assertEqual(N, 1, 'There is now one event') + event_list = list(events) + self.assertEqual(len(event_list), N, 'There is truly 1 event') + self.assertEqual(event_list[0], self.event, + 'And that event is the one that we just added') + + selector = (datetime.now().year, datetime.now().month) + events, N = self.repository.register.load_events(selector) + self.assertEqual(N, 0, 'But there are no events from this month') + self.assertEqual(len(list(events)), N, 'Indeed, no events') + + def test_add_load_events_by_year(self): + """Can load events using year selector.""" + events, N = self.repository.register.load_events(self.event_date.year) + self.assertEqual(N, 0, 'There are no events') + self.assertEqual(len(list(events)), N, 'There are truly no events') + + self.primary.register.add_events(self.event) + + events, N = self.repository.register.load_events(self.event_date.year) + self.assertEqual(N, 1, 'There is now one event') + event_list = list(events) + self.assertEqual(len(event_list), N, 'There is truly 1 event') + self.assertEqual(event_list[0], self.event, + 'And that event is the one that we just added') + + events, N = self.repository.register.load_events(datetime.now().year) + self.assertEqual(N, 0, 'But there are no events from this year') + self.assertEqual(len(list(events)), N, 'Indeed, no events') + + def test_can_load_listing(self): + """Can load listings.""" + listing = self.repository.register.load_listing(self.event_date) + self.assertEqual(len(listing.events), 0, 'Listing has no events') + + self.primary.register.add_events(self.event) + + listing = self.repository.register.load_listing(self.event_date) + self.assertEqual(len(listing.events), 1, + 'But it has one after we add an event') + self.assertEqual(listing.events[0], self.event, + 'And it is the event that we added') + + def test_can_load_version(self): 
+ """Can load a Version that was created via an event.""" + with self.assertRaises(Exception): + self.repository.register.load_version(self.event.identifier) + + self.primary.register.add_events(self.event) + + version = self.repository.register.load_version(self.event.identifier) + self.assertEqual(version, self.event.version, + 'Can load the Version that we just added') + + def test_can_load_eprint(self): + """Can load an EPrint that was created via an event.""" + with self.assertRaises(Exception): + self.repository.register.load_eprint( + self.event.identifier.arxiv_id + ) + + self.primary.register.add_events(self.event) + + eprint = self.repository.register.load_eprint( + self.event.identifier.arxiv_id + ) + self.assertEqual(eprint.versions[self.event.identifier], + self.event.version, + 'Can load the Version that we just added') + + def test_can_load_history(self): + """Can load the event history of a Version or EPrint.""" + with self.assertRaises(NoSuchResource): + self.repository.register.load_history( + self.event.identifier.arxiv_id + ) + with self.assertRaises(NoSuchResource): + self.repository.register.load_history(self.event.identifier) + + self.primary.register.add_events(self.event) + + summary = next( + self.repository.register.load_history( + self.event.identifier.arxiv_id + ) + ) + self.assertEqual(summary, self.event.summary, + 'History includes a summary of our event') + + summary = next( + self.repository.register.load_history(self.event.identifier) + ) + self.assertEqual(summary, self.event.summary, + 'History includes a summary of our event') + + def test_can_load_render(self): + """Can load an EPrint that was created via an event.""" + with self.assertRaises(NoSuchResource): + self.repository.register.load_render(self.event.identifier) + + self.primary.register.add_events(self.event) + + cf, content = self.repository.register.load_render( + self.event.identifier + ) + self.assertEqual(cf, self.event.version.render, + 'Loads the render file') 
+ self.assertEqual(content.read(), b'foocontent', 'Loads render content') + + cf, content = self.repository.register.load_source( + self.event.identifier + ) + self.assertEqual(cf, self.event.version.source, + 'Loads the source file') + self.assertEqual(content.read(), b'foocontent', 'Loads source content') + diff --git a/arxiv/canonical/serialize/__init__.py b/arxiv/canonical/serialize/__init__.py index 8a0f5da..9a5e18d 100644 --- a/arxiv/canonical/serialize/__init__.py +++ b/arxiv/canonical/serialize/__init__.py @@ -3,15 +3,15 @@ from typing import Any import json -from .encoder import CanonicalJSONEncoder -from .decoder import CanonicalJSONDecoder +from .encoder import CanonicalEncoder +from .decoder import CanonicalDecoder def dumps(obj: Any) -> str: """Generate JSON from a Python object.""" - return json.dumps(obj, cls=CanonicalJSONEncoder) + return json.dumps(obj, cls=CanonicalEncoder) def loads(data: str) -> Any: """Load a Python object from JSON.""" - return json.loads(data, cls=CanonicalJSONDecoder) + return json.loads(data, cls=CanonicalDecoder) diff --git a/arxiv/canonical/serialize/classic/__init__.py b/arxiv/canonical/serialize/classic/__init__.py deleted file mode 100644 index a6d7e58..0000000 --- a/arxiv/canonical/serialize/classic/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -"""(De)Serialization of the classic announcement record.""" - -from . import abs, daily diff --git a/arxiv/canonical/serialize/classic/abs.py b/arxiv/canonical/serialize/classic/abs.py deleted file mode 100644 index bb98753..0000000 --- a/arxiv/canonical/serialize/classic/abs.py +++ /dev/null @@ -1,210 +0,0 @@ -"""Parse fields from a single arXiv abstract (.abs) file.""" - -import os -import re -from typing import Any, Dict, List, Optional, Tuple -from functools import wraps -from dateutil import parser -from pytz import timezone -from datetime import datetime -from dateutil.tz import tzutc, gettz - -from ... 
import domain - -EASTERN = gettz('US/Eastern') - -RE_ABS_COMPONENTS = re.compile(r'^\\\\\n', re.MULTILINE) -RE_FROM_FIELD = re.compile( - r'(?PFrom:\s*)(?P[^<]+)?\s+(<(?P.*)>)?') -RE_DATE_COMPONENTS = re.compile( - r'^Date\s*(?::|\(revised\s*(?P.*?)\):)\s*(?P.*?)' - r'(?:\s+\((?P\d+)kb,?(?P.*)\))?$') -RE_FIELD_COMPONENTS = re.compile( - r'^(?P[-a-z\)\(]+\s*):\s*(?P.*)', re.IGNORECASE) -RE_ARXIV_ID_FROM_PREHISTORY = re.compile( - r'(Paper:\s+|arXiv:)(?P\S+)') - -NAMED_FIELDS = ['Title', 'Authors', 'Categories', 'Comments', 'Proxy', - 'Report-no', 'ACM-class', 'MSC-class', 'Journal-ref', - 'DOI', 'License'] -""" -Fields that may be parsed from the key-value pairs in second -major component of .abs string. Field names are not normalized. -""" - -REQUIRED_FIELDS = ['title', 'authors', 'abstract'] -""" -Required parsed fields with normalized field names. - -Note the absense of 'categories' as a required field. A subset of version- -affixed .abs files with the old identifiers predate the introduction of -categories and therefore do not have a "Categories:" line; only the (higher- -level) archive and group can be be inferred, and this must be done via the -identifier itself. - -The latest versions of these papers should always have the "Categories:" line. 
-""" - -# arXiv ID format used from 1991 to 2007-03 -RE_ARXIV_OLD_ID = re.compile( - r'^(?P[a-z]{1,}(\-[a-z]{2,})?)(\.([a-zA-Z\-]{2,}))?\/' - r'(?P(?P\d\d)(?P\d\d))(?P\d\d\d)' - r'(v(?P[1-9]\d*))?([#\/].*)?$') - -# arXiv ID format used from 2007-04 to present -RE_ARXIV_NEW_ID = re.compile( - r'^(?P(?P\d\d)(?P\d\d))\.(?P\d{4,5})' - r'(v(?P[1-9]\d*))?([#\/].*)?$' -) - -ASSUMED_LICENSE = domain.License( - href='http://arxiv.org/licenses/nonexclusive-distrib/1.0/' -) - - -def parse(path: str) -> domain.EPrint: - with open(path, mode='r', encoding='latin-1') as f: - raw = f.read() - - # TODO: clean up - modified = datetime.fromtimestamp(os.path.getmtime(path), tz=EASTERN) - modified = modified.astimezone(tz=tzutc()) - - # there are two main components to an .abs file that contain data, - # but the split must always return four components - components = RE_ABS_COMPONENTS.split(raw) - if not len(components) == 4: - raise IOError('Unexpected number of components parsed from .abs.') - - # everything else is in the second main component - prehistory, misc_fields = re.split(r'\n\n', components[1]) - - fields: Dict[str, Any] = _parse_metadata(key_value_block=misc_fields) - fields['abstract'] = components[2] # abstract is the first main component - - id_match = RE_ARXIV_ID_FROM_PREHISTORY.match(prehistory) - - if not id_match: - raise IOError('Could not extract arXiv ID from prehistory component.') - - arxiv_id = id_match.group('arxiv_id') - prehistory = re.sub(r'^.*\n', '', prehistory) - parsed_version_entries = re.split(r'\n', prehistory) - - # submitter data - from_match = RE_FROM_FIELD.match(parsed_version_entries.pop(0)) - if not from_match: - raise IOError('Could not extract submitter data.') - - name = from_match.group('name') - if name is not None: - name = name.rstrip() - - # get the version history for this particular version of the document - if not len(parsed_version_entries) >= 1: - raise IOError('At least one version entry expected.') - - versions = 
_parse_versions(arxiv_id=arxiv_id, - version_entry_list=parsed_version_entries) - - secondary_classification = [] - - if 'categories' in fields and fields['categories']: - classifications = fields['categories'].split() - primary_classification = classifications[0] - secondary_classification = classifications[1:] - else: - match = RE_ARXIV_OLD_ID.match(arxiv_id) - if not match: - raise IOError('Could not determine primary classification') - primary_classification = match.group('archive') - - if 'license' in fields: - license = domain.License(fields['license']) - else: - license = ASSUMED_LICENSE - - return domain.EPrint( - arxiv_id=arxiv_id, - version=versions[-1].version, - legacy=True, - submitter=domain.Person(full_name=name) if name else None, - submitted_date=versions[-1].submitted_date, - announced_date='', - license=license, - primary_classification=primary_classification, - title=fields['title'], - abstract=fields['abstract'], - authors=fields['authors'], - source_type=versions[-1].source_type, - size_kilobytes=versions[-1].size_kilobytes, - secondary_classification=secondary_classification, - journal_ref=fields.get('journal_ref'), - report_num=fields.get('report_num'), - doi=fields.get('doi'), - msc_class=fields.get('msc_class'), - acm_class=fields.get('acm_class'), - proxy=fields.get('proxy'), - comments=fields.get('comments', ''), - previous_versions=versions[:-1], - history=[] - ) - - -def _parse_metadata(key_value_block: str) -> Dict[str, str]: - """Parse the key-value block from the arXiv .abs string.""" - key_value_block = key_value_block.lstrip() - field_lines = re.split(r'\n', key_value_block) - field_name = 'unknown' - fields_builder: Dict[str, str] = {} - for field_line in field_lines: - field_match = RE_FIELD_COMPONENTS.match(field_line) - if field_match and field_match.group('field') in NAMED_FIELDS: - field_name = field_match.group( - 'field').lower().replace('-', '_') - field_name = re.sub(r'_no$', '_num', field_name) - 
fields_builder[field_name] = field_match.group( - 'value').rstrip() - elif field_name != 'unknown': - # we have a line with leading spaces - fields_builder[field_name] += re.sub(r'^\s+', ' ', field_line) - return fields_builder - - -def _parse_announced(arxiv_id: str) -> str: - match = RE_ARXIV_OLD_ID.match(arxiv_id) - if not match: - match = RE_ARXIV_NEW_ID.match(arxiv_id) - if not match: - raise ValueError('Not a valid arXiv ID') - yy = int(match.group('yy')) - mm = int(match.group('mm')) - year = f'19{yy}' if yy > 90 else f'20{yy}' - return f'{year}-{mm}' - - -def _parse_versions(arxiv_id: str, version_entry_list: List) \ - -> List[domain.VersionReference]: - """Parse the version entries from the arXiv .abs file.""" - version_entries = list() - for parsed_version_entry in version_entry_list: - date_match = RE_DATE_COMPONENTS.match(parsed_version_entry) - if not date_match: - raise IOError('Could not extract date components from date line.') - try: - sd = date_match.group('date') - submitted_date = parser.parse(date_match.group('date')) - except (ValueError, TypeError): - raise IOError(f'Could not parse submitted date {sd} as datetime') - - source_type = date_match.group('source_type') - size_kilobytes = int(date_match.group('size_kilobytes')) - version_entries.append( - domain.VersionReference(arxiv_id=arxiv_id, - version=len(version_entries) + 1, - submitted_date=submitted_date, - announced_date=_parse_announced(arxiv_id), - source_type=source_type, - size_kilobytes=size_kilobytes)) - - return version_entries diff --git a/arxiv/canonical/serialize/decoder.py b/arxiv/canonical/serialize/decoder.py index 1d22747..93ebd80 100644 --- a/arxiv/canonical/serialize/decoder.py +++ b/arxiv/canonical/serialize/decoder.py @@ -1,37 +1,41 @@ -"""Provides a :class:`.CanonicalJSONDecoder` for domain objects.""" +"""Provides a :class:`.CanonicalDecoder` for domain objects.""" import json -from typing import Any, Union, List, Dict from datetime import datetime, date from enum 
import Enum +from typing import Any, Union, List, Dict, GenericMeta +from typing import TypingMeta # type: ignore ; it's really there... +from uuid import UUID from backports.datetime_fromisoformat import MonkeyPatch -from . import classic from .. import domain MonkeyPatch.patch_fromisoformat() -class CanonicalJSONDecoder(json.JSONDecoder): +class CanonicalDecoder(json.JSONDecoder): """Decode domain objects.""" def __init__(self, *args: Any, **kwargs: Any) -> None: """Pass :func:`object_hook` to the base constructor.""" kwargs['object_hook'] = kwargs.get('object_hook', self.object_hook) - super(CanonicalJSONDecoder, self).__init__(*args, **kwargs) + super(CanonicalDecoder, self).__init__(*args, **kwargs) def _try_isoparse(self, value: Any) -> Any: """Attempt to parse a value as an ISO8601 datetime.""" if type(value) is not str: return value try: - return datetime.fromisoformat(value) # type: ignore + return date.fromisoformat(value) # type: ignore ; pylint: disable=no-member except ValueError: - return value + try: + return datetime.fromisoformat(value) # type: ignore ; pylint: disable=no-member + except ValueError: + return value - def object_hook(self, obj: dict, **extra: Any) -> Any: + def object_hook(self, obj: dict, **extra: Any) -> Any: # pylint: disable=method-hidden """Decode domain objects in this package.""" if isinstance(obj, dict): for key, value in obj.items(): @@ -40,9 +44,26 @@ def object_hook(self, obj: dict, **extra: Any) -> Any: else: obj[key] = self._try_isoparse(value) - obj_type = obj.pop('@type') + # Look for and instantiate the domain class that corresponds to the + # stated type of the data. + obj_type = obj.pop('@type', None) + if obj_type is None: + return obj for domain_class in domain.domain_classes: if domain_class.__name__ == obj_type: + # Look for easy wins on casting field data to the correct + # type. The main use-case is for enums. 
+ for field, ftype in domain_class.__annotations__.items(): # pylint: disable=protected-access + # These are things like Union, List, etc that don't + # have a concrete type. Too hard to take this on. + if isinstance(ftype, GenericMeta) \ + or isinstance(type(ftype), TypingMeta): + continue + # Otherwise, this is a concrete type. We can try + # to cast here. + if field in obj \ + and not isinstance(obj[field], ftype): + obj[field] = ftype(obj[field]) return domain_class(**obj) return obj diff --git a/arxiv/canonical/serialize/encoder.py b/arxiv/canonical/serialize/encoder.py index 33672a1..bb83c9d 100644 --- a/arxiv/canonical/serialize/encoder.py +++ b/arxiv/canonical/serialize/encoder.py @@ -1,14 +1,16 @@ -"""Provides a :class:`.CanonicalJSONEncoder` for domain objects.""" +"""Provides a :class:`.CanonicalEncoder` for domain objects.""" -import re import json -from typing import Any, Union, List, Dict +import re + from datetime import datetime, date from enum import Enum +from typing import Any, Union, List, Dict, Type +from uuid import UUID from backports.datetime_fromisoformat import MonkeyPatch -from . import classic +from .. import classic from .. 
import domain @@ -20,18 +22,18 @@ def _camel_to_snake(camel: str) -> str: return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() -class CanonicalJSONEncoder(json.JSONEncoder): +class CanonicalEncoder(json.JSONEncoder): """Encodes domain objects in this package for serialization.""" def unpack(self, obj: Any) -> Any: """Recursively search for domain objects, and unpack them to dicts.""" if isinstance(obj, dict): - return {key: self.unpack(value) for key, value in obj.items()} + return {self.unpack(key): self.unpack(value) for key, value in obj.items()} elif isinstance(obj, list): return [self.unpack(value) for value in obj] - elif type(obj) in domain.domain_classes: + elif isinstance(obj, domain.CanonicalBase): type_snake = _camel_to_snake(type(obj).__name__) - unpack_obj = getattr(self, f'unpack_{type_snake}', + unpack_obj = getattr(self, f'unpack_{type_snake}', self.unpack_default) data = unpack_obj(obj) data['@type'] = type(obj).__name__ @@ -46,15 +48,18 @@ def unpack(self, obj: Any) -> Any: def encode(self, obj: Any) -> Any: """Serialize objects in this application domain.""" - return super(CanonicalJSONEncoder, self).encode(self.unpack(obj)) - + return super(CanonicalEncoder, self).encode(self.unpack(obj)) + def unpack_default(self, obj: Any) -> Dict: """Fallback unpack method for any domain object.""" - return {key: self.unpack(val) for key, val in obj._asdict().items()} - - def unpack_file(self, obj: domain.File) -> Dict: + return {key: self.unpack(getattr(obj, key)) + for key in obj.__annotations__.keys()} + + def unpack_canonical_file(self, obj: domain.CanonicalFile) -> Dict: """Unpack a :class:`.domain.File`.""" - return {key: self.unpack(val) for key, val in obj._asdict().items() - if key != 'content'} + return {key: self.unpack(getattr(obj, key)) + for key in obj.__annotations__.keys() if key != 'content'} + def unpack_uuid(self, obj: UUID) -> Dict: + return {'hex': obj.hex} diff --git a/arxiv/canonical/serialize/record.py 
b/arxiv/canonical/serialize/record.py new file mode 100644 index 0000000..ffea075 --- /dev/null +++ b/arxiv/canonical/serialize/record.py @@ -0,0 +1,30 @@ +""" +Serializers for low-level elements of the canonical record. + +Specifically, this maps concepts in :mod:`.domain` to low-level elements in +:mod:`arxiv.canonical.record` and visa-versa. +""" + +from io import BytesIO +from json import dumps, load +from typing import Callable, IO, Tuple + +from ..domain import Version, ContentType, Listing, CanonicalFile, \ + VersionedIdentifier, URI +from ..record import RecordStream, RecordVersion, RecordMetadata, \ + RecordEntryMembers, RecordListing +from .decoder import CanonicalDecoder +from .encoder import CanonicalEncoder + +Key = str +ContentLoader = Callable[[Key], IO[bytes]] + + + + + + + + + + diff --git a/arxiv/canonical/serialize/tests/__init__.py b/arxiv/canonical/serialize/tests/__init__.py new file mode 100644 index 0000000..4894929 --- /dev/null +++ b/arxiv/canonical/serialize/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for :mod:`.serialize.record`.""" \ No newline at end of file diff --git a/arxiv/canonical/tests/test_serialize.py b/arxiv/canonical/serialize/tests/test_serialize.py similarity index 97% rename from arxiv/canonical/tests/test_serialize.py rename to arxiv/canonical/serialize/tests/test_serialize.py index ebd2808..b9852a3 100644 --- a/arxiv/canonical/tests/test_serialize.py +++ b/arxiv/canonical/serialize/tests/test_serialize.py @@ -6,7 +6,7 @@ import jsonschema -from .. import domain, serialize +from ... 
import domain, serialize class TestSerializeDeserialize(TestCase): diff --git a/arxiv/canonical/services/__init__.py b/arxiv/canonical/services/__init__.py index 7955239..85a359d 100644 --- a/arxiv/canonical/services/__init__.py +++ b/arxiv/canonical/services/__init__.py @@ -1,3 +1,5 @@ """Service integrations.""" -from .store import FakeCanonicalStore, CanonicalStore \ No newline at end of file +from .filesystem import Filesystem, CanonicalFilesystem +from .remote import RemoteSource +from .store import CanonicalStore, InMemoryStorage \ No newline at end of file diff --git a/arxiv/canonical/services/filesystem.py b/arxiv/canonical/services/filesystem.py new file mode 100644 index 0000000..b09a1e1 --- /dev/null +++ b/arxiv/canonical/services/filesystem.py @@ -0,0 +1,152 @@ +import gzip +import io +import json +import logging +import os +from datetime import datetime +from http import HTTPStatus as status +from typing import IO, Iterable, List, Tuple, Union +from urllib3.util.retry import Retry + +import requests +from pytz import timezone, UTC + +from .. import domain as D +from .. import record as R +from .. 
class Filesystem(ICanonicalSource):
    """Retrieves content from a filesystem (outside the canonical record)."""

    def __init__(self, base_path: str) -> None:
        """Remember the directory below which content may be served."""
        self._base_path = base_path

    def _make_path(self, uri: D.URI) -> str:
        """Return the absolute filesystem path for the path part of ``uri``."""
        return os.path.abspath(uri.path)

    def can_resolve(self, uri: D.URI) -> bool:
        """
        Return True if ``uri`` is a file URI located under the base path.

        Containment is decided on whole path components. A plain substring
        test would also accept unrelated locations such as ``/other/data/x``
        or ``/database/x`` for a base path of ``/data``.
        """
        if not uri.is_file:
            return False
        base = os.path.abspath(self._base_path)
        target = os.path.abspath(uri.path)
        try:
            return os.path.commonpath([base, target]) == base
        except ValueError:
            # commonpath() refuses paths that cannot share a prefix at all
            # (e.g. different drives); such a target is outside the base.
            return False

    def load(self, uri: D.URI) -> IO[bytes]:
        """
        Open the file behind ``uri`` for reading in binary mode.

        The file is opened immediately; the caller is responsible for
        closing the returned handle.

        Raises
        ------
        RuntimeError
            If ``uri`` cannot be resolved by this source.

        """
        if not self.can_resolve(uri):
            raise RuntimeError(f'Cannot resolve this URI: {uri}')
        return open(self._make_path(uri), 'rb')
domain=D.CanonicalFile( + modified=datetime.fromtimestamp(os.path.getmtime(path), tz=ET) + .astimezone(tz=UTC), + filename=uri.filename, + size_bytes=size_bytes, + content_type=content_type, + ref=uri + ), + content=pointer, + content_type=content_type, + size_bytes=size_bytes + ) + return stream, I.calculate_checksum(pointer) + + def load_manifest(self, key: D.Key) -> Manifest: + """Load an integrity manifest.""" + if not self.can_store(key): + raise RuntimeError(f'Cannot load this manifest: {key}') + with open(self._make_path(key), 'r') as f: + manifest: Manifest = json.load(f, cls=ManifestDecoder) + return manifest + + def store_entry(self, ri: IStorableEntry) -> None: + """Store an entry in the record.""" + if not self.can_store(ri.record.key) or not ri.record.stream.content: + logger.error(f'Cannot store: {ri.record.key}') + raise RuntimeError(f'Cannot store: {ri.record.key}') + + path = self._make_path(ri.record.key) + # Make sure that we have a place to put the file. + parent, _ = os.path.split(path) + if not os.path.exists(parent): + os.makedirs(parent) + + # Ensure that we are starting from the beginning of the stream. + logger.debug('Ready to write to %s from %s', path, ri.record.stream) + if ri.record.stream.content.seekable(): + ri.record.stream.content.seek(0) + + # Write the content to the target file, being sure to decompress if + # necessary. + content: Union[IO[bytes], gzip.GzipFile] + if ri.record.stream.domain.is_gzipped: + content = gzip.GzipFile(fileobj=ri.record.stream.content) + else: + content = ri.record.stream.content + with open(path, 'wb') as f: + while True: + chunk = content.read(4096) + if not chunk: + break + f.write(chunk) + + # Sanity check. + size_bytes = os.path.getsize(path) + logger.debug('Wrote %i bytes to %s', size_bytes, path) + if size_bytes == 0: + raise IOError(f'Wrote {size_bytes} bytes to {path}') + + # Update the CanonicalFile to reflect the fact that we decompressed + # the content. 
+ if ri.record.stream.domain.is_gzipped: + ri.record.stream.domain.is_gzipped = False + ri.record.stream.domain.size_bytes = size_bytes + ri.record.stream = ri.record.stream._replace( + content=self.load(ri.record.key) + ) + ri.update_checksum() + + def store_manifest(self, key: D.Key, manifest: Manifest) -> None: + """Store an integrity manifest.""" + if not self.can_store(key): + raise RuntimeError(f'Cannot store this manifest: {key}') + path = self._make_path(key) + parent, _ = os.path.split(path) + if not os.path.exists(parent): + os.makedirs(parent) # Pave the way! + with open(path, 'w') as f: + f.write(json.dumps(manifest, cls=ManifestEncoder)) diff --git a/arxiv/canonical/services/readable.py b/arxiv/canonical/services/readable.py new file mode 100644 index 0000000..8047841 --- /dev/null +++ b/arxiv/canonical/services/readable.py @@ -0,0 +1,162 @@ +"""Provides :class:`.BytesIOProxy`.""" + +import io +from typing import Any, Callable, IO, List, Optional, Iterable, Iterator + +from typing_extensions import Literal + + +class BytesIOProxy(io.BytesIO): + """ + A readable object that wraps a ``read()`` callable. + + This gives us lazy, proxied read access to a (presumably expensive) + resource that is consistent with ``io.IOBase``. 
+ """ + + def __init__(self, read: Callable[[], bytes]) -> None: + self._read: Optional[Callable[[], bytes]] = read + self._content: Optional[IO[bytes]] = None + + @property + def _loaded_content(self) -> IO[bytes]: + if self._read is None: + raise ValueError('Resource is closed') + if self._content is None: + self._content = io.BytesIO(self._read()) + return self._content + + def close(self) -> None: + """Flush and close this stream.""" + if self._content is not None: + self._content.close() + else: + self._read = None + super(BytesIOProxy, self).close() + + def fileno(self) -> int: + """Return the underlying file descriptor of the stream if it exists.""" + raise OSError('No underlying file') + + def flush(self) -> None: + """Flush the write buffers of the stream if applicable.""" + return + + def isatty(self) -> bool: + """Return True if the stream is interactive.""" + return False + + def readable(self) -> bool: + """Return True if the stream can be read from.""" + if self._content is not None: + return self._content.readable() + if self._read is None: + raise ValueError('I/O attempted on closed stream') + return True + + def readline(self, size: int = -1) -> bytes: + """Read and return one line from the stream.""" + return self._loaded_content.readline(size) + + def readlines(self, hint: int = -1) -> List[bytes]: + """Read and return a list of lines from the stream.""" + return self._loaded_content.readlines(hint) + + def read(self, size: Optional[int] = -1) -> bytes: + """Read from the stream.""" + if size is not None: + return self._loaded_content.read(size) + return self._loaded_content.read() + + def seek(self, offset: int, whence: int = 0) -> int: + """Change the stream position to the given byte offset.""" + if self._content is not None: + return self._content.seek(offset, whence) + return 0 + + def seekable(self) -> bool: + """Return True if the stream supports random access.""" + if self._content is not None: + return self._content.seekable() + return 
class IterReadWrapper(io.BytesIO):
    """
    Wraps a response body streaming iterator to provide ``read()``.

    Chunks pulled from the iterator are accumulated in an internal buffer, so
    content that has already been consumed can be re-read after a ``seek()``.
    """

    def __init__(self, iter_content: Callable[[int], Iterator[bytes]],
                 size: int = 4096) -> None:
        """
        Initialize the streaming iterator.

        Parameters
        ----------
        iter_content : callable
            Called once with ``size``; must return an iterator of byte chunks.
        size : int
            Chunk size passed to ``iter_content``.

        """
        self._iter_content = iter_content(size)
        self._buff = bytearray()
        self._pos = 0

    def seek(self, offset: int, whence: int = 0) -> int:
        """
        Change the stream position to the given absolute byte offset.

        Raises
        ------
        NotImplementedError
            If ``whence`` is anything other than 0 (start of stream).
        ValueError
            If ``offset`` is negative.

        """
        if whence != 0:
            raise NotImplementedError('Only supports 0-based seeks')
        if offset < 0:
            # A negative position would make later slices wrap around.
            raise ValueError(f'negative seek value {offset}')
        if offset > self._pos:
            # Moving forward: make sure the buffer reaches the new position.
            self._read_ahead(offset + 1)
        self._pos = offset
        return self._pos

    def seekable(self) -> Literal[True]:
        """Indicate that this is a seekable stream."""
        return True

    def tell(self) -> int:
        """Return the current stream position."""
        return self._pos

    def readable(self) -> Literal[True]:
        """Indicate that it *is* a readable stream."""
        return True

    def read(self, size: Optional[int] = -1) -> bytes:
        """
        Read from the content stream, loading more content if necessary.

        ``None`` or any negative ``size`` reads everything that remains.
        The result is always an immutable ``bytes`` object.
        """
        if size is None or size < 0:  # Read everything!
            self._buff.extend(b''.join(self._iter_content))
            content = self._buff[self._pos:]
        else:
            if size > len(self._buff) - self._pos:
                self._read_ahead(self._pos + size)
            content = self._buff[self._pos:self._pos + size]
        self._pos += len(content)
        # Slicing a bytearray yields a bytearray; hand out real bytes.
        return bytes(content)

    def _read_ahead(self, offset: int) -> None:
        """Buffer chunks until ``offset`` bytes are held or input runs out."""
        while offset > len(self._buff):
            try:
                chunk = next(self._iter_content)
            except StopIteration:
                break   # No more content to read.
            if not chunk:   # May issue empty chunks due to keep-alive.
                continue
            self._buff.extend(chunk)
class DeferredRequestReader(io.BytesIO):
    """
    IO[bytes] object that reads lazily via an HTTP request.

    No request is made at construction time. The first call that needs the
    underlying reader (``read``, ``seek``, ``seekable`` or ``tell``) performs
    the request; the resulting reader is then reused.
    """

    def __init__(self, method: Callable[..., requests.Response],
                 uri: D.URI, stream: bool = True) -> None:
        """
        Store what is needed to perform the request later.

        Parameters
        ----------
        method : callable
            Called as ``method(str(uri), stream=stream)``; must return a
            response exposing ``status_code``, ``headers`` and
            ``iter_content``.
        uri : :class:`D.URI`
            Location of the remote resource.
        stream : bool
            Passed through to ``method`` on every call.

        """
        self._method = method
        self._uri = uri
        self._stream = stream
        self._loaded_reader: Optional[IO[bytes]] = None

    @property
    def _reader(self) -> IO[bytes]:
        """Return the underlying reader, creating it on first access."""
        if self._loaded_reader is None:
            self._loaded_reader = self._get_reader()
        return self._loaded_reader

    def _get_reader(self) -> IO[bytes]:
        """
        Perform the request and wrap the response body in a reader.

        While the response is a 200 carrying a ``Refresh`` header, sleep for
        the number of seconds given by that header and request again.

        Raises
        ------
        IOError
            If the final response status is not 200.

        """
        response = self._method(str(self._uri), stream=self._stream)
        while response.status_code == 200 and 'Refresh' in response.headers:
            time.sleep(int(response.headers['Refresh']))
            response = self._method(str(self._uri), stream=self._stream)
        if response.status_code != 200:
            raise IOError(f'Could not retrieve {self._uri}:'
                          f' {response.status_code}')
        return IterReadWrapper(response.iter_content)

    def read(self, size: Optional[int] = -1) -> bytes:
        """Read from the remote resource; ``None`` means read everything."""
        if size is None:
            size = -1
        return self._reader.read(size)

    def seek(self, offset: int, whence: int = 0) -> int:
        """Move the underlying reader and return the new position."""
        # ``whence`` is passed positionally: C-implemented streams do not
        # accept it as a keyword argument.
        return self._reader.seek(offset, whence)

    def seekable(self) -> bool:
        """Return whether the underlying reader supports random access."""
        return self._reader.seekable()

    def tell(self) -> int:
        """Return the current position of the underlying reader."""
        return self._reader.tell()
+Provides a :class:`.CanonicalStore` that stores resources in S3, using +:mod:`.serialize.record` to serialize and deserialize resources. """ - +import gzip import io -from unittest import mock # TODO: remove this when fakes are no longer used. -from typing import Optional, Dict, Any +import logging +import os +from base64 import urlsafe_b64decode, urlsafe_b64encode +from binascii import hexlify, unhexlify from datetime import datetime, date -from pytz import UTC +from functools import partial +from hashlib import md5 +from json import dumps, load +from typing import (Optional, Dict, Any, IO, List, Callable, Tuple, Type, + TypeVar, Union) -from flask import Flask import boto3 import botocore +from backports.datetime_fromisoformat import MonkeyPatch from botocore.config import Config from botocore.exceptions import ClientError +from pytz import UTC + +from .. import integrity as I +from .. import record as R +from .. import domain as D +from ..manifest import (Manifest, ManifestDecoder, ManifestEntry, + ManifestEncoder) +from ..register import ICanonicalStorage, IStorableEntry +from ..serialize.decoder import CanonicalDecoder +from ..serialize.encoder import CanonicalEncoder +from .readable import BytesIOProxy + -from arxiv.base.globals import get_application_global, get_application_config -from arxiv.taxonomy import Category +MonkeyPatch.patch_fromisoformat() -from ..domain import Listing, EPrint, Identifier, Event, License, File, \ - Person, CanonicalRecord, MonthlyBlock +logger = logging.getLogger(__name__) +logger.setLevel(int(os.environ.get('LOGLEVEL', '40'))) + +_I = TypeVar('_I', I.IntegrityEntry, I.IntegrityMetadata, I.IntegrityListing, + covariant=True) + +Checksum = str class DoesNotExist(Exception): """The requested resource does not exist.""" -# TODO: implement me! -class CanonicalStore: +class CanonicalStore(ICanonicalStorage): """ Persists the canonical record in S3. 
- + The intended pattern for working with the canonical record is to use the - :class:`.domain.CanonicalRecord` as the primary entrypoint for all + :class:`.domain.CanonicalRecord` as the primary entrypoint for all operations. Consequently, this service offers only a single public instance method, :fund:`.load_record`. Persistence is achieved by attaching members to - :class:`.domain.CanonicalRecord`, :class`.domain.MonthlyBlock`, and + :class:`.domain.CanonicalRecord`, :class`.domain.Month`, and :class:`.domain.Listing` instances that implement reads/writes to S3. In this way, consumers of ``arxiv.canonical.domain`` can largely work directly with :class:`.domain.CanonicalRecord`, and persistence is handled @@ -72,7 +94,104 @@ def read_only(self) -> bool: """ return self._read_only - def _new_client(self, config: Optional[Config] = None) -> boto3.client: + def can_resolve(self, uri: D.URI) -> bool: + return isinstance(uri, D.Key) or uri.is_canonical + + def inititalize(self) -> None: + self.client.create_bucket(Bucket=self._bucket) + + def is_available(self, retries: int = 0, read_timeout: int = 5, + connect_timeout: int = 5) -> bool: + """Determine whether or not we can read from/write to the store.""" + raise NotImplementedError('Implement me!') + + def _load_key(self, key: str) -> bytes: + response = self.client.get_object(Bucket=self._bucket, Key=key) + body: IO[bytes] = response['Body'] + return body.read() + + def load(self, key: D.URI) -> IO[bytes]: + load: Callable[[], bytes] = partial(self._load_key, key) + return BytesIOProxy(load) + + def load_entry(self, key: D.URI) -> Tuple[R.RecordStream, Checksum]: + assert isinstance(key, D.Key) + logger.debug('Load entry at %s', key) + response = self.client.get_object(Bucket=self._bucket, Key=key) + stream = R.RecordStream( + domain=D.CanonicalFile( + modified=response['LastModified'], + filename=key.filename, + size_bytes=response['ContentLength'], + 
def list_subkeys(self, key: str) -> List[str]:
    """
    List the first path segment below ``key`` for every object under it.

    For each object whose key starts with ``key``, the part after the
    prefix is cut at the first ``/``. One element is produced per object,
    in the order returned by the service, so a segment shared by several
    objects appears several times.

    Parameters
    ----------
    key : str
        Prefix to list.

    Returns
    -------
    list
        Segments found; empty when nothing is stored under ``key``.

    """
    subkeys: List[str] = []
    params = {'Bucket': self._bucket, 'Prefix': key}
    while True:
        response = self.client.list_objects_v2(**params)
        # 'Contents' is absent from the response when nothing matches.
        for obj in response.get('Contents', []):
            # Every returned key starts with the prefix; slicing (unlike
            # str.split) also works when the prefix is the empty string.
            remainder = obj['Key'][len(key):]
            subkeys.append(remainder.split('/', 1)[0])
        if not response.get('IsTruncated'):
            break
        # A truncated listing must be continued, or keys are silently lost.
        params['ContinuationToken'] = response['NextContinuationToken']
    return subkeys
+ ri.record.stream = ri.record.stream._replace( + content=self.load(ri.record.key) + ) + + def store_manifest(self, key: str, manifest: Manifest) -> None: + body = dumps(manifest, cls=ManifestEncoder).encode('utf-8') + self.client.put_object(Bucket=self._bucket, + Key=key, + Body=body, + ContentLength=len(body), + ContentMD5=I.checksum.checksum_raw(body), + ContentType='application/json') + + def load_manifest(self, key: str) -> Manifest: + response = self.client.get_object(Bucket=self._bucket, Key=key) + manifest: Manifest = load(response['Body'], cls=ManifestDecoder) + return manifest + + def _handle_client_error(self, exc: ClientError) -> None: + if exc.response['Error']['Code'] == 'NoSuchBucket': + raise DoesNotExist(f'{self._bucket} does not exist') from exc + if exc.response['Error']['Code'] == "NoSuchKey": + raise DoesNotExist(f'No such object in {self._bucket}') from exc + raise RuntimeError('Unhandled ClientError') from exc + + def _new_client(self) -> boto3.client: # Only add credentials to the client if they are explicitly set. # If they are not set, boto3 falls back to environment variables and # credentials files. @@ -85,254 +204,51 @@ def _new_client(self, config: Optional[Config] = None) -> boto3.client: params['verify'] = self._verify return boto3.client('s3', **params) - def _handle_client_error(self, exc: ClientError) -> None: - if exc.response['Error']['Code'] == 'NoSuchBucket': - raise DoesNotExist(f'{self._bucket} does not exist') from exc - if exc.response['Error']['Code'] == "NoSuchKey": - raise DoesNotExist(f'No such object in {self._bucket}') from exc - raise RuntimeError('Unhandled ClientError') from exc - - def is_available(self, retries: int = 0, read_timeout: int = 5, - connect_timeout: int = 5) -> bool: - """Determine whether or not we can read from/write to the store.""" - raise NotImplementedError('Implement me!') - - def load_record(self) -> CanonicalRecord: - """ - Initialize and return the :class:`.CanonicalRecord`. 
class InMemoryStorage(ICanonicalStorage):
    """Storage that keeps record streams and manifests in dictionaries."""

    def __init__(self) -> None:
        """Start with no stored streams and no stored manifests."""
        self._streams: Dict[D.URI, Tuple[R.RecordStream, str]] = {}
        self._manifests: Dict[str, Manifest] = {}

    def can_resolve(self, uri: D.URI) -> bool:
        """Return True if an entry has been stored under ``uri``."""
        is_known = uri in self._streams
        return bool(is_known)

    def load(self, key: D.URI) -> IO[bytes]:
        """Return the content object of the stream stored under ``key``."""
        stored_stream = self._streams[key][0]
        return stored_stream.content

    def load_entry(self, key: D.URI) -> Tuple[R.RecordStream, str]:
        """Return the (stream, checksum) pair stored under ``key``."""
        assert isinstance(key, D.Key)
        return self._streams[key]

    def list_subkeys(self, key: str) -> List[str]:
        """
        List the first path segment below ``key`` for each stored key.

        Stored keys equal to ``key`` itself are not reported.
        """
        segments: List[str] = []
        for stored_key in self._streams.keys():
            if not stored_key.startswith(key) or stored_key == key:
                continue
            remainder = stored_key.split(key, 1)[1]
            segments.append(remainder.split('/', 1)[0])
        return segments

    def store_entry(self, ri: IStorableEntry) -> None:
        """
        Keep the entry's stream and checksum under the entry's key.

        Gzipped content is decompressed first: the stream's domain object is
        updated with the decompressed size, the content is replaced by an
        in-memory buffer, and the entry's checksum is refreshed.
        """
        assert ri.record.stream.content is not None
        if ri.record.stream.domain.is_gzipped:
            unzipper = gzip.GzipFile(fileobj=ri.record.stream.content)
            content = unzipper.read()
            ri.record.stream.domain.size_bytes = len(content)
            ri.record.stream.domain.is_gzipped = False
            buffered = io.BytesIO(content)
            ri.record.stream = ri.record.stream._replace(content=buffered)
            ri.update_checksum()
        stored = (ri.record.stream, ri.checksum)
        self._streams[ri.record.key] = stored
        # Point the domain object at the key it is now stored under.
        ri.record.stream.domain.ref = ri.record.key

    def store_manifest(self, key: str, manifest: Manifest) -> None:
        """Keep ``manifest`` under ``key``, replacing any previous one."""
        self._manifests[key] = manifest

    def load_manifest(self, key: str) -> Manifest:
        """Return the manifest stored under ``key``."""
        return self._manifests[key]
The ``read()`` - method should be a closure that, when called, retrieves the content of - the corresponding resource from storage. - """ - raise NotImplementedError('Implement me!') - - @classmethod - def init_app(cls, app: Flask) -> None: - """Set defaults for required configuration parameters.""" - app.config.setdefault('AWS_REGION', 'us-east-1') - app.config.setdefault('AWS_ACCESS_KEY_ID', None) - app.config.setdefault('AWS_SECRET_ACCESS_KEY', None) - app.config.setdefault('S3_ENDPOINT', None) - app.config.setdefault('S3_VERIFY', True) - app.config.setdefault('S3_BUCKET', 'arxiv-canonical') - - @classmethod - def get_session(cls) -> 'CanonicalStore': - """Create a new :class:`botocore.client.S3` session.""" - config = get_application_config() - return cls(config['S3_BUCKET'], - config['S3_VERIFY'], - config['AWS_REGION'], - config['S3_ENDPOINT'], - config['AWS_ACCESS_KEY_ID'], - config['AWS_SECRET_ACCESS_KEY']) - - @classmethod - def current_session(cls) -> 'CanonicalStore': - """Get the current store session for this application.""" - g = get_application_global() - if g is None: - return cls.get_session() - if 'store' not in g: - g.store = cls.get_session() - store: CanonicalStore = g.store - return store - - -class FakeCanonicalStore(CanonicalStore): - """ - A mock implementation of the canonical store. - - Methods to store things don't do anything, so don't expect data to stick - around. 
def _b64_to_hex(checksum: Checksum) -> str:
    """
    Convert a URL-safe base64-encoded digest to its hexadecimal form.

    Parameters
    ----------
    checksum : str
        URL-safe base64-encoded digest, including any ``=`` padding.

    Returns
    -------
    str
        Lower-case hexadecimal representation of the same bytes.

    """
    return hexlify(urlsafe_b64decode(checksum.encode('utf-8'))).decode('utf-8')


def _hex_to_b64(etag: str) -> Checksum:
    """
    Convert a hexdigest of an MD5 to a URL-safe base64-encoded digest.

    Parameters
    ----------
    etag : str
        Hexadecimal digest. Surrounding whitespace and double quotes are
        ignored, so a quoted ETag header value can be passed as-is.

    Returns
    -------
    str
        URL-safe base64 encoding of the digest bytes.

    Raises
    ------
    binascii.Error
        If the remaining text is not valid, even-length hexadecimal.

    """
    # Drop the optional quoting before decoding; plain hex is unaffected.
    digest = etag.strip().strip('"')
    return urlsafe_b64encode(unhexlify(digest)).decode('utf-8')
Author", - last_name="Author", - first_name="Ima N.", - affiliation=["FSU"] - ), - comments="4 figures, 2 turtles", - source_package=File( - filename=f"{identifier}.tar.gz", - mime_type="application/tar+gzip", - checksum="asdf1234==", - content=io.BytesIO(b'foocontent'), - created=datetime.now(UTC), - modified=datetime.now(UTC) - ), - pdf=File( - filename=f"{identifier}.pdf", - mime_type="application/pdf", - checksum="qwer9876==", - content=io.BytesIO(b'foopdf'), - created=datetime.now(UTC), - modified=datetime.now(UTC) - ) - ) - fake_eprints.__getitem__.return_value = fake_eprint - fake_block = mock.MagicMock(spec=MonthlyBlock, - eprints=fake_eprints) - fake_block.load_eprint.return_value = fake_eprint - fake_blocks = mock.MagicMock(spec=dict) - fake_blocks.__getitem__.return_value = fake_block - - fake_listings = mock.MagicMock(spec=dict) - fake_listings.__getitem__.return_value = Listing( - date=date.today(), - events=[ - Event(arxiv_id=Identifier('2004.00321'), - event_date=datetime.now(UTC), - event_type=Event.Type.NEW, - categories=[Category('cs.DL'), Category('cs.AI')], - version=1), - Event(arxiv_id=Identifier('2004.00322'), - event_date=datetime.now(UTC), - event_type=Event.Type.NEW, - categories=[Category('cs.DL'), Category('cs.AI')], - version=1), - Event(arxiv_id=Identifier('2003.00021'), - event_date=datetime.now(UTC), - event_type=Event.Type.CROSSLIST, - categories=[Category('cs.AR')], - version=1), - Event(arxiv_id=Identifier('2003.00001'), - event_date=datetime.now(UTC), - event_type=Event.Type.REPLACED, - categories=[Category('cs.AR')], - version=2) - ] - ) - return CanonicalRecord(blocks=fake_blocks, listings=fake_listings) diff --git a/arxiv/canonical/services/stream.py b/arxiv/canonical/services/stream.py new file mode 100644 index 0000000..372d9f0 --- /dev/null +++ b/arxiv/canonical/services/stream.py @@ -0,0 +1,78 @@ +""" +Implementation of event consumer and producer. + +TODO: write tests for these implementations. 
from json import dumps
from typing import Any, Callable, Dict, Optional

import boto3

from arxiv.canonical.domain import Event
from arxiv.canonical.core import IEventStream
from arxiv.integration.kinesis.consumer import BaseConsumer, process_stream


class EventConsumer(BaseConsumer):
    """Consumes announcement events, and updates the canonical record."""

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """
        Initialize the consumer and bind the event callback.

        All arguments are forwarded unchanged to the base consumer. The
        keyword argument ``on_event`` is required; a ``KeyError`` is raised
        if it is missing.
        """
        super().__init__(*args, **kwargs)
        self.on_event: Callable[[Event], None] = kwargs['on_event']

    def process_record(self, record: dict) -> None:
        """Build an :class:`Event` from ``record`` and pass it to the callback."""
        self.on_event(Event.from_dict(record))


class ListenerEventStream(IEventStream):
    """Event stream that listens for events using :class:`EventConsumer`."""

    Consumer = EventConsumer

    def __init__(self, config: Dict[str, Any],
                 on_event: Callable[[Event], None]) -> None:
        """
        Remember the consumer configuration and the event callback.

        Parameters
        ----------
        config : dict
            Consumer configuration. It is copied, so the caller's dict is
            left unmodified.
        on_event : callable
            Callback stored in the copied configuration under ``'on_event'``.

        """
        self._on_event = on_event
        # Copy before adding our own key; the caller keeps ownership of the
        # dict that was passed in.
        self._config = dict(config)
        self._config['on_event'] = self._on_event

    def listen(self, on_event: Callable[[Event], None]) -> None:
        """
        Process the stream with :attr:`Consumer`.

        The ``on_event`` passed here is handed to ``process_stream`` via
        ``extra``; the callback given to the constructor travels in the
        stored configuration.
        """
        process_stream(self.Consumer, self._config,
                       extra=dict(on_event=on_event))


# TODO: arxiv.base.integration.kinesis should have a BaseKinesis class with
# the bulk of the boto3 integration (rather than having it in BaseConsumer).
# But this will be OK for now.
class _Producer(BaseConsumer):
    """Puts records on the stream, reusing the base consumer's client setup."""

    # Every record uses the same partition key. The key must be non-empty:
    # the Kinesis PutRecord API requires a PartitionKey of at least one
    # character.
    _PARTITION_KEY = 'arxiv-canonical'

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """
        Create a client and make sure the stream exists.

        The optional keyword ``last_sequence_number`` is removed from
        ``kwargs`` before the remaining arguments are forwarded to the base
        class.
        """
        self._last_sequence_number: Optional[str] \
            = kwargs.pop('last_sequence_number', None)
        super().__init__(*args, **kwargs)
        self.client = self.new_client()
        self.get_or_create_stream()

    def emit(self, payload: bytes) -> None:
        """
        Put ``payload`` on the stream and remember its sequence number.

        Parameters
        ----------
        payload : bytes
            Raw record data.

        """
        params: Dict[str, Any] = {
            'StreamName': self.stream_name,
            'Data': payload,
            'PartitionKey': self._PARTITION_KEY,
        }
        # SequenceNumberForOrdering must be the SequenceNumber of the last
        # record that was produced on this partition, so it is only sent
        # once one is known.
        if self._last_sequence_number is not None:
            params['SequenceNumberForOrdering'] = self._last_sequence_number
        response = self.client.put_record(**params)
        self._last_sequence_number = response['SequenceNumber']


class ProducerEventStream(IEventStream):
    """Event stream that emits events through a :class:`_Producer`."""

    def __init__(self, config: Dict[str, Any]) -> None:
        """Create the producer, passing ``config`` as keyword arguments."""
        self._producer = _Producer(**config)

    def emit(self, event: Event) -> None:
        """Serialize ``event`` to UTF-8 encoded JSON and emit it."""
        self._producer.emit(dumps(event.to_dict()).encode('utf-8'))
the base path.""" + self.assertFalse(self.filesystem.can_resolve(URI('file:///foo/key'))) + + +class TestLoadDeferred(TestCase): + """Filesystem service can load file URIs.""" + + def setUp(self): + """Given a file...""" + self.base_path = tempfile.mkdtemp() + self.filesystem = Filesystem(self.base_path) + _, self.file_path = tempfile.mkstemp(dir=self.base_path) + with open(self.file_path, 'wb') as f: + f.write(b'some content') + + def test_load(self): + """Can load content from the file.""" + resource = self.filesystem.load(URI(self.file_path)) + self.assertEqual(resource.read(4), b'some') + + def test_load_outside_base_path(self): + """Cannot load a file outside of the base path""" + _, other_path = tempfile.mkstemp() + with self.assertRaises(RuntimeError): + self.filesystem.load(URI(other_path)) diff --git a/arxiv/canonical/services/tests/test_readable.py b/arxiv/canonical/services/tests/test_readable.py new file mode 100644 index 0000000..de0ff83 --- /dev/null +++ b/arxiv/canonical/services/tests/test_readable.py @@ -0,0 +1,86 @@ +import io +from unittest import TestCase, mock + +from ..readable import BytesIOProxy + + +class TestBytesIOProxy(TestCase): + def setUp(self): + """Create a new BytesIOProxy.""" + self.test_content = b'test content' + self.mock_read = mock.MagicMock() + self.mock_read.return_value = self.test_content + self.readable = BytesIOProxy(self.mock_read) + + def test_read(self): + """Read from a :class:`BytesIOProxy`.""" + self.assertEqual(self.mock_read.call_count, 0, + 'Passed callable not yet used') + self.assertEqual(self.readable.read(), self.test_content, + 'Content is read from passed callable') + self.assertEqual(self.mock_read.call_count, 1, + 'Passed callable has been used') + + def test_read_again(self): + """Read more than once from a :class:`BytesIOProxy`.""" + self.assertEqual(self.mock_read.call_count, 0, + 'Passed callable not yet used') + self.assertEqual(self.readable.read(), self.test_content, + 'Content is read from 
passed callable') + self.assertEqual(self.mock_read.call_count, 1, + 'Passed callable has been used') + self.readable.seek(0) + self.assertEqual(self.readable.read(), self.test_content, + 'The same content is read') + self.assertEqual(self.mock_read.call_count, 1, + 'Passed callable is not called a second time') + + def test_not_closed_before_loading_content(self): + """BytesIOProxy is not closed prior to loading content.""" + self.assertFalse(self.readable.closed, 'Readable is not closed') + + def test_not_closed_after_loading_content(self): + """BytesIOProxy is not closed after loading content.""" + self.assertEqual(self.readable.read(), self.test_content, + 'Content is read from passed callable') + self.assertFalse(self.readable.closed, 'Readable is not closed') + + def test_closed_after_explicit_close(self): + """BytesIOProxy is closed after being explicitly closed.""" + self.assertFalse(self.readable.closed, 'Readable is not closed') + self.readable.close() + self.assertTrue(self.readable.closed, 'Readable is closed') + + def test_closed_after_read_and_explicit_close(self): + """BytesIOProxy is closed after being explicitly closed.""" + self.assertEqual(self.readable.read(), self.test_content, + 'Content is read from passed callable') + self.assertFalse(self.readable.closed, 'Readable is not closed') + self.readable.close() + self.assertTrue(self.readable.closed, 'Readable is closed') + + def test_readable_before_loading_content(self): + """BytesIOProxy is readable prior to loading content.""" + self.assertTrue(self.readable.readable(), 'Readable is readable') + + def test_readable_after_loading_content(self): + """BytesIOProxy is readable after loading content.""" + self.assertEqual(self.readable.read(), self.test_content, + 'Content is read from passed callable') + self.assertTrue(self.readable.readable(), 'Readable is readable') + + def test_not_readable_after_explicit_close(self): + """BytesIOProxy is not readable after being explicitly closed.""" + 
self.assertTrue(self.readable.readable(), 'Readable is readable') + self.readable.close() + with self.assertRaises(ValueError): + self.readable.readable() + + def test_not_readable_after_read_and_explicit_close(self): + """BytesIOProxy is not readable after being explicitly closed.""" + self.assertEqual(self.readable.read(), self.test_content, + 'Content is read from passed callable') + self.assertTrue(self.readable.readable(), 'Readable is readable') + self.readable.close() + with self.assertRaises(ValueError): + self.readable.readable() \ No newline at end of file diff --git a/arxiv/canonical/services/tests/test_remote.py b/arxiv/canonical/services/tests/test_remote.py new file mode 100644 index 0000000..b73223d --- /dev/null +++ b/arxiv/canonical/services/tests/test_remote.py @@ -0,0 +1,104 @@ +"""Tests for :mod:`arxiv.canonical.services.remote`.""" + +import io +import os +import tempfile +from unittest import TestCase, mock + +from ...domain import URI +from .. import remote + + +class TestCanResolve(TestCase): + """Remote service can resolve HTTP URIs.""" + + def setUp(self): + """Given a remote service instance.""" + self.trusted_domain = 'arxiv.org' + self.remote = remote.RemoteSource(self.trusted_domain, 'https') + + def test_with_http_uri(self): + """CAN resolve HTTP URIs in the trusted domain.""" + self.assertTrue( + self.remote.can_resolve(URI('https://arxiv.org/stats/today')) + ) + + def test_with_http_uri_outside_trusted_domain(self): + """Cannot resolve HTTP URIs outside of the trusted domain.""" + self.assertFalse(self.remote.can_resolve(URI('https://asdf.com'))) + + def test_with_http_uri_with_nontrusted_scheme(self): + """Cannot resolve HTTP URIs with a non-trusted scheme.""" + self.assertFalse( + self.remote.can_resolve(URI('http://arxiv.org/stats/today')) + ) + + def test_with_canonical_uri(self): + """Cannot resolve canonical URIs.""" + self.assertFalse(self.remote.can_resolve(URI('arxiv:///foo/key'))) + + def test_with_file_uri(self): + 
"""Cannot resolve file URIs.""" + self.assertFalse(self.remote.can_resolve(URI('/foo/bar/baz'))) + + +class TestLoadDeferred(TestCase): + """Remote service can load HTTP URIs.""" + + @mock.patch(f'{remote.__name__}.requests.Session') + def setUp(self, mock_Session): + """Given a remote service instance.""" + self.mock_session = mock.MagicMock() + mock_Session.return_value = self.mock_session + + self.trusted_domain = 'arxiv.org' + self.remote = remote.RemoteSource(self.trusted_domain, 'https') + + def test_load(self): + """Can load content from the HTTP URI.""" + mock_response = mock.MagicMock(status_code=200) + mock_response.iter_content.return_value = \ + iter([b'foo', b'con' b'ten', b't']) + self.mock_session.get.return_value = mock_response + res = self.remote.load(URI('https://arxiv.org/stats/today')) + self.assertEqual(self.mock_session.get.call_count, 0, + 'No request is yet performed') + self.assertEqual(res.read(4), b'fooc') + self.assertEqual(self.mock_session.get.call_count, 1, + 'Until an attempt to read() is made') + + mock_response.iter_content.return_value = \ + iter([b'foo', b'con' b'ten', b't']) + res = self.remote.load(URI('https://arxiv.org/stats/today')) + self.assertEqual(res.read(), b'foocontent') + + def test_load_outside_base_path(self): + """Cannot load an HTTP URI outside trusted domain.""" + with self.assertRaises(RuntimeError): + self.remote.load(URI('https://asdf.com')) + + def test_load_without_training_wheels(self): + """This will issue a live call to arxiv.org.""" + r = remote.RemoteSource(self.trusted_domain, 'https') + reader = r.load(URI('https://arxiv.org/pdf/0801.1021v2.pdf')) + self.assertIsInstance(reader, io.BytesIO) + self.assertEqual(len(reader.read()), 237187) + reader.seek(0) + self.assertEqual(len(reader.read()), 237187) + + reader.seek(0) + self.assertEqual(len(reader.read(4096)), 4096) + + + def test_load_streaming_without_training_wheels(self): + """This will issue a live call to arxiv.org.""" + r = 
remote.RemoteSource(self.trusted_domain, 'https') + reader = r.load(URI('https://arxiv.org/pdf/0801.1021v2.pdf'), + stream=True) + self.assertIsInstance(reader, io.BytesIO) + self.assertEqual(len(reader.read()), 237187) + reader.seek(0) + self.assertEqual(len(reader.read()), 237187) + + reader.seek(0) + self.assertEqual(len(reader.read(4096)), 4096) diff --git a/arxiv/canonical/services/tests/test_repository.py b/arxiv/canonical/services/tests/test_repository.py new file mode 100644 index 0000000..964c0a9 --- /dev/null +++ b/arxiv/canonical/services/tests/test_repository.py @@ -0,0 +1,63 @@ +"""Tests for :mod:`arxiv.canonical.services.remote`.""" + +import io +import os +import tempfile +from unittest import TestCase, mock + +from ...domain import URI +from .. import repository + + +class TestCanResolve(TestCase): + """Remote repository service can resolve arXiv canonical URIs.""" + + def setUp(self): + """Given a remote service instance.""" + self.trusted_domain = 'arxiv.org' + self.remote = repository.RemoteRepository(self.trusted_domain, 'https') + + def test_with_http_uri(self): + """Cannot resolve HTTP URIs.""" + self.assertFalse( + self.remote.can_resolve(URI('https://arxiv.org/stats/today')) + ) + + def test_with_canonical_uri(self): + """Can resolve canonical URIs.""" + self.assertTrue(self.remote.can_resolve(URI('arxiv:///foo/key'))) + + def test_with_file_uri(self): + """Cannot resolve file URIs.""" + self.assertFalse(self.remote.can_resolve(URI('/foo/bar/baz'))) + + +class TestLoadDeferred(TestCase): + """Remote repository can load arXiv canonical URIs.""" + + @mock.patch(f'{repository.__name__}.requests.Session') + def setUp(self, mock_Session): + """Given a remote service instance.""" + self.mock_session = mock.MagicMock() + mock_Session.return_value = self.mock_session + + self.trusted_domain = 'arxiv.org' + self.remote = repository.RemoteRepository(self.trusted_domain, 'https') + + def test_load(self): + """Can load content from the HTTP URI.""" + 
mock_response = mock.MagicMock(status_code=200) + mock_response.iter_content.return_value = \ + iter([b'foo', b'con' b'ten', b't']) + self.mock_session.get.return_value = mock_response + res = self.remote.load(URI('arxiv:///foo/resource')) + self.assertEqual(self.mock_session.get.call_count, 0, + 'No request is yet performed') + self.assertEqual(res.read(4), b'fooc') + self.assertEqual(self.mock_session.get.call_count, 1, + 'Until an attempt to read() is made') + + mock_response.iter_content.return_value = \ + iter([b'foo', b'con' b'ten', b't']) + res = self.remote.load(URI('arxiv:///foo/resource')) + self.assertEqual(res.read(), b'foocontent') diff --git a/arxiv/canonical/services/tests/test_store.py b/arxiv/canonical/services/tests/test_store.py new file mode 100644 index 0000000..de3c2a7 --- /dev/null +++ b/arxiv/canonical/services/tests/test_store.py @@ -0,0 +1,80 @@ +# import json +# from unittest import TestCase, mock +# from moto import mock_s3 + +# from ...domain import Identifier, VersionedIdentifier +# from .. import store + +# class TestLoadEPrint(TestCase): + +# def setUp(self): +# self.eprint_json = json.dumps({ +# '@type': 'EPrint', +# 'abstract': 'Very abstract. Too short to be a real abstract.', +# 'acm_class': None, +# 'announced_date': '2019-07-11', +# 'arxiv_id': '2004.00111', +# 'authors': 'Ima N. 
Author (FSU)', +# 'comments': None, +# 'doi': None, +# 'history': [], +# 'is_withdrawn': False, +# 'journal_ref': None, +# 'legacy': False, +# 'license': 'http://notalicense', +# 'msc_class': None, +# 'pdf': {'@type': 'File', +# 'checksum': 'bNmNEmoWNzA6LEaKswzI6w==', +# 'created': '2019-07-11T15:43:03.031980+00:00', +# 'filename': '2004.00111.pdf', +# 'mime_type': 'application/pdf', +# 'modified': '2019-07-11T15:43:03.031982+00:00'}, +# 'previous_versions': [], +# 'primary_classification': 'cs.AR', +# 'proxy': None, +# 'reason_for_withdrawal': None, +# 'report_num': None, +# 'secondary_classification': ['cs.AI', 'cs.DL'], +# 'size_kilobytes': 1, +# 'source': {'@type': 'File', +# 'checksum': 'UkMjgWHli_2o5cX86fJFRg==', +# 'created': '2019-07-11T15:43:03.031967+00:00', +# 'filename': '2004.00111.tar.gz', +# 'mime_type': 'application/gzip', +# 'modified': '2019-07-11T15:43:03.031972+00:00'}, +# 'source_type': 'tex', +# 'submitted_date': '2019-07-11', +# 'submitter': None, +# 'title': 'The Title of Everything', +# 'version': 1 +# }).encode('utf-8') + +# self.manifest_json = json.dumps({ +# 'e-prints/2019/01/1901.00123/v1/1901.00123v1.json': +# '3Vk4TQYzizLHjcyNL62x2w==', +# 'e-prints/2019/01/1901.00123/v1/1901.00123v1.pdf': +# 'rL0Y20zC-Fzt72VPzMSk2A==', +# 'e-prints/2019/01/1901.00123/v1/1901.00123v1.tar': +# 'rL0Y20zC-Fzt72VPzMSk2A==', +# }).encode('utf-8') + +# def test_load_an_eprint(self): +# """Load an e-print, with lazily-loaded content.""" +# store_service = store.CanonicalStore('foobucket') +# fake_content = b'foo' + +# def load_fake_data(key): +# if key.endswith('manifest.json'): +# return self.manifest_json +# elif key.endswith('.json'): +# return self.eprint_json +# return fake_content + +# store_service._load_key = load_fake_data + +# eprint = store_service.load_eprint(VersionedIdentifier('1901.00123v1')) +# self.assertEqual(eprint.pdf.content.read(), fake_content, +# 'Lazily-loaded fake data is returned') +# self.assertEqual(eprint.source.content.read(), 
fake_content, +# 'Lazily-loaded fake data is returned') + diff --git a/arxiv/canonical/tests/data/orig/math-ph/papers/0702031v1.abs b/arxiv/canonical/tests/data/orig/math-ph/papers/0702031v1.abs deleted file mode 100644 index 5809d6d..0000000 --- a/arxiv/canonical/tests/data/orig/math-ph/papers/0702031v1.abs +++ /dev/null @@ -1,17 +0,0 @@ ------------------------------------------------------------------------------- -\\ -arXiv:math-ph/0702031 -From: Paulo Mendon\c{c}a -Date: Fri, 9 Feb 2007 20:39:05 GMT (8kb) - -Title: Probability Distribution of Curvatures of Isosurfaces in Gaussian Random - Fields -Authors: Paulo R. S. Mendonca, Rahul Bhotika and James V. Miller -Categories: math-ph.MP -Comments: Seven pages, six references -MSC-class: 60D05 -\\ - An expression for the joint probability distribution of the principal -curvatures at an arbitrary point in the ensemble of isosurfaces defined on -isotropic Gaussian random fields on Rn is derived. -\\ diff --git a/arxiv/canonical/tests/test_serialize_classic.py b/arxiv/canonical/tests/test_serialize_classic.py deleted file mode 100644 index 0853b17..0000000 --- a/arxiv/canonical/tests/test_serialize_classic.py +++ /dev/null @@ -1,43 +0,0 @@ -"""Tests for :mod:`.serialize.classic`.""" - -from unittest import TestCase -import os -import json -from pprint import pprint - -import jsonschema - -from .. 
"""Various helpers and utilities that don't belong anywhere else."""

from typing import Dict, Generic, TypeVar

KeyType = TypeVar('KeyType')
ValueType = TypeVar('ValueType')


class GenericMonoDict(Dict[KeyType, ValueType]):
    """A dict with specific key and value types."""

    def __getitem__(self, key: KeyType) -> ValueType:
        """
        Return the value stored under ``key``.

        This override exists to narrow the declared key and value types. It
        delegates to the built-in lookup, so instances behave like ordinary
        dicts at runtime.

        Raises
        ------
        KeyError
            If ``key`` is not present.

        """
        return super().__getitem__(key)
+help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/source/arxiv.canonical/arxiv.canonical.classic.abs.rst b/docs/source/arxiv.canonical/arxiv.canonical.classic.abs.rst new file mode 100644 index 0000000..a45afa3 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.classic.abs.rst @@ -0,0 +1,7 @@ +arxiv.canonical.classic.abs module +================================== + +.. automodule:: arxiv.canonical.classic.abs + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.classic.backfill.rst b/docs/source/arxiv.canonical/arxiv.canonical.classic.backfill.rst new file mode 100644 index 0000000..4e9a422 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.classic.backfill.rst @@ -0,0 +1,7 @@ +arxiv.canonical.classic.backfill module +======================================= + +.. automodule:: arxiv.canonical.classic.backfill + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.classic.cli.rst b/docs/source/arxiv.canonical/arxiv.canonical.classic.cli.rst new file mode 100644 index 0000000..becb72c --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.classic.cli.rst @@ -0,0 +1,7 @@ +arxiv.canonical.classic.cli module +================================== + +.. 
automodule:: arxiv.canonical.classic.cli + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.classic.content.rst b/docs/source/arxiv.canonical/arxiv.canonical.classic.content.rst new file mode 100644 index 0000000..6f2d3d1 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.classic.content.rst @@ -0,0 +1,7 @@ +arxiv.canonical.classic.content module +====================================== + +.. automodule:: arxiv.canonical.classic.content + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.classic.daily.rst b/docs/source/arxiv.canonical/arxiv.canonical.classic.daily.rst new file mode 100644 index 0000000..cf747b2 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.classic.daily.rst @@ -0,0 +1,7 @@ +arxiv.canonical.classic.daily module +==================================== + +.. automodule:: arxiv.canonical.classic.daily + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.classic.rst b/docs/source/arxiv.canonical/arxiv.canonical.classic.rst new file mode 100644 index 0000000..ee51a27 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.classic.rst @@ -0,0 +1,27 @@ +arxiv.canonical.classic package +=============================== + +.. automodule:: arxiv.canonical.classic + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + + arxiv.canonical.classic.tests + +Submodules +---------- + +.. 
toctree:: + + arxiv.canonical.classic.abs + arxiv.canonical.classic.backfill + arxiv.canonical.classic.cli + arxiv.canonical.classic.content + arxiv.canonical.classic.daily + arxiv.canonical.classic.util + diff --git a/docs/source/arxiv.canonical/arxiv.canonical.classic.tests.rst b/docs/source/arxiv.canonical/arxiv.canonical.classic.tests.rst new file mode 100644 index 0000000..092caec --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.classic.tests.rst @@ -0,0 +1,19 @@ +arxiv.canonical.classic.tests package +===================================== + +.. automodule:: arxiv.canonical.classic.tests + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + + arxiv.canonical.classic.tests.test_abs + arxiv.canonical.classic.tests.test_backfill + arxiv.canonical.classic.tests.test_content + arxiv.canonical.classic.tests.test_daily + arxiv.canonical.classic.tests.test_serialize_classic + diff --git a/docs/source/arxiv.canonical/arxiv.canonical.classic.tests.test_abs.rst b/docs/source/arxiv.canonical/arxiv.canonical.classic.tests.test_abs.rst new file mode 100644 index 0000000..9ccd8fa --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.classic.tests.test_abs.rst @@ -0,0 +1,7 @@ +arxiv.canonical.classic.tests.test\_abs module +============================================== + +.. automodule:: arxiv.canonical.classic.tests.test_abs + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.classic.tests.test_backfill.rst b/docs/source/arxiv.canonical/arxiv.canonical.classic.tests.test_backfill.rst new file mode 100644 index 0000000..7df7ed1 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.classic.tests.test_backfill.rst @@ -0,0 +1,7 @@ +arxiv.canonical.classic.tests.test\_backfill module +=================================================== + +.. 
automodule:: arxiv.canonical.classic.tests.test_backfill + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.classic.tests.test_content.rst b/docs/source/arxiv.canonical/arxiv.canonical.classic.tests.test_content.rst new file mode 100644 index 0000000..f882162 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.classic.tests.test_content.rst @@ -0,0 +1,7 @@ +arxiv.canonical.classic.tests.test\_content module +================================================== + +.. automodule:: arxiv.canonical.classic.tests.test_content + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.classic.tests.test_daily.rst b/docs/source/arxiv.canonical/arxiv.canonical.classic.tests.test_daily.rst new file mode 100644 index 0000000..db0acd5 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.classic.tests.test_daily.rst @@ -0,0 +1,7 @@ +arxiv.canonical.classic.tests.test\_daily module +================================================ + +.. automodule:: arxiv.canonical.classic.tests.test_daily + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.classic.tests.test_serialize_classic.rst b/docs/source/arxiv.canonical/arxiv.canonical.classic.tests.test_serialize_classic.rst new file mode 100644 index 0000000..afeabf0 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.classic.tests.test_serialize_classic.rst @@ -0,0 +1,7 @@ +arxiv.canonical.classic.tests.test\_serialize\_classic module +============================================================= + +.. 
automodule:: arxiv.canonical.classic.tests.test_serialize_classic + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.classic.util.rst b/docs/source/arxiv.canonical/arxiv.canonical.classic.util.rst new file mode 100644 index 0000000..dc61854 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.classic.util.rst @@ -0,0 +1,7 @@ +arxiv.canonical.classic.util module +=================================== + +.. automodule:: arxiv.canonical.classic.util + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.core.rst b/docs/source/arxiv.canonical/arxiv.canonical.core.rst new file mode 100644 index 0000000..3527fa2 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.core.rst @@ -0,0 +1,7 @@ +arxiv.canonical.core module +=========================== + +.. automodule:: arxiv.canonical.core + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.domain.base.rst b/docs/source/arxiv.canonical/arxiv.canonical.domain.base.rst new file mode 100644 index 0000000..2c85c1c --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.domain.base.rst @@ -0,0 +1,7 @@ +arxiv.canonical.domain.base module +================================== + +.. automodule:: arxiv.canonical.domain.base + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.domain.block.rst b/docs/source/arxiv.canonical/arxiv.canonical.domain.block.rst new file mode 100644 index 0000000..901feed --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.domain.block.rst @@ -0,0 +1,7 @@ +arxiv.canonical.domain.block module +=================================== + +.. 
automodule:: arxiv.canonical.domain.block + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.domain.content.rst b/docs/source/arxiv.canonical/arxiv.canonical.domain.content.rst new file mode 100644 index 0000000..0be7b7e --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.domain.content.rst @@ -0,0 +1,7 @@ +arxiv.canonical.domain.content module +===================================== + +.. automodule:: arxiv.canonical.domain.content + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.domain.eprint.rst b/docs/source/arxiv.canonical/arxiv.canonical.domain.eprint.rst new file mode 100644 index 0000000..3e5b35e --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.domain.eprint.rst @@ -0,0 +1,7 @@ +arxiv.canonical.domain.eprint module +==================================== + +.. automodule:: arxiv.canonical.domain.eprint + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.domain.file.rst b/docs/source/arxiv.canonical/arxiv.canonical.domain.file.rst new file mode 100644 index 0000000..6410774 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.domain.file.rst @@ -0,0 +1,7 @@ +arxiv.canonical.domain.file module +================================== + +.. automodule:: arxiv.canonical.domain.file + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.domain.identifier.rst b/docs/source/arxiv.canonical/arxiv.canonical.domain.identifier.rst new file mode 100644 index 0000000..3fc394d --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.domain.identifier.rst @@ -0,0 +1,7 @@ +arxiv.canonical.domain.identifier module +======================================== + +.. 
automodule:: arxiv.canonical.domain.identifier + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.domain.license.rst b/docs/source/arxiv.canonical/arxiv.canonical.domain.license.rst new file mode 100644 index 0000000..6038010 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.domain.license.rst @@ -0,0 +1,7 @@ +arxiv.canonical.domain.license module +===================================== + +.. automodule:: arxiv.canonical.domain.license + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.domain.listing.rst b/docs/source/arxiv.canonical/arxiv.canonical.domain.listing.rst new file mode 100644 index 0000000..62b88ef --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.domain.listing.rst @@ -0,0 +1,7 @@ +arxiv.canonical.domain.listing module +===================================== + +.. automodule:: arxiv.canonical.domain.listing + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.domain.person.rst b/docs/source/arxiv.canonical/arxiv.canonical.domain.person.rst new file mode 100644 index 0000000..759cdb3 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.domain.person.rst @@ -0,0 +1,7 @@ +arxiv.canonical.domain.person module +==================================== + +.. automodule:: arxiv.canonical.domain.person + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.domain.preservation.rst b/docs/source/arxiv.canonical/arxiv.canonical.domain.preservation.rst new file mode 100644 index 0000000..7373e51 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.domain.preservation.rst @@ -0,0 +1,7 @@ +arxiv.canonical.domain.preservation module +========================================== + +.. 
automodule:: arxiv.canonical.domain.preservation + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.domain.rst b/docs/source/arxiv.canonical/arxiv.canonical.domain.rst new file mode 100644 index 0000000..e86e6cf --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.domain.rst @@ -0,0 +1,33 @@ +arxiv.canonical.domain package +============================== + +.. automodule:: arxiv.canonical.domain + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + + arxiv.canonical.domain.tests + +Submodules +---------- + +.. toctree:: + + arxiv.canonical.domain.base + arxiv.canonical.domain.block + arxiv.canonical.domain.content + arxiv.canonical.domain.eprint + arxiv.canonical.domain.file + arxiv.canonical.domain.identifier + arxiv.canonical.domain.license + arxiv.canonical.domain.listing + arxiv.canonical.domain.person + arxiv.canonical.domain.preservation + arxiv.canonical.domain.util + arxiv.canonical.domain.version + diff --git a/docs/source/arxiv.canonical/arxiv.canonical.domain.tests.rst b/docs/source/arxiv.canonical/arxiv.canonical.domain.tests.rst new file mode 100644 index 0000000..b9ef574 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.domain.tests.rst @@ -0,0 +1,17 @@ +arxiv.canonical.domain.tests package +==================================== + +.. automodule:: arxiv.canonical.domain.tests + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. 
toctree:: + + arxiv.canonical.domain.tests.test_content + arxiv.canonical.domain.tests.test_file + arxiv.canonical.domain.tests.test_identifier + diff --git a/docs/source/arxiv.canonical/arxiv.canonical.domain.tests.test_content.rst b/docs/source/arxiv.canonical/arxiv.canonical.domain.tests.test_content.rst new file mode 100644 index 0000000..f174917 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.domain.tests.test_content.rst @@ -0,0 +1,7 @@ +arxiv.canonical.domain.tests.test\_content module +================================================= + +.. automodule:: arxiv.canonical.domain.tests.test_content + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.domain.tests.test_file.rst b/docs/source/arxiv.canonical/arxiv.canonical.domain.tests.test_file.rst new file mode 100644 index 0000000..7265ce8 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.domain.tests.test_file.rst @@ -0,0 +1,7 @@ +arxiv.canonical.domain.tests.test\_file module +============================================== + +.. automodule:: arxiv.canonical.domain.tests.test_file + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.domain.tests.test_identifier.rst b/docs/source/arxiv.canonical/arxiv.canonical.domain.tests.test_identifier.rst new file mode 100644 index 0000000..caeab84 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.domain.tests.test_identifier.rst @@ -0,0 +1,7 @@ +arxiv.canonical.domain.tests.test\_identifier module +==================================================== + +.. 
automodule:: arxiv.canonical.domain.tests.test_identifier + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.domain.util.rst b/docs/source/arxiv.canonical/arxiv.canonical.domain.util.rst new file mode 100644 index 0000000..07403d3 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.domain.util.rst @@ -0,0 +1,7 @@ +arxiv.canonical.domain.util module +================================== + +.. automodule:: arxiv.canonical.domain.util + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.domain.version.rst b/docs/source/arxiv.canonical/arxiv.canonical.domain.version.rst new file mode 100644 index 0000000..cf75d57 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.domain.version.rst @@ -0,0 +1,7 @@ +arxiv.canonical.domain.version module +===================================== + +.. automodule:: arxiv.canonical.domain.version + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.integrity.checksum.rst b/docs/source/arxiv.canonical/arxiv.canonical.integrity.checksum.rst new file mode 100644 index 0000000..1bff3be --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.integrity.checksum.rst @@ -0,0 +1,7 @@ +arxiv.canonical.integrity.checksum module +========================================= + +.. automodule:: arxiv.canonical.integrity.checksum + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.integrity.core.rst b/docs/source/arxiv.canonical/arxiv.canonical.integrity.core.rst new file mode 100644 index 0000000..851011c --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.integrity.core.rst @@ -0,0 +1,7 @@ +arxiv.canonical.integrity.core module +===================================== + +.. 
automodule:: arxiv.canonical.integrity.core + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.integrity.exceptions.rst b/docs/source/arxiv.canonical/arxiv.canonical.integrity.exceptions.rst new file mode 100644 index 0000000..37092cc --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.integrity.exceptions.rst @@ -0,0 +1,7 @@ +arxiv.canonical.integrity.exceptions module +=========================================== + +.. automodule:: arxiv.canonical.integrity.exceptions + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.integrity.listing.rst b/docs/source/arxiv.canonical/arxiv.canonical.integrity.listing.rst new file mode 100644 index 0000000..402db00 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.integrity.listing.rst @@ -0,0 +1,7 @@ +arxiv.canonical.integrity.listing module +======================================== + +.. automodule:: arxiv.canonical.integrity.listing + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.integrity.metadata.rst b/docs/source/arxiv.canonical/arxiv.canonical.integrity.metadata.rst new file mode 100644 index 0000000..bcafc24 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.integrity.metadata.rst @@ -0,0 +1,7 @@ +arxiv.canonical.integrity.metadata module +========================================= + +.. automodule:: arxiv.canonical.integrity.metadata + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.integrity.preservation.rst b/docs/source/arxiv.canonical/arxiv.canonical.integrity.preservation.rst new file mode 100644 index 0000000..94314a8 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.integrity.preservation.rst @@ -0,0 +1,7 @@ +arxiv.canonical.integrity.preservation module +============================================= + +.. 
automodule:: arxiv.canonical.integrity.preservation + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.integrity.rst b/docs/source/arxiv.canonical/arxiv.canonical.integrity.rst new file mode 100644 index 0000000..b0428f2 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.integrity.rst @@ -0,0 +1,28 @@ +arxiv.canonical.integrity package +================================= + +.. automodule:: arxiv.canonical.integrity + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + + arxiv.canonical.integrity.tests + +Submodules +---------- + +.. toctree:: + + arxiv.canonical.integrity.checksum + arxiv.canonical.integrity.core + arxiv.canonical.integrity.exceptions + arxiv.canonical.integrity.listing + arxiv.canonical.integrity.metadata + arxiv.canonical.integrity.preservation + arxiv.canonical.integrity.version + diff --git a/docs/source/arxiv.canonical/arxiv.canonical.integrity.tests.rst b/docs/source/arxiv.canonical/arxiv.canonical.integrity.tests.rst new file mode 100644 index 0000000..5d1f57e --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.integrity.tests.rst @@ -0,0 +1,15 @@ +arxiv.canonical.integrity.tests package +======================================= + +.. automodule:: arxiv.canonical.integrity.tests + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + + arxiv.canonical.integrity.tests.test_version + diff --git a/docs/source/arxiv.canonical/arxiv.canonical.integrity.tests.test_version.rst b/docs/source/arxiv.canonical/arxiv.canonical.integrity.tests.test_version.rst new file mode 100644 index 0000000..201a84b --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.integrity.tests.test_version.rst @@ -0,0 +1,7 @@ +arxiv.canonical.integrity.tests.test\_version module +==================================================== + +.. 
automodule:: arxiv.canonical.integrity.tests.test_version + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.integrity.version.rst b/docs/source/arxiv.canonical/arxiv.canonical.integrity.version.rst new file mode 100644 index 0000000..16f3a41 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.integrity.version.rst @@ -0,0 +1,7 @@ +arxiv.canonical.integrity.version module +======================================== + +.. automodule:: arxiv.canonical.integrity.version + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.log.log.rst b/docs/source/arxiv.canonical/arxiv.canonical.log.log.rst new file mode 100644 index 0000000..f9c7f2d --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.log.log.rst @@ -0,0 +1,7 @@ +arxiv.canonical.log.log module +============================== + +.. automodule:: arxiv.canonical.log.log + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.log.rst b/docs/source/arxiv.canonical/arxiv.canonical.log.rst new file mode 100644 index 0000000..2d59b0c --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.log.rst @@ -0,0 +1,16 @@ +arxiv.canonical.log package +=========================== + +.. automodule:: arxiv.canonical.log + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + + arxiv.canonical.log.log + arxiv.canonical.log.tests + diff --git a/docs/source/arxiv.canonical/arxiv.canonical.log.tests.rst b/docs/source/arxiv.canonical/arxiv.canonical.log.tests.rst new file mode 100644 index 0000000..4316d46 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.log.tests.rst @@ -0,0 +1,7 @@ +arxiv.canonical.log.tests module +================================ + +.. 
automodule:: arxiv.canonical.log.tests + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.manifest.rst b/docs/source/arxiv.canonical/arxiv.canonical.manifest.rst new file mode 100644 index 0000000..353f127 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.manifest.rst @@ -0,0 +1,7 @@ +arxiv.canonical.manifest module +=============================== + +.. automodule:: arxiv.canonical.manifest + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.preservation.rst b/docs/source/arxiv.canonical/arxiv.canonical.preservation.rst new file mode 100644 index 0000000..c2eb117 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.preservation.rst @@ -0,0 +1,7 @@ +arxiv.canonical.preservation module +=================================== + +.. automodule:: arxiv.canonical.preservation + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.record.core.rst b/docs/source/arxiv.canonical/arxiv.canonical.record.core.rst new file mode 100644 index 0000000..163ab73 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.record.core.rst @@ -0,0 +1,7 @@ +arxiv.canonical.record.core module +================================== + +.. automodule:: arxiv.canonical.record.core + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.record.file.rst b/docs/source/arxiv.canonical/arxiv.canonical.record.file.rst new file mode 100644 index 0000000..466c730 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.record.file.rst @@ -0,0 +1,7 @@ +arxiv.canonical.record.file module +================================== + +.. 
automodule:: arxiv.canonical.record.file + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.record.listing.rst b/docs/source/arxiv.canonical/arxiv.canonical.record.listing.rst new file mode 100644 index 0000000..7e7c0ec --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.record.listing.rst @@ -0,0 +1,7 @@ +arxiv.canonical.record.listing module +===================================== + +.. automodule:: arxiv.canonical.record.listing + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.record.metadata.rst b/docs/source/arxiv.canonical/arxiv.canonical.record.metadata.rst new file mode 100644 index 0000000..0307ab0 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.record.metadata.rst @@ -0,0 +1,7 @@ +arxiv.canonical.record.metadata module +====================================== + +.. automodule:: arxiv.canonical.record.metadata + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.record.preservation.rst b/docs/source/arxiv.canonical/arxiv.canonical.record.preservation.rst new file mode 100644 index 0000000..8d96c1f --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.record.preservation.rst @@ -0,0 +1,7 @@ +arxiv.canonical.record.preservation module +========================================== + +.. automodule:: arxiv.canonical.record.preservation + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.record.rst b/docs/source/arxiv.canonical/arxiv.canonical.record.rst new file mode 100644 index 0000000..dec362d --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.record.rst @@ -0,0 +1,27 @@ +arxiv.canonical.record package +============================== + +.. automodule:: arxiv.canonical.record + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. 
toctree:: + + arxiv.canonical.record.tests + +Submodules +---------- + +.. toctree:: + + arxiv.canonical.record.core + arxiv.canonical.record.file + arxiv.canonical.record.listing + arxiv.canonical.record.metadata + arxiv.canonical.record.preservation + arxiv.canonical.record.version + diff --git a/docs/source/arxiv.canonical/arxiv.canonical.record.tests.rst b/docs/source/arxiv.canonical/arxiv.canonical.record.tests.rst new file mode 100644 index 0000000..fd35c12 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.record.tests.rst @@ -0,0 +1,16 @@ +arxiv.canonical.record.tests package +==================================== + +.. automodule:: arxiv.canonical.record.tests + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + + arxiv.canonical.record.tests.test_listing + arxiv.canonical.record.tests.test_version + diff --git a/docs/source/arxiv.canonical/arxiv.canonical.record.tests.test_listing.rst b/docs/source/arxiv.canonical/arxiv.canonical.record.tests.test_listing.rst new file mode 100644 index 0000000..38113f0 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.record.tests.test_listing.rst @@ -0,0 +1,7 @@ +arxiv.canonical.record.tests.test\_listing module +================================================= + +.. automodule:: arxiv.canonical.record.tests.test_listing + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.record.tests.test_version.rst b/docs/source/arxiv.canonical/arxiv.canonical.record.tests.test_version.rst new file mode 100644 index 0000000..d95dd7f --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.record.tests.test_version.rst @@ -0,0 +1,7 @@ +arxiv.canonical.record.tests.test\_version module +================================================= + +.. 
automodule:: arxiv.canonical.record.tests.test_version + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.record.version.rst b/docs/source/arxiv.canonical/arxiv.canonical.record.version.rst new file mode 100644 index 0000000..0198887 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.record.version.rst @@ -0,0 +1,7 @@ +arxiv.canonical.record.version module +===================================== + +.. automodule:: arxiv.canonical.record.version + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.register.api.rst b/docs/source/arxiv.canonical/arxiv.canonical.register.api.rst new file mode 100644 index 0000000..8e21b05 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.register.api.rst @@ -0,0 +1,7 @@ +arxiv.canonical.register.api module +=================================== + +.. automodule:: arxiv.canonical.register.api + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.register.core.rst b/docs/source/arxiv.canonical/arxiv.canonical.register.core.rst new file mode 100644 index 0000000..9e8d5f1 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.register.core.rst @@ -0,0 +1,7 @@ +arxiv.canonical.register.core module +==================================== + +.. automodule:: arxiv.canonical.register.core + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.register.eprint.rst b/docs/source/arxiv.canonical/arxiv.canonical.register.eprint.rst new file mode 100644 index 0000000..87f6d3d --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.register.eprint.rst @@ -0,0 +1,7 @@ +arxiv.canonical.register.eprint module +====================================== + +.. 
automodule:: arxiv.canonical.register.eprint + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.register.exceptions.rst b/docs/source/arxiv.canonical/arxiv.canonical.register.exceptions.rst new file mode 100644 index 0000000..dd2b08b --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.register.exceptions.rst @@ -0,0 +1,7 @@ +arxiv.canonical.register.exceptions module +========================================== + +.. automodule:: arxiv.canonical.register.exceptions + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.register.file.rst b/docs/source/arxiv.canonical/arxiv.canonical.register.file.rst new file mode 100644 index 0000000..6effee8 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.register.file.rst @@ -0,0 +1,7 @@ +arxiv.canonical.register.file module +==================================== + +.. automodule:: arxiv.canonical.register.file + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.register.listing.rst b/docs/source/arxiv.canonical/arxiv.canonical.register.listing.rst new file mode 100644 index 0000000..3e0fde6 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.register.listing.rst @@ -0,0 +1,7 @@ +arxiv.canonical.register.listing module +======================================= + +.. automodule:: arxiv.canonical.register.listing + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.register.metadata.rst b/docs/source/arxiv.canonical/arxiv.canonical.register.metadata.rst new file mode 100644 index 0000000..f67cfcb --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.register.metadata.rst @@ -0,0 +1,7 @@ +arxiv.canonical.register.metadata module +======================================== + +.. 
automodule:: arxiv.canonical.register.metadata + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.register.methods.rst b/docs/source/arxiv.canonical/arxiv.canonical.register.methods.rst new file mode 100644 index 0000000..7ba8939 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.register.methods.rst @@ -0,0 +1,7 @@ +arxiv.canonical.register.methods module +======================================= + +.. automodule:: arxiv.canonical.register.methods + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.register.rst b/docs/source/arxiv.canonical/arxiv.canonical.register.rst new file mode 100644 index 0000000..1ec5e0a --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.register.rst @@ -0,0 +1,31 @@ +arxiv.canonical.register package +================================ + +.. automodule:: arxiv.canonical.register + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + + arxiv.canonical.register.tests + +Submodules +---------- + +.. toctree:: + + arxiv.canonical.register.api + arxiv.canonical.register.core + arxiv.canonical.register.eprint + arxiv.canonical.register.exceptions + arxiv.canonical.register.file + arxiv.canonical.register.listing + arxiv.canonical.register.metadata + arxiv.canonical.register.methods + arxiv.canonical.register.util + arxiv.canonical.register.version + diff --git a/docs/source/arxiv.canonical/arxiv.canonical.register.tests.rst b/docs/source/arxiv.canonical/arxiv.canonical.register.tests.rst new file mode 100644 index 0000000..3498c9f --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.register.tests.rst @@ -0,0 +1,15 @@ +arxiv.canonical.register.tests package +====================================== + +.. automodule:: arxiv.canonical.register.tests + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. 
toctree:: + + arxiv.canonical.register.tests.test_api + diff --git a/docs/source/arxiv.canonical/arxiv.canonical.register.tests.test_api.rst b/docs/source/arxiv.canonical/arxiv.canonical.register.tests.test_api.rst new file mode 100644 index 0000000..96ac9f8 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.register.tests.test_api.rst @@ -0,0 +1,7 @@ +arxiv.canonical.register.tests.test\_api module +=============================================== + +.. automodule:: arxiv.canonical.register.tests.test_api + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.register.util.rst b/docs/source/arxiv.canonical/arxiv.canonical.register.util.rst new file mode 100644 index 0000000..95ef6b5 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.register.util.rst @@ -0,0 +1,7 @@ +arxiv.canonical.register.util module +==================================== + +.. automodule:: arxiv.canonical.register.util + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.register.version.rst b/docs/source/arxiv.canonical/arxiv.canonical.register.version.rst new file mode 100644 index 0000000..e0c6f62 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.register.version.rst @@ -0,0 +1,7 @@ +arxiv.canonical.register.version module +======================================= + +.. automodule:: arxiv.canonical.register.version + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.role.proxy.rst b/docs/source/arxiv.canonical/arxiv.canonical.role.proxy.rst new file mode 100644 index 0000000..729a2d0 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.role.proxy.rst @@ -0,0 +1,7 @@ +arxiv.canonical.role.proxy module +================================= + +.. 
automodule:: arxiv.canonical.role.proxy + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.role.register.rst b/docs/source/arxiv.canonical/arxiv.canonical.role.register.rst new file mode 100644 index 0000000..630dacf --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.role.register.rst @@ -0,0 +1,7 @@ +arxiv.canonical.role.register module +==================================== + +.. automodule:: arxiv.canonical.role.register + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.role.role.rst b/docs/source/arxiv.canonical/arxiv.canonical.role.role.rst new file mode 100644 index 0000000..f9f54ab --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.role.role.rst @@ -0,0 +1,7 @@ +arxiv.canonical.role.role module +================================ + +.. automodule:: arxiv.canonical.role.role + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.role.rst b/docs/source/arxiv.canonical/arxiv.canonical.role.rst new file mode 100644 index 0000000..273d377 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.role.rst @@ -0,0 +1,25 @@ +arxiv.canonical.role package +============================ + +.. automodule:: arxiv.canonical.role + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + + arxiv.canonical.role.tests + +Submodules +---------- + +.. toctree:: + + arxiv.canonical.role.proxy + arxiv.canonical.role.register + arxiv.canonical.role.role + arxiv.canonical.role.stream + diff --git a/docs/source/arxiv.canonical/arxiv.canonical.role.stream.rst b/docs/source/arxiv.canonical/arxiv.canonical.role.stream.rst new file mode 100644 index 0000000..746ece5 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.role.stream.rst @@ -0,0 +1,7 @@ +arxiv.canonical.role.stream module +================================== + +.. 
automodule:: arxiv.canonical.role.stream + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.role.tests.rst b/docs/source/arxiv.canonical/arxiv.canonical.role.tests.rst new file mode 100644 index 0000000..70b4602 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.role.tests.rst @@ -0,0 +1,15 @@ +arxiv.canonical.role.tests package +================================== + +.. automodule:: arxiv.canonical.role.tests + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + + arxiv.canonical.role.tests.test_register + diff --git a/docs/source/arxiv.canonical/arxiv.canonical.role.tests.test_register.rst b/docs/source/arxiv.canonical/arxiv.canonical.role.tests.test_register.rst new file mode 100644 index 0000000..247d43c --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.role.tests.test_register.rst @@ -0,0 +1,7 @@ +arxiv.canonical.role.tests.test\_register module +================================================ + +.. automodule:: arxiv.canonical.role.tests.test_register + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.rst b/docs/source/arxiv.canonical/arxiv.canonical.rst new file mode 100644 index 0000000..ce2499f --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.rst @@ -0,0 +1,34 @@ +arxiv.canonical package +======================= + +.. automodule:: arxiv.canonical + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + + arxiv.canonical.classic + arxiv.canonical.domain + arxiv.canonical.integrity + arxiv.canonical.log + arxiv.canonical.record + arxiv.canonical.register + arxiv.canonical.role + arxiv.canonical.serialize + arxiv.canonical.services + arxiv.canonical.tests + +Submodules +---------- + +.. 
toctree:: + + arxiv.canonical.core + arxiv.canonical.manifest + arxiv.canonical.preservation + arxiv.canonical.util + diff --git a/docs/source/arxiv.canonical/arxiv.canonical.serialize.decoder.rst b/docs/source/arxiv.canonical/arxiv.canonical.serialize.decoder.rst new file mode 100644 index 0000000..58dcf2a --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.serialize.decoder.rst @@ -0,0 +1,7 @@ +arxiv.canonical.serialize.decoder module +======================================== + +.. automodule:: arxiv.canonical.serialize.decoder + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.serialize.encoder.rst b/docs/source/arxiv.canonical/arxiv.canonical.serialize.encoder.rst new file mode 100644 index 0000000..0f667e6 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.serialize.encoder.rst @@ -0,0 +1,7 @@ +arxiv.canonical.serialize.encoder module +======================================== + +.. automodule:: arxiv.canonical.serialize.encoder + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.serialize.record.rst b/docs/source/arxiv.canonical/arxiv.canonical.serialize.record.rst new file mode 100644 index 0000000..8ea0377 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.serialize.record.rst @@ -0,0 +1,7 @@ +arxiv.canonical.serialize.record module +======================================= + +.. automodule:: arxiv.canonical.serialize.record + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.serialize.rst b/docs/source/arxiv.canonical/arxiv.canonical.serialize.rst new file mode 100644 index 0000000..e694549 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.serialize.rst @@ -0,0 +1,24 @@ +arxiv.canonical.serialize package +================================= + +.. 
automodule:: arxiv.canonical.serialize + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + + arxiv.canonical.serialize.tests + +Submodules +---------- + +.. toctree:: + + arxiv.canonical.serialize.decoder + arxiv.canonical.serialize.encoder + arxiv.canonical.serialize.record + diff --git a/docs/source/arxiv.canonical/arxiv.canonical.serialize.tests.rst b/docs/source/arxiv.canonical/arxiv.canonical.serialize.tests.rst new file mode 100644 index 0000000..0817837 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.serialize.tests.rst @@ -0,0 +1,15 @@ +arxiv.canonical.serialize.tests package +======================================= + +.. automodule:: arxiv.canonical.serialize.tests + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + + arxiv.canonical.serialize.tests.test_serialize + diff --git a/docs/source/arxiv.canonical/arxiv.canonical.serialize.tests.test_serialize.rst b/docs/source/arxiv.canonical/arxiv.canonical.serialize.tests.test_serialize.rst new file mode 100644 index 0000000..44de65f --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.serialize.tests.test_serialize.rst @@ -0,0 +1,7 @@ +arxiv.canonical.serialize.tests.test\_serialize module +====================================================== + +.. automodule:: arxiv.canonical.serialize.tests.test_serialize + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.services.cache.rst b/docs/source/arxiv.canonical/arxiv.canonical.services.cache.rst new file mode 100644 index 0000000..b84d4de --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.services.cache.rst @@ -0,0 +1,7 @@ +arxiv.canonical.services.cache module +===================================== + +.. 
automodule:: arxiv.canonical.services.cache + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.services.filesystem.rst b/docs/source/arxiv.canonical/arxiv.canonical.services.filesystem.rst new file mode 100644 index 0000000..876b8f9 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.services.filesystem.rst @@ -0,0 +1,7 @@ +arxiv.canonical.services.filesystem module +========================================== + +.. automodule:: arxiv.canonical.services.filesystem + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.services.readable.rst b/docs/source/arxiv.canonical/arxiv.canonical.services.readable.rst new file mode 100644 index 0000000..ee530d1 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.services.readable.rst @@ -0,0 +1,7 @@ +arxiv.canonical.services.readable module +======================================== + +.. automodule:: arxiv.canonical.services.readable + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.services.remote.rst b/docs/source/arxiv.canonical/arxiv.canonical.services.remote.rst new file mode 100644 index 0000000..27c445a --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.services.remote.rst @@ -0,0 +1,7 @@ +arxiv.canonical.services.remote module +====================================== + +.. automodule:: arxiv.canonical.services.remote + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.services.repository.rst b/docs/source/arxiv.canonical/arxiv.canonical.services.repository.rst new file mode 100644 index 0000000..c6a9c45 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.services.repository.rst @@ -0,0 +1,7 @@ +arxiv.canonical.services.repository module +========================================== + +.. 
automodule:: arxiv.canonical.services.repository + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.services.rst b/docs/source/arxiv.canonical/arxiv.canonical.services.rst new file mode 100644 index 0000000..f41f262 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.services.rst @@ -0,0 +1,28 @@ +arxiv.canonical.services package +================================ + +.. automodule:: arxiv.canonical.services + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + + arxiv.canonical.services.tests + +Submodules +---------- + +.. toctree:: + + arxiv.canonical.services.cache + arxiv.canonical.services.filesystem + arxiv.canonical.services.readable + arxiv.canonical.services.remote + arxiv.canonical.services.repository + arxiv.canonical.services.store + arxiv.canonical.services.stream + diff --git a/docs/source/arxiv.canonical/arxiv.canonical.services.store.rst b/docs/source/arxiv.canonical/arxiv.canonical.services.store.rst new file mode 100644 index 0000000..895c4a4 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.services.store.rst @@ -0,0 +1,7 @@ +arxiv.canonical.services.store module +===================================== + +.. automodule:: arxiv.canonical.services.store + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.services.stream.rst b/docs/source/arxiv.canonical/arxiv.canonical.services.stream.rst new file mode 100644 index 0000000..802ebb2 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.services.stream.rst @@ -0,0 +1,7 @@ +arxiv.canonical.services.stream module +====================================== + +.. 
automodule:: arxiv.canonical.services.stream + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.services.tests.rst b/docs/source/arxiv.canonical/arxiv.canonical.services.tests.rst new file mode 100644 index 0000000..d359b62 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.services.tests.rst @@ -0,0 +1,19 @@ +arxiv.canonical.services.tests package +====================================== + +.. automodule:: arxiv.canonical.services.tests + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + + arxiv.canonical.services.tests.test_filesystem + arxiv.canonical.services.tests.test_readable + arxiv.canonical.services.tests.test_remote + arxiv.canonical.services.tests.test_repository + arxiv.canonical.services.tests.test_store + diff --git a/docs/source/arxiv.canonical/arxiv.canonical.services.tests.test_filesystem.rst b/docs/source/arxiv.canonical/arxiv.canonical.services.tests.test_filesystem.rst new file mode 100644 index 0000000..791700a --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.services.tests.test_filesystem.rst @@ -0,0 +1,7 @@ +arxiv.canonical.services.tests.test\_filesystem module +====================================================== + +.. automodule:: arxiv.canonical.services.tests.test_filesystem + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.services.tests.test_readable.rst b/docs/source/arxiv.canonical/arxiv.canonical.services.tests.test_readable.rst new file mode 100644 index 0000000..8eb743a --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.services.tests.test_readable.rst @@ -0,0 +1,7 @@ +arxiv.canonical.services.tests.test\_readable module +==================================================== + +.. 
automodule:: arxiv.canonical.services.tests.test_readable + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.services.tests.test_remote.rst b/docs/source/arxiv.canonical/arxiv.canonical.services.tests.test_remote.rst new file mode 100644 index 0000000..d969869 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.services.tests.test_remote.rst @@ -0,0 +1,7 @@ +arxiv.canonical.services.tests.test\_remote module +================================================== + +.. automodule:: arxiv.canonical.services.tests.test_remote + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.services.tests.test_repository.rst b/docs/source/arxiv.canonical/arxiv.canonical.services.tests.test_repository.rst new file mode 100644 index 0000000..597159b --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.services.tests.test_repository.rst @@ -0,0 +1,7 @@ +arxiv.canonical.services.tests.test\_repository module +====================================================== + +.. automodule:: arxiv.canonical.services.tests.test_repository + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.services.tests.test_store.rst b/docs/source/arxiv.canonical/arxiv.canonical.services.tests.test_store.rst new file mode 100644 index 0000000..5421251 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.services.tests.test_store.rst @@ -0,0 +1,7 @@ +arxiv.canonical.services.tests.test\_store module +================================================= + +.. 
automodule:: arxiv.canonical.services.tests.test_store + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/arxiv.canonical.tests.rst b/docs/source/arxiv.canonical/arxiv.canonical.tests.rst new file mode 100644 index 0000000..9a3eb3d --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.tests.rst @@ -0,0 +1,8 @@ +arxiv.canonical.tests package +============================= + +.. automodule:: arxiv.canonical.tests + :members: + :undoc-members: + :show-inheritance: + diff --git a/docs/source/arxiv.canonical/arxiv.canonical.util.rst b/docs/source/arxiv.canonical/arxiv.canonical.util.rst new file mode 100644 index 0000000..dc86996 --- /dev/null +++ b/docs/source/arxiv.canonical/arxiv.canonical.util.rst @@ -0,0 +1,7 @@ +arxiv.canonical.util module +=========================== + +.. automodule:: arxiv.canonical.util + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arxiv.canonical/modules.rst b/docs/source/arxiv.canonical/modules.rst new file mode 100644 index 0000000..64ebe1a --- /dev/null +++ b/docs/source/arxiv.canonical/modules.rst @@ -0,0 +1,7 @@ +arxiv +===== + +.. toctree:: + :maxdepth: 4 + + arxiv.canonical diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..a50a8c3 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,200 @@ +"""Configuration for arxiv-canonical documentation build.""" +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# arXiv Canonical Record documentation build configuration file, created by +# sphinx-quickstart on Thu Nov 30 09:39:11 2017. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. 
+ +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import sys +sys.path.insert(0, os.path.abspath('.')) +sys.path.append(os.path.abspath('../..')) +sys.path.append(os.path.abspath('../../repository')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx_autodoc_typehints', + 'sphinx.ext.autosummary', + 'sphinx.ext.napoleon', + 'sphinx.ext.intersphinx', + 'sphinx.ext.graphviz', + 'sphinx.ext.todo', + 'sphinx.ext.coverage', + 'sphinx.ext.mathjax', + 'sphinx.ext.ifconfig', + 'sphinx.ext.viewcode', + 'sphinx.ext.githubpages' +] + + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = 'arXiv Canonical Record' +copyright = '2018, arXiv.org' +author = 'arXiv Developers' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '0.1' +# The full version, including alpha/beta/rc tags. +release = '0.1' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. 
+# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = [] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = True + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# This is required for the alabaster theme +# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars +html_sidebars = { + '**': [ + 'about.html', + 'navigation.html', + 'relations.html', # needs 'show_related': True theme option to display + 'searchbox.html', + ] +} + + +# -- Options for HTMLHelp output ------------------------------------------ + +# Output file base name for HTML help builder. +htmlhelp_basename = 'arXiv-Canonical-Docs' + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). 
+ # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'arXivCanonical.tex', + 'arXiv Canonical Record Documentation', + 'arXiv Developers', 'manual'), +] + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'arXivCanonical', + 'arXiv Canonical Record Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'arXivCanonical', + 'arXiv Canonical Record Documentation', + author, 'arXivCanonical', 'One line description of project.', + 'Miscellaneous'), +] + + +intersphinx_mapping = { + 'python': ('https://docs.python.org/3.6', None), + 'arxitecture': ('https://arxiv.github.io/arxiv-arxitecture/', None), + 'arxiv.taxonomy': ('https://arxiv.github.io/arxiv-base', None), + 'arxiv.base': ('https://arxiv.github.io/arxiv-base', None), + 'arxiv.forms': ('https://arxiv.github.io/arxiv-base', None), + 'browse': ('https://arxiv.github.io/arxiv-browse/', None), + 'search': ('https://arxiv.github.io/arxiv-search/', None), + 'zero': ('https://arxiv.github.io/arxiv-zero/', None), + 'flask': ('http://flask.pocoo.org/docs/1.0/', None), +} \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..4cde00a 
--- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,6 @@ +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + arxiv.canonical/arxiv.canonical.rst + repository/repository.rst \ No newline at end of file diff --git a/docs/source/repository/modules.rst b/docs/source/repository/modules.rst new file mode 100644 index 0000000..8bf7695 --- /dev/null +++ b/docs/source/repository/modules.rst @@ -0,0 +1,8 @@ +repository +========== + +.. toctree:: + :maxdepth: 4 + + repository + tests diff --git a/docs/source/repository/repository.config.rst b/docs/source/repository/repository.config.rst new file mode 100644 index 0000000..8ff4dec --- /dev/null +++ b/docs/source/repository/repository.config.rst @@ -0,0 +1,7 @@ +repository.config module +======================== + +.. automodule:: repository.config + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/repository/repository.controllers.rst b/docs/source/repository/repository.controllers.rst new file mode 100644 index 0000000..bbb3596 --- /dev/null +++ b/docs/source/repository/repository.controllers.rst @@ -0,0 +1,7 @@ +repository.controllers module +============================= + +.. automodule:: repository.controllers + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/repository/repository.factory.rst b/docs/source/repository/repository.factory.rst new file mode 100644 index 0000000..1c66088 --- /dev/null +++ b/docs/source/repository/repository.factory.rst @@ -0,0 +1,7 @@ +repository.factory module +========================= + +.. automodule:: repository.factory + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/repository/repository.routes.api.rst b/docs/source/repository/repository.routes.api.rst new file mode 100644 index 0000000..48882a6 --- /dev/null +++ b/docs/source/repository/repository.routes.api.rst @@ -0,0 +1,7 @@ +repository.routes.api module +============================ + +.. 
automodule:: repository.routes.api + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/repository/repository.routes.rst b/docs/source/repository/repository.routes.rst new file mode 100644 index 0000000..532c5e1 --- /dev/null +++ b/docs/source/repository/repository.routes.rst @@ -0,0 +1,15 @@ +repository.routes package +========================= + +.. automodule:: repository.routes + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + + repository.routes.api + diff --git a/docs/source/repository/repository.rst b/docs/source/repository/repository.rst new file mode 100644 index 0000000..e85328d --- /dev/null +++ b/docs/source/repository/repository.rst @@ -0,0 +1,26 @@ +repository package +================== + +.. automodule:: repository + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + + repository.routes + repository.services + repository.tests + +Submodules +---------- + +.. toctree:: + + repository.config + repository.controllers + repository.factory + diff --git a/docs/source/repository/repository.services.record.rst b/docs/source/repository/repository.services.record.rst new file mode 100644 index 0000000..27cbc3f --- /dev/null +++ b/docs/source/repository/repository.services.record.rst @@ -0,0 +1,7 @@ +repository.services.record module +================================= + +.. automodule:: repository.services.record + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/repository/repository.services.rst b/docs/source/repository/repository.services.rst new file mode 100644 index 0000000..5defcb6 --- /dev/null +++ b/docs/source/repository/repository.services.rst @@ -0,0 +1,15 @@ +repository.services package +=========================== + +.. automodule:: repository.services + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. 
toctree:: + + repository.services.record + diff --git a/docs/source/repository/repository.tests.rst b/docs/source/repository/repository.tests.rst new file mode 100644 index 0000000..7b4c158 --- /dev/null +++ b/docs/source/repository/repository.tests.rst @@ -0,0 +1,15 @@ +repository.tests package +======================== + +.. automodule:: repository.tests + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + + repository.tests.test_controllers + diff --git a/docs/source/repository/repository.tests.test_controllers.rst b/docs/source/repository/repository.tests.test_controllers.rst new file mode 100644 index 0000000..56a14b5 --- /dev/null +++ b/docs/source/repository/repository.tests.test_controllers.rst @@ -0,0 +1,7 @@ +repository.tests.test\_controllers module +========================================= + +.. automodule:: repository.tests.test_controllers + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/repository/tests.rst b/docs/source/repository/tests.rst new file mode 100644 index 0000000..29e1e02 --- /dev/null +++ b/docs/source/repository/tests.rst @@ -0,0 +1,15 @@ +tests package +============= + +.. automodule:: tests + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + + tests.test_app + diff --git a/docs/source/repository/tests.test_app.rst b/docs/source/repository/tests.test_app.rst new file mode 100644 index 0000000..9a70fd2 --- /dev/null +++ b/docs/source/repository/tests.test_app.rst @@ -0,0 +1,7 @@ +tests.test\_app module +====================== + +.. automodule:: tests.test_app + :members: + :undoc-members: + :show-inheritance: diff --git a/repository/repository/config.py b/repository/repository/config.py index 2bce7c9..b0b807b 100644 --- a/repository/repository/config.py +++ b/repository/repository/config.py @@ -4,10 +4,21 @@ Docstrings are from the `Flask configuration documentation `_. 
""" -from typing import Optional +from typing import Any, Optional, Type import warnings from os import environ + +def _showwarning(message: str, + *args: Any, + category: Type[Exception] = UserWarning, + filename: str = '', + lineno: int = -1, + **kwargs: Any) -> None: + print(message) + +warnings.showwarning = _showwarning + NAMESPACE = environ.get('NAMESPACE') """Namespace in which this service is deployed; to qualify keys for secrets.""" @@ -133,7 +144,7 @@ BASE_SERVER = environ.get('BASE_SERVER', 'arxiv.org') URLS = [ - + ] """ URLs for external services, for use with :func:`flask.url_for`. diff --git a/repository/repository/controllers.py b/repository/repository/controllers.py index e3c3c48..a9bd939 100644 --- a/repository/repository/controllers.py +++ b/repository/repository/controllers.py @@ -10,14 +10,15 @@ from werkzeug.datastructures import MultiDict from werkzeug.exceptions import NotFound -from .services.record import CanonicalStore, DoesNotExist -from arxiv.canonical.domain import File +from arxiv.canonical.domain import CanonicalFile, VersionedIdentifier + +from .services.record import RepositoryService, NoSuchResource Response = Tuple[Dict[str, Any], HTTPStatus, Dict[str, str]] -FileResponse = Tuple[Union[Dict[str, Any], File], HTTPStatus, Dict[str, str]] -def service_status(params: MultiDict) -> Response: + +def service_status(_: MultiDict) -> Response: """ Handle requests for the service status endpoint. @@ -43,15 +44,19 @@ def get_eprint_events(identifier: str, version: int) -> Response: Raised when the requested identifier + version does not exist. 
""" - estore = CanonicalStore.current_session() + repo = RepositoryService.current_session() + try: + v_identifier = VersionedIdentifier.from_parts(identifier, version) + except (TypeError, ValueError) as e: + raise NotFound(f'No such e-print: {identifier}v{version}') from e try: - eprint = estore.load_eprint(identifier, version) - except DoesNotExist as e: + eprint = repo.register.load_version(v_identifier) + except NoSuchResource as e: raise NotFound(f'No such e-print: {identifier}v{version}') from e - return eprint.history, HTTPStatus.OK, {} + return eprint.events, HTTPStatus.OK, {} -def get_eprint_pdf(identifier: str, version: int) -> FileResponse: +def get_eprint_pdf(identifier: str, version: int) -> Response: """ Retrieve pdf for a specific e-print version. @@ -68,9 +73,13 @@ def get_eprint_pdf(identifier: str, version: int) -> FileResponse: Raised when the requested identifier + version does not exist. """ - estore = CanonicalStore.current_session() + repo = RepositoryService.current_session() + try: + v_identifier = VersionedIdentifier.from_parts(identifier, version) + except (ValueError, TypeError) as e: + raise NotFound(f'No such e-print: {identifier}v{version}') from e try: - eprint = estore.load_eprint(identifier, version) - except DoesNotExist as e: + cf, f = repo.register.load_render(v_identifier) + except NoSuchResource as e: raise NotFound(f'No such e-print: {identifier}v{version}') from e - return eprint.pdf, HTTPStatus.OK, {} + return {'metadata': cf, 'pointer': f}, HTTPStatus.OK, {} diff --git a/repository/repository/factory.py b/repository/repository/factory.py index 8c58b1d..3f1bf00 100644 --- a/repository/repository/factory.py +++ b/repository/repository/factory.py @@ -20,8 +20,8 @@ def create_api_app() -> Flask: Base(app) auth.Auth(app) - app.json_decoder = decoder.CanonicalJSONDecoder - app.json_encoder = encoder.CanonicalJSONEncoder + app.json_decoder = decoder.CanonicalDecoder + app.json_encoder = encoder.CanonicalEncoder 
app.register_blueprint(routes.api.blueprint) wrap(app, [auth.middleware.AuthMiddleware]) diff --git a/repository/repository/routes/api.py b/repository/repository/routes/api.py index 4600ca9..d17fad2 100644 --- a/repository/repository/routes/api.py +++ b/repository/repository/routes/api.py @@ -17,29 +17,33 @@ def service_status() -> Response: Returns ``200 OK`` if the service is up and ready to handle requests. """ - response_data, status_code, headers = controllers.service_status(request.params) - response: Response = jsonify(response_data) - response.status_code = status_code - response.headers.extend(headers) + data, code, headers = controllers.service_status(request.params) + response: Response = jsonify(data) + response.status_code = code + response.headers.extend(headers) # type: ignore return response -@blueprint.route('/e-print/v/events', +@blueprint.route('/e-print/v/events', methods=['GET']) def get_eprint_events(identifier: str, version: int) -> Response: """Get events for a specific version of an e-print.""" data, code, headers = controllers.get_eprint_events(identifier, version) - return jsonify(data), code, headers + response: Response = jsonify(data) + response.status_code = code + response.headers.extend(headers) # type: ignore + return response @blueprint.route('/e-print/v/pdf', methods=['GET']) def get_eprint_pdf(identifier: str, version: int) -> Response: """Get PDF for a specific version of an e-print.""" - pdf, status_code, headers = controllers.get_eprint_pdf(identifier, version) - response: Response = send_file(pdf.content, as_attachment=True, + data, code, headers = controllers.get_eprint_pdf(identifier, version) + pdf = data['metadata'] + response: Response = send_file(data['pointer'], as_attachment=True, attachment_filename=pdf.filename, last_modified=pdf.modified) - response.status_code = status_code - response.headers.extend(headers) + response.status_code = code + response.headers.extend(headers) # type: ignore return response diff --git 
a/repository/repository/services/record.py b/repository/repository/services/record.py index 5a1e76d..bbe5235 100644 --- a/repository/repository/services/record.py +++ b/repository/repository/services/record.py @@ -1,19 +1,48 @@ """Service integration module for reading the canonical record.""" -from arxiv.canonical.domain import EPrint, CanonicalRecord, Identifier, \ - VersionedIdentifier -from arxiv.canonical.services.store import FakeCanonicalStore, DoesNotExist +from typing import Optional +from flask import Flask, current_app, g -# TODO: switch to the real store.CanonicalStore implementation when it is -# available. -class CanonicalStore(FakeCanonicalStore): - """ - Extension of :class:`.store.CanonicalStore` with additional read patterns. - """ +from arxiv.canonical import Repository, NoSuchResource +from arxiv.canonical.services.store import CanonicalStore - def load_eprint(self, identifier: Identifier, version: int) -> EPrint: - """Load an :class:`.EPrint` from the record.""" - record = self.load_record() - v_identifier = VersionedIdentifier.from_parts(identifier, version) - return record.load_eprint(v_identifier) \ No newline at end of file + +class RepositoryService(Repository): + @classmethod + def init_app(cls, app: Flask) -> None: + """Set default configuration parameters for an app instance.""" + app.config.setdefault(f'CANONICAL_BUCKET', 'arxiv-canonical-record') + app.config.setdefault(f'CANONICAL_VERIFY', True) + app.config.setdefault('AWS_REGION', 'us-east-1') + app.config.setdefault('AWS_ACCESS_KEY_ID', None) + app.config.setdefault('AWS_SECRET_ACCESS_KEY', None) + + @classmethod + def get_session(cls, app: Optional[Flask] = None) -> 'RepositoryService': + """Get a new session with the RepositoryService.""" + if app is None: + app = current_app + try: + params = app.config.get_namespace(f'CANONICAL_') + storage = CanonicalStore( + params['bucket'], + verify=params.get('verify', True), + region_name=app.config['AWS_REGION'], + 
endpoint_url=params.get('endpoint_url', None), + aws_access_key_id=app.config['AWS_ACCESS_KEY_ID'], + aws_secret_access_key=app.config['AWS_SECRET_ACCESS_KEY'] + ) + except KeyError as e: + raise RuntimeError('Must call init_app() on app before use') from e + return cls(storage, [storage], None) + + @classmethod + def current_session(cls) -> 'RepositoryService': + """Get or create a RepositoryService session for this context.""" + if not g: + return cls.get_session() + elif 'repository' not in g: + g.repository = cls.get_session() + session: RepositoryService = g.repository + return session \ No newline at end of file diff --git a/repository/repository/tests/test_controllers.py b/repository/repository/tests/test_controllers.py index fdf9daa..ae44b0c 100644 --- a/repository/repository/tests/test_controllers.py +++ b/repository/repository/tests/test_controllers.py @@ -1,58 +1,123 @@ """Tests for :mod:`repository.controllers`.""" -from unittest import TestCase, mock +import io +from datetime import datetime from http import HTTPStatus +from unittest import TestCase, mock +from pytz import UTC from werkzeug.exceptions import NotFound -from arxiv.canonical.domain import Event, File -from arxiv.canonical.services import store -from ..services.record import CanonicalStore +from arxiv.canonical import domain as D +from arxiv.canonical import NoSuchResource +from ..services.record import RepositoryService from .. 
import controllers -class TestGetEPrintEvents(TestCase): +class ControllerTestCase(TestCase): + def setUp(self): + self.mock_pdf = lambda *a, **k: io.BytesIO(b'foocontent') + + identifier = D.VersionedIdentifier('2901.00345v1') + created = datetime(2029, 1, 29, 20, 4, 23, tzinfo=UTC) + listing_id = D.ListingIdentifier.from_parts(created.date(), 'foo') + + self.render = D.CanonicalFile( + filename='2901.00345v1.pdf', + modified=created, + size_bytes=404, + content_type=D.ContentType.pdf, + ref=D.URI('/fake/path.pdf') + ) + + self.version = D.Version( + identifier=identifier, + announced_date=created.date(), + announced_date_first=created.date(), + submitted_date=created, + updated_date=created, + is_announced=True, + events=[], + previous_versions=[], + metadata=D.Metadata( + primary_classification=D.Category('cs.DL'), + secondary_classification=[D.Category('cs.IR')], + title='Foo title', + abstract='It is abstract', + authors='Ima N. Author (FSU)', + license=D.License(href="http://some.license") + ), + source=D.CanonicalFile( + filename='2901.00345v1.tar', + modified=created, + size_bytes=4_304, + content_type=D.ContentType.tar, + ref=D.URI('/fake/path.tar'), + is_gzipped=False, + ), + render=self.render + ) + self.event = D.Event( + identifier=identifier, + event_date=created, + event_type=D.EventType.NEW, + categories=[D.Category('cs.DL')], + version=self.version + ) + self.version.events.append(self.event.summary) + + +class TestGetEPrintEvents(ControllerTestCase): """Tests for :func:`.controllers.get_eprint_events`.""" - @mock.patch(f'{controllers.__name__}.CanonicalStore') - def test_request_for_nonexistant_eprint(self, mock_CanonicalStore): + @mock.patch(f'{controllers.__name__}.RepositoryService') + def test_request_for_nonexistant_eprint(self, mock_RepositoryService): """A request is received for a nonexistant e-print.""" - mock_CanonicalStore.current_session.return_value = mock.MagicMock( - load_eprint=mock.MagicMock(side_effect=store.DoesNotExist) - ) + 
mock_repo = mock.MagicMock() + mock_repo.register.load_version.side_effect = NoSuchResource + mock_RepositoryService.current_session.return_value = mock_repo with self.assertRaises(NotFound): controllers.get_eprint_events('1901.00123', 4) - - @mock.patch(f'{controllers.__name__}.CanonicalStore') - def test_request_for_existant_eprint(self, mock_CanonicalStore): + + @mock.patch(f'{controllers.__name__}.RepositoryService') + def test_request_for_existant_eprint(self, mock_RepositoryService): """A request is received for an existant e-print.""" - mock_CanonicalStore.current_session.return_value \ - = CanonicalStore.current_session() + mock_repo = mock.MagicMock() + mock_repo.register.load_version.return_value = self.version + mock_RepositoryService.current_session.return_value = mock_repo + data, code, headers = controllers.get_eprint_events('1901.00123', 4) + self.assertIsInstance(data, list, 'Returns a list of objects') self.assertGreater(len(data), 0, 'Returns at least one object') for obj in data: - self.assertIsInstance(obj, Event, 'List items are Events') + self.assertIsInstance(obj, D.EventSummary, + 'List items are EventSummary') self.assertEqual(code, HTTPStatus.OK, 'Return status is 200 OK') -class TestGetEPrintPDF(TestCase): +class TestGetEPrintPDF(ControllerTestCase): """Tests for :func:`.controllers.get_eprint_pdf`.""" - @mock.patch(f'{controllers.__name__}.CanonicalStore') - def test_request_for_nonexistant_eprint(self, mock_CanonicalStore): + @mock.patch(f'{controllers.__name__}.RepositoryService') + def test_request_for_nonexistant_eprint(self, mock_RepositoryService): """A request is received for a nonexistant e-print.""" - mock_CanonicalStore.current_session.return_value = mock.MagicMock( - load_eprint=mock.MagicMock(side_effect=store.DoesNotExist) - ) + mock_repo = mock.MagicMock() + mock_repo.register.load_render.side_effect = NoSuchResource + mock_RepositoryService.current_session.return_value = mock_repo + with self.assertRaises(NotFound): 
controllers.get_eprint_pdf('1901.00123', 4) - @mock.patch(f'{controllers.__name__}.CanonicalStore') - def test_request_for_existant_eprint(self, mock_CanonicalStore): + @mock.patch(f'{controllers.__name__}.RepositoryService') + def test_request_for_existant_eprint(self, mock_RepositoryService): """A request is received for an existant e-print.""" - mock_CanonicalStore.current_session.return_value \ - = CanonicalStore.current_session() + mock_repo = mock.MagicMock() + mock_repo.register.load_render.return_value \ + = self.render, io.BytesIO(b'foo') + mock_RepositoryService.current_session.return_value = mock_repo + data, code, headers = controllers.get_eprint_pdf('1901.00123', 4) - self.assertIsInstance(data, File, 'Returns a File') + self.assertIsInstance(data['metadata'], D.CanonicalFile, + 'Returns a File') self.assertEqual(code, HTTPStatus.OK, 'Return status is 200 OK') diff --git a/repository/tests/test_app.py b/repository/tests/test_app.py index f35d3f1..5e82669 100644 --- a/repository/tests/test_app.py +++ b/repository/tests/test_app.py @@ -1,62 +1,121 @@ -from unittest import TestCase, mock +import io +from datetime import datetime from http import HTTPStatus +from unittest import TestCase, mock -from repository.services.record import CanonicalStore, DoesNotExist -from repository.factory import create_api_app +from pytz import UTC +from arxiv.canonical import Primary +from arxiv.canonical import domain as D +from arxiv.canonical.services.store import InMemoryStorage + +from repository.services import record +from repository.factory import create_api_app -class TestGetEPrintEvents(TestCase): - """Requests for e-print events.""" +class AppTestCase(TestCase): def setUp(self): - """Spin up an app.""" + self.mock_source = mock.MagicMock() + self.mock_source.can_resolve.return_value = True + + self.mock_source.load = \ + lambda *a, **k: io.BytesIO(b'foocontent') + + self.storage = InMemoryStorage() + self.primary = Primary( + self.storage, + [self.storage, 
self.mock_source], + mock.MagicMock() + ) + + identifier = D.VersionedIdentifier('2901.00345v1') + created = datetime(2029, 1, 29, 20, 4, 23, tzinfo=UTC) + listing_id = D.ListingIdentifier.from_parts(created.date(), 'foo') + + version = D.Version( + identifier=identifier, + announced_date=created.date(), + announced_date_first=created.date(), + submitted_date=created, + updated_date=created, + is_announced=True, + events=[], + previous_versions=[], + metadata=D.Metadata( + primary_classification=D.Category('cs.DL'), + secondary_classification=[D.Category('cs.IR')], + title='Foo title', + abstract='It is abstract', + authors='Ima N. Author (FSU)', + license=D.License(href="http://some.license") + ), + source=D.CanonicalFile( + filename='2901.00345v1.tar', + modified=created, + size_bytes=4_304, + content_type=D.ContentType.tar, + ref=D.URI('/fake/path.tar'), + is_gzipped=False, + ), + render=D.CanonicalFile( + filename='2901.00345v1.pdf', + modified=created, + size_bytes=404, + content_type=D.ContentType.pdf, + ref=D.URI('/fake/path.pdf') + ) + ) + self.event = D.Event( + identifier=identifier, + event_date=created, + event_type=D.EventType.NEW, + categories=[D.Category('cs.DL')], + version=version + ) + self.timestamp = created + self.event_date = self.timestamp.date() + self.primary.register.add_events(self.event) + self.app = create_api_app() + record.RepositoryService.init_app(self.app) self.client = self.app.test_client() - @mock.patch(f'repository.controllers.CanonicalStore') + +class TestGetEPrintEvents(AppTestCase): + """Requests for e-print events.""" + + @mock.patch(f'{record.__name__}.CanonicalStore') def test_request_for_nonexistant_eprint(self, mock_CanonicalStore): """Get events for a nonexistant e-print.""" - mock_CanonicalStore.current_session.return_value = mock.MagicMock( - load_eprint=mock.MagicMock(side_effect=DoesNotExist) - ) + mock_CanonicalStore.return_value = self.storage response = self.client.get('/e-print/1902.00123v4/events') 
self.assertEqual(response.status_code, HTTPStatus.NOT_FOUND) - - @mock.patch(f'repository.controllers.CanonicalStore') + + @mock.patch(f'{record.__name__}.CanonicalStore') def test_request_for_existant_eprint(self, mock_CanonicalStore): """A request is received for an existant e-print.""" - mock_CanonicalStore.current_session.return_value \ - = CanonicalStore.current_session() - - response = self.client.get('/e-print/1902.00123v4/events') + mock_CanonicalStore.return_value = self.storage + response = self.client.get('/e-print/2901.00345v1/events') self.assertEqual(response.status_code, HTTPStatus.OK) -class TestGetEPrintPDF(TestCase): +class TestGetEPrintPDF(AppTestCase): """Requests for e-print PDFs.""" - def setUp(self): - """Spin up an app.""" - self.app = create_api_app() - self.client = self.app.test_client() - - @mock.patch(f'repository.controllers.CanonicalStore') + @mock.patch(f'{record.__name__}.CanonicalStore') def test_request_for_nonexistant_eprint(self, mock_CanonicalStore): """Get PDF for a nonexistant e-print.""" - mock_CanonicalStore.current_session.return_value = mock.MagicMock( - load_eprint=mock.MagicMock(side_effect=DoesNotExist) - ) + mock_CanonicalStore.return_value = self.storage response = self.client.get('/e-print/1902.00123v4/pdf') self.assertEqual(response.status_code, HTTPStatus.NOT_FOUND) - @mock.patch(f'repository.controllers.CanonicalStore') + @mock.patch(f'{record.__name__}.CanonicalStore') def test_request_for_existant_eprint(self, mock_CanonicalStore): """A request is received for an existant e-print.""" - mock_CanonicalStore.current_session.return_value \ - = CanonicalStore.current_session() - response = self.client.get('/e-print/1902.00123v4/pdf') + mock_CanonicalStore.return_value = self.storage + response = self.client.get('/e-print/2901.00345v1/pdf') self.assertEqual(response.status_code, HTTPStatus.OK) self.assertEqual(response.headers['Content-Disposition'], - 'attachment; filename=1901.00123.pdf') - 
self.assertEqual(response.data, b'foopdf') + 'attachment; filename=2901.00345v1.pdf') + self.assertEqual(response.data, b'foocontent') diff --git a/schema/Identifier.json b/schema/Identifier.json new file mode 100644 index 0000000..da88a9e --- /dev/null +++ b/schema/Identifier.json @@ -0,0 +1,5 @@ +{ + "title": "Identifier", + "type": "string", + "pattern": "^([0-9]{4}\\.[0-9]{4,5})|([a-z\\-]+\\/[0-9]{2}[01][0-9]{4})$" +} \ No newline at end of file diff --git a/schema/resources/EPrintMetadata.json b/schema/resources/Abs.json similarity index 68% rename from schema/resources/EPrintMetadata.json rename to schema/resources/Abs.json index 50b3875..f021ddb 100644 --- a/schema/resources/EPrintMetadata.json +++ b/schema/resources/Abs.json @@ -1,34 +1,29 @@ { - "title": "EPrint", - "description": "Canonical metadata record for an arXiv e-print.", + "title": "Abs", + "description": "Legacy metadata record for an arXiv e-print.", "type": "object", "additionalProperties": false, + "required": [ + "identifier", + "submitted_date", + "announced_month", + "license", + "primary_classification", + "title", + "abstract", + "authors" + ], "properties": { "@type": { "type": "string" }, - "arxiv_id": { + "identifier": { "description": "Canonical arXiv e-print identifier", - "type": "string" - }, - "version": { - "description": "The version number for this e-print.", - "minimum": 1, - "type": "integer" + "$ref": "./VersionedIdentifier.json" }, "is_withdrawn": { "type": "boolean" }, - "reason_for_withdrawal": { - "oneOf": [ - {"type": "string"}, - {"type": "null"} - ] - }, - "legacy": { - "description": "Legacy records were carried forward from arXiv Classic.", - "type": "boolean" - }, "submitter": { "description": "The person who submitted the e-print.", "oneOf": [ @@ -41,9 +36,10 @@ "type": "string", "format": "date-time" }, - "announced_date": { + "announced_month": { "description": "Year and month (``%Y-%m``) this e-print version was announced.", - "type": "string" + "type": 
"string", + "pattern": "^[0-9]{4}-[0-9]{2}$" }, "license": { "$ref": "./License.json" @@ -53,34 +49,29 @@ "items": { "type": "object", "properties": { - "arxiv_id": { + "identifier": { "description": "Canonical arXiv e-print identifier", - "type": "string" - }, - "version": { - "description": "The version number for this e-print.", - "minimum": 1, - "type": "integer" + "$ref": "./VersionedIdentifier.json" }, "submitted_date": { "description": "Date this version of the e-print was submitted.", "type": "string", "format": "date-time" }, - "announced_date": { + "announced_month": { "description": "Year and month (``%Y-%m``) this e-print version was announced.", + "type": "string", + "pattern": "^[0-9]{4}-[0-9]{2}$" + }, + "source_type": { "type": "string" + }, + "size_kilobytes": { + "type": "number" } } } }, - "history": { - "type": "array", - "description": "Log of all transformative events for this e-print, up to and including this version.", - "items": { - "$ref": "./Event.json" - } - }, "primary_classification": { "type": "string" }, @@ -149,18 +140,6 @@ {"type": "string"}, {"type": "null"} ] - }, - "source_package": { - "oneOf": [ - {"$ref": "./File.json"}, - {"type": "null"} - ] - }, - "pdf": { - "oneOf": [ - {"$ref": "./File.json"}, - {"type": "null"} - ] } } } diff --git a/schema/resources/CanonicalFile.json b/schema/resources/CanonicalFile.json new file mode 100644 index 0000000..adfc144 --- /dev/null +++ b/schema/resources/CanonicalFile.json @@ -0,0 +1,29 @@ +{ + "title": "CanonicalFile", + "type": "object", + "properties": { + "creaged": { + "type": "string", + "format": "date-time" + }, + "modified": { + "type": "string", + "format": "date-time" + }, + "size_bytes": { + "type": "number" + }, + "content_type": { + "$ref": "./ContentType.json" + }, + "filename": { + "oneOf": [ + {"type": "string"}, + {"type": "null"} + ] + }, + "ref": { + "type": "string" + } + } +} \ No newline at end of file diff --git a/schema/resources/ContentType.json 
b/schema/resources/ContentType.json new file mode 100644 index 0000000..a0eee0b --- /dev/null +++ b/schema/resources/ContentType.json @@ -0,0 +1,12 @@ +{ + "type": "string", + "enum": [ + "pdf", + "tar", + "json", + "abs", + "html", + "tex", + "ps" + ] +} \ No newline at end of file diff --git a/schema/resources/Event.json b/schema/resources/Event.json index b87eb79..39753e8 100644 --- a/schema/resources/Event.json +++ b/schema/resources/Event.json @@ -1,44 +1,15 @@ { "title": "Event", - "type": "object", - "properties": { - "@type": { - "type": "string" - }, - "arxiv_id": { - "description": "Canonical arXiv e-print identifier", - "type": "string" - }, - "version": { - "description": "The version number for this e-print.", - "minimum": 1, - "type": "integer" - }, - "legacy": { - "description": "Legacy records were carried forward from arXiv Classic.", - "type": "boolean" - }, - "event_date": { - "type": "string", - "format": "date-time" - }, - "event_type": { - "type": "string", - "enum": [ - "created", - "updated", - "cross", - "withdrawn" - ] - }, - "event_agent": { - "type": "string", - "description": "Username or service name." - }, - "description": { - "type": "string", - "description": "Brief description of the event and its context." 
+ "allOf": [ + {"$ref": "./EventBase.json"}, + { + "type": "object", + "required": ["version"], + "properties": { + "version": { + "$ref": "./Version.json" + } + } } - }, - "required": ["event_date", "event_type", "event_agent"] + ] } diff --git a/schema/resources/EventBase.json b/schema/resources/EventBase.json new file mode 100644 index 0000000..5f958c4 --- /dev/null +++ b/schema/resources/EventBase.json @@ -0,0 +1,51 @@ +{ + "title": "EventBase", + "type": "object", + "required": [ + "identifier", + "event_date", + "event_type", + "event_id", + "is_legacy" + ], + "properties": { + "@type": { + "type": "string" + }, + "identifier": { + "description": "Canonical arXiv e-print identifier", + "$ref": "./VersionedIdentifier.json" + }, + "event_date": { + "type": "string", + "format": "date-time" + }, + "event_type": { + "$ref": "./EventType.json" + }, + "event_id": { + "$ref": "./EventIdentifier.json" + }, + "categories": { + "type": "array", + "items": { + "type": "string" + } + }, + "description": { + "type": "string", + "description": "Brief description of the event and its context." + }, + "is_legacy": { + "description": "Legacy records were carried forward from arXiv Classic.", + "type": "boolean" + }, + "event_agent": { + "oneOf": [ + {"type": "string"}, + {"type": "null"} + ], + "description": "Username or service name." 
+ } + } +} diff --git a/schema/resources/EventIdentifier.json b/schema/resources/EventIdentifier.json new file mode 100644 index 0000000..7fa836e --- /dev/null +++ b/schema/resources/EventIdentifier.json @@ -0,0 +1,4 @@ +{ + "title": "EventIdentifier", + "type": "string" +} \ No newline at end of file diff --git a/schema/resources/EventSummary.json b/schema/resources/EventSummary.json new file mode 100644 index 0000000..f82df62 --- /dev/null +++ b/schema/resources/EventSummary.json @@ -0,0 +1,4 @@ +{ + "title": "EventSummary", + "$ref": "./EventBase.json" +} diff --git a/schema/resources/EventType.json b/schema/resources/EventType.json new file mode 100644 index 0000000..e711705 --- /dev/null +++ b/schema/resources/EventType.json @@ -0,0 +1,14 @@ +{ + "type": "string", + "enum": [ + "new", + "update", + "update_metadata", + "replace", + "cross", + "jref", + "withdraw", + "migrate", + "migrae_metadata" + ] +} \ No newline at end of file diff --git a/schema/resources/File.json b/schema/resources/File.json index 3e35303..41b846b 100644 --- a/schema/resources/File.json +++ b/schema/resources/File.json @@ -14,10 +14,6 @@ "checksum": { "type": "string" }, - "created": { - "type": "string", - "format": "date-time" - }, "modified": { "type": "string", "format": "date-time" diff --git a/schema/resources/Listing.json b/schema/resources/Listing.json index d04cc96..c9ed26e 100644 --- a/schema/resources/Listing.json +++ b/schema/resources/Listing.json @@ -2,10 +2,14 @@ "title": "Listing", "description": "Canonical announcement record.", "type": "object", + "additionalProperties": false, "properties": { "@type": { "type": "string" }, + "identifier": { + "type": "string" + }, "date": { "type": "string", "format": "date-time" @@ -13,7 +17,7 @@ "events": { "type": "array", "items": { - "$ref": "./ListingEvent.json" + "$ref": "./Event.json" } } } diff --git a/schema/resources/Metadata.json b/schema/resources/Metadata.json new file mode 100644 index 0000000..8289cbf --- /dev/null +++ 
b/schema/resources/Metadata.json @@ -0,0 +1,69 @@ +{ + "title": "Metadata", + "type": "object", + "properties": { + "@type": { + "type": "string" + }, + "primary_classification": { + "type": "string" + }, + "secondary_classification": { + "type": "array", + "items": { + "type": "string" + } + }, + "title": { + "type": "string" + }, + "abstract": { + "type": "string" + }, + "authors": { + "type": "string" + }, + "license": { + "$ref": "./License.json" + }, + "comments": { + "oneOf": [ + {"type": "string"}, + {"type": "null"} + ] + }, + + "journal_ref": { + "oneOf": [ + {"type": "string"}, + {"type": "null"} + ] + }, + "report_num": { + "oneOf": [ + {"type": "string"}, + {"type": "null"} + ] + }, + "doi": { + "oneOf": [ + {"type": "string"}, + {"type": "null"} + ] + }, + "msc_class": { + "description": "Classifications from American Mathematical Society Mathematical Subject Classification (MSC)", + "oneOf": [ + {"type": "string"}, + {"type": "null"} + ] + }, + "acm_class": { + "description": "Classifications from ACM Computing Classification System", + "oneOf": [ + {"type": "string"}, + {"type": "null"} + ] + } + } +} \ No newline at end of file diff --git a/schema/resources/Version.json b/schema/resources/Version.json new file mode 100644 index 0000000..2ebfa2e --- /dev/null +++ b/schema/resources/Version.json @@ -0,0 +1,117 @@ +{ + "title": "Version", + "type": "object", + "additionalProperties": false, + "required": [ + "identifier", + "announced_date", + "announced_date_first", + "submitted_date", + "updated_date", + "metadata", + "is_announced", + "is_withdrawn", + "events", + "is_legacy", + "render", + "source" + ], + "properties": { + "@type": { + "type": "string" + }, + "identifier": { + "description": "Canonical arXiv e-print identifier", + "$ref": "./VersionedIdentifier.json" + }, + "announced_date": { + "description": "Date this e-print version was announced.", + "type": "string", + "format": "date" + }, + "announced_date_first": { + "description": "Date 
that the first version of the e-print was announced.", + "type": "string", + "format": "date" + }, + "submitted_date": { + "description": "Date this version of the e-print was submitted.", + "type": "string", + "format": "date-time" + }, + "updated_date": { + "description": "The last time the record for this version was written.", + "type": "string", + "format": "date-time" + }, + "metadata": { + "$ref": "./Metadata.json" + }, + "events": { + "type": "array", + "items": { + "$ref": "./EventSummary.json" + } + }, + "previous_versions": { + "type": "array", + "items": { + "$ref": "./VersionReference.json" + } + }, + "submitter": { + "oneOf": [ + {"$ref": "./Person.json"}, + {"type": "null"} + ] + }, + "proxy": { + "oneOf": [ + {"type": "string"}, + {"type": "null"} + ] + }, + "is_announced": { + "type": "boolean" + }, + "is_withdrawn": { + "type": "boolean" + }, + "reason_for_withdrawal": { + "oneOf": [ + {"type": "string"}, + {"type": "null"} + ] + }, + "is_legacy": { + "description": "Legacy records were carried forward from arXiv Classic.", + "type": "boolean" + }, + "render": { + "$ref": "./CanonicalFile.json" + }, + "source": { + "$ref": "./CanonicalFile.json" + }, + "formats": { + "type": "array", + "items": { + "type": "object", + "properties": { + "format": { + "type": "string" + }, + "content": { + "$ref": "./CanonicalFile.json" + } + } + } + }, + "source_type": { + "oneOf": [ + {"type": "string"}, + {"type": "null"} + ] + } + } +} \ No newline at end of file diff --git a/schema/resources/VersionReference.json b/schema/resources/VersionReference.json new file mode 100644 index 0000000..bb5ed5f --- /dev/null +++ b/schema/resources/VersionReference.json @@ -0,0 +1,23 @@ +{ + "title": "VersionReference", + "type": "object", + "properties": { + "@type": { + "type": "string" + }, + "identifier": { + "description": "Canonical arXiv e-print identifier", + "$ref": "./VersionedIdentifier.json" + }, + "announced_date": { + "description": "Date this e-print version was 
announced.", + "type": "string", + "format": "date" + }, + "submitted_date": { + "description": "Date this version of the e-print was submitted.", + "type": "string", + "format": "date-time" + } + } +} \ No newline at end of file diff --git a/schema/resources/VersionedIdentifier.json b/schema/resources/VersionedIdentifier.json new file mode 100644 index 0000000..00e38db --- /dev/null +++ b/schema/resources/VersionedIdentifier.json @@ -0,0 +1,5 @@ +{ + "title": "VersionedIdentifier", + "type": "string", + "pattern": "^(([0-9]{4}\\.[0-9]{4,5})|([a-z\\-]+\\/[0-9]{2}[01][0-9]{4}))v[0-9]+$" +} \ No newline at end of file diff --git a/tests/type-check.sh b/tests/type-check.sh index a864bbc..3e405cd 100755 --- a/tests/type-check.sh +++ b/tests/type-check.sh @@ -4,13 +4,13 @@ set -e PROJECT=$1 -touch ${PROJECT}/__init__.py -MYPY_STATUS=$( pipenv run mypy -p ${PROJECT} | tee /dev/tty | grep -v "test.*" | wc -l | tr -d '[:space:]' ) +touch ${PROJECT}/__init__.py || echo "Not a package..." +MYPY_STATUS=$( pipenv run mypy -p ${PROJECT} | grep -v "test.*" | tee /dev/tty | wc -l | tr -d '[:space:]' ) if [ $MYPY_STATUS -ne 0 ]; then MYPY_STATE="failure" && echo "mypy failed"; else MYPY_STATE="success" && echo "mypy passed"; fi if [ -z ${GITHUB_TOKEN} ]; then echo "Github token not set; will not report results"; -else +else curl -u $USERNAME:$GITHUB_TOKEN \ -d '{"state": "'$MYPY_STATE'", "target_url": "https://travis-ci.org/'$TRAVIS_REPO_SLUG'/builds/'$TRAVIS_BUILD_ID'", "description": "", "context": "'$PROJECT'/code-quality/mypy"}' \ -XPOST https://api.github.com/repos/$TRAVIS_REPO_SLUG/statuses/$SHA \ diff --git a/transform.py b/transform.py deleted file mode 100644 index b903ede..0000000 --- a/transform.py +++ /dev/null @@ -1,40 +0,0 @@ -"""Transform Classic abs files into NG canonical format.""" - -import os -from typing import Iterable - -import click - -from arxiv import canonical - - -@click.command() -@click.option('--abs', help='Path to directory with classic abs files') 
-@click.option('--daily', help='Path to daily.log file') -@click.option('--output', help='Path to directory for NG canonical data') -def transform(abs: str, daily: str, output: str): - """.""" - if not os.path.exists(abs): - raise RuntimeError(f'No such path: {abs}') - if not os.path.exists(daily): - raise RuntimeError(f'No such path: {daily}') - if not os.path.exists(output): - os.makedirs(output) - - iter_data = (canonical.serialize.classic.parse(abs_path) - for abs_path in iter_abs(input)) - - for record in iter_data: - print(record.version, record.comments) - - -def iter_abs(input: str) -> Iterable[str]: - """Grab paths to abs files from the ``input`` directory.""" - for parent, dirs, fnames in os.walk(input): - for fname in fnames: - if fname.endswith('.abs'): - yield os.path.join(parent, fname) - - -if __name__ == '__main__': - transform()