From 8b5d1dbbb398ef1e4caf451aad372ee62f7aa200 Mon Sep 17 00:00:00 2001 From: Wenjun Si Date: Thu, 3 Oct 2024 09:56:17 +0800 Subject: [PATCH] Releases v0.12.0 (#251) --- License | 2 +- MANIFEST.in | 1 - benchmarks/perf_storage_api_arrow.py | 13 +- benchmarks/perf_tabletunnel.py | 57 +- benchmarks/perf_types.py | 28 +- bin/copyright.py | 200 +++ bin/string_fixer.py | 105 ++ cupid/io/table/__init__.py | 5 - cupid/io/table/core.py | 14 - cupid/io/table/pd.py | 142 -- cupid/runtime/ctypes_libs.py | 22 - docs/Makefile | 1 + docs/source/api-def.rst | 1 + docs/source/base-schemas.rst | 20 +- docs/source/base-sql.rst | 48 +- docs/source/base-sqlalchemy.rst | 33 +- docs/source/base-tables.rst | 150 +- docs/source/conf.py | 10 +- docs/source/faq.rst | 2 +- docs/source/locale/en/LC_MESSAGES/api-def.po | 875 ++++++---- docs/source/locale/en/LC_MESSAGES/api-df.po | 24 +- docs/source/locale/en/LC_MESSAGES/api.po | 2 +- .../locale/en/LC_MESSAGES/base-dbapi.po | 22 +- .../locale/en/LC_MESSAGES/base-functions.po | 11 +- .../locale/en/LC_MESSAGES/base-instances.po | 50 +- .../locale/en/LC_MESSAGES/base-models.po | 40 +- .../locale/en/LC_MESSAGES/base-projects.po | 6 +- .../locale/en/LC_MESSAGES/base-resources.po | 63 +- .../locale/en/LC_MESSAGES/base-schemas.po | 84 +- docs/source/locale/en/LC_MESSAGES/base-sql.po | 287 ++-- .../locale/en/LC_MESSAGES/base-sqlalchemy.po | 145 +- .../locale/en/LC_MESSAGES/base-tables.po | 795 +++++---- docs/source/locale/en/LC_MESSAGES/base.po | 11 +- docs/source/locale/en/LC_MESSAGES/df-agg.po | 46 +- docs/source/locale/en/LC_MESSAGES/df-basic.po | 321 ++-- .../en/LC_MESSAGES/df-debug-instruction.po | 47 +- .../locale/en/LC_MESSAGES/df-element.po | 159 +- docs/source/locale/en/LC_MESSAGES/df-merge.po | 77 +- docs/source/locale/en/LC_MESSAGES/df-plot.po | 10 +- .../locale/en/LC_MESSAGES/df-quickstart.po | 43 +- .../en/LC_MESSAGES/df-sort-distinct-apply.po | 259 +-- .../source/locale/en/LC_MESSAGES/df-window.po | 20 +- docs/source/locale/en/LC_MESSAGES/df.po | 22 +- docs/source/locale/en/LC_MESSAGES/faq-ext.po | 14 +- docs/source/locale/en/LC_MESSAGES/faq.po | 80 +- docs/source/locale/en/LC_MESSAGES/index.po | 27 +- .../locale/en/LC_MESSAGES/installation-ext.po | 11 +- docs/source/locale/en/LC_MESSAGES/options.po | 48 +- .../locale/en/LC_MESSAGES/platform-d2.po | 299 ++-- .../en/LC_MESSAGES/platform-migrate-ext.po | 20 +- docs/source/locale/en/LC_MESSAGES/platform.po | 15 +- .../en/LC_MESSAGES/pyodps-pack-minikube.po | 28 +- .../locale/en/LC_MESSAGES/pyodps-pack.po | 529 +++--- docs/source/norm_zh.py | 155 ++ docs/source/options.rst | 5 +- docs/source/platform-d2.rst | 65 +- docs/source/pyodps-pack.rst | 13 +- notebooks/nb_init.py | 1 + odps/__init__.py | 19 +- odps/_version.py | 12 +- odps/accounts.py | 206 ++- odps/apis/__init__.py | 4 +- odps/apis/storage_api/__init__.py | 4 +- odps/apis/storage_api/conftest.py | 6 +- odps/apis/storage_api/storage_api.py | 218 ++- odps/apis/storage_api/tests/__init__.py | 7 +- odps/apis/storage_api/tests/data_item.conf | 2 +- .../tests/record_batch_generate.py | 32 +- .../storage_api/tests/test_storage_api.py | 28 +- .../tests/test_storage_api_arrow.py | 77 +- odps/apis/storage_api/tests/util.py | 26 +- odps/compat.py | 132 +- odps/config.py | 422 +++-- odps/conftest.py | 14 +- odps/console.py | 289 ++-- odps/core.py | 1448 ++++++++--------- odps/counters.py | 19 +- odps/crc.py | 149 +- odps/dag.py | 54 +- odps/dbapi.py | 122 +- odps/df/backends/frame.py | 4 +- odps/df/backends/odpssql/engine.py | 20 +- 
odps/df/backends/odpssql/tests/test_engine.py | 56 +- odps/df/backends/odpssql/types.py | 1 - odps/df/backends/pd/tests/test_engine.py | 13 +- odps/df/backends/tests/test_mixed_engine.py | 6 +- odps/df/tests/test_delay.py | 23 +- odps/distcache.py | 5 +- odps/errors.py | 132 +- odps/examples/tables.py | 461 ++++-- odps/inter.py | 114 +- odps/ipython/__init__.py | 4 +- odps/ipython/completer.py | 95 +- odps/ipython/magics.py | 193 ++- odps/ipython/tests/test_completer.py | 38 +- odps/ipython/tests/test_magics.py | 30 +- odps/lab_extension/.eslintignore | 5 - odps/lab_extension/.eslintrc.js | 39 - odps/lab_extension/.gitignore | 112 -- odps/lab_extension/.prettierignore | 5 - odps/lab_extension/.prettierrc | 5 - odps/lab_extension/LICENSE | 28 - odps/lab_extension/MANIFEST.in | 24 - odps/lab_extension/README.md | 62 - odps/lab_extension/install.json | 5 - odps/lab_extension/package.json | 92 -- .../pyodps-lab-extension/_version.py | 19 - odps/lab_extension/pyproject.toml | 3 - odps/lab_extension/setup.py | 94 -- odps/lab_extension/src/Container/index.ts | 19 - .../DataVisualization/RenderTableSchema.tsx | 59 - .../ConfigBuilder/EditorConfigBuilder.ts | 31 - .../ConfigBuilder/LSPEditorConfigBuilder.ts | 105 -- .../MonacoEditorConfigBuilder.ts | 67 - odps/lab_extension/src/Editor/SqlEditor.tsx | 133 -- odps/lab_extension/src/Editor/Template.ts | 95 -- .../src/Register/RegisterDataVisualization.ts | 18 - .../src/Register/RegisterSelectCellType.ts | 27 - .../src/Register/RegisterSqlEditorReverser.ts | 31 - odps/lab_extension/src/Utils/injectCDN.ts | 18 - odps/lab_extension/src/Utils/isInner.ts | 3 - odps/lab_extension/src/global.d.ts | 4 - odps/lab_extension/src/index.ts | 38 - odps/lab_extension/style/base.css | 0 odps/lab_extension/style/index.css | 1 - odps/lab_extension/style/index.js | 1 - odps/lab_extension/tsconfig.json | 24 - odps/lib/tblib/__init__.py | 251 ++- odps/lib/tblib/cpython.py | 4 +- odps/lib/tblib/decorators.py | 13 +- odps/lib/tblib/pickling_support.py | 87 +- odps/models/__init__.py | 45 +- odps/models/cache.py | 32 +- odps/models/cluster_info.py | 12 +- odps/models/core.py | 69 +- odps/models/function.py | 47 +- odps/models/functions.py | 43 +- odps/models/instance.py | 861 ++++++---- odps/models/instances.py | 152 +- odps/models/job.py | 33 +- odps/models/ml/__init__.py | 4 +- odps/models/ml/offlinemodel.py | 54 +- odps/models/ml/offlinemodels.py | 28 +- odps/models/partition.py | 174 +- odps/models/partitions.py | 83 +- odps/models/project.py | 135 +- odps/models/projects.py | 59 +- odps/models/quota.py | 142 ++ odps/models/quotas.py | 79 + odps/models/readers.py | 232 ++- odps/models/record.py | 10 +- odps/models/resource.py | 308 ++-- odps/models/resourcefile.py | 39 +- odps/models/resources.py | 58 +- odps/models/schema.py | 44 +- odps/models/schemas.py | 37 +- odps/models/security/config.py | 64 +- odps/models/security/roles.py | 35 +- odps/models/security/users.py | 37 +- odps/models/session.py | 558 ++++++- odps/models/storage_tier.py | 13 +- odps/models/table.py | 678 +++++--- odps/models/tableio.py | 513 +++++- odps/models/tables.py | 117 +- odps/models/tasks.py | 342 ---- .../{tunnel/pdio => models/tasks}/__init__.py | 18 +- odps/models/tasks/copy.py | 71 + odps/models/tasks/core.py | 197 +++ odps/models/tasks/maxframe.py | 67 + odps/models/tasks/merge.py | 410 +++++ .../__init__.py => models/tasks/misc.py} | 22 +- odps/models/tasks/sql.py | 128 ++ .../tasks/tests/__init__.py} | 10 +- odps/models/tasks/tests/test_merge.py | 133 ++ odps/models/{ => 
tasks}/tests/test_tasks.py | 143 +- odps/models/tenant.py | 22 +- odps/models/tests/test_functions.py | 47 +- odps/models/tests/test_instances.py | 445 +++-- odps/models/tests/test_offline_models.py | 5 +- odps/models/tests/test_partitions.py | 90 +- odps/models/tests/test_projects.py | 6 +- odps/models/tests/test_quotas.py | 33 + odps/models/tests/test_resources.py | 183 ++- odps/models/tests/test_schemas.py | 49 +- odps/models/tests/test_security.py | 40 +- odps/models/tests/test_session.py | 171 +- odps/models/tests/test_storage_tier.py | 2 +- odps/models/tests/test_tableio.py | 481 ++++-- odps/models/tests/test_tables.py | 195 ++- odps/models/tests/test_tenant.py | 2 +- odps/models/tests/test_volumes.py | 134 +- odps/models/tests/test_xflows.py | 55 +- odps/models/volume_ext.py | 70 +- odps/models/volume_fs.py | 220 +-- odps/models/volume_parted.py | 158 +- odps/models/volumes.py | 115 +- odps/models/worker.py | 48 +- odps/models/xflow.py | 4 +- odps/models/xflows.py | 105 +- odps/readers.py | 195 ++- odps/rest.py | 137 +- odps/serializers.py | 241 +-- odps/sqlalchemy_odps.py | 317 +++- odps/src/crc32c_c.pxd | 5 +- odps/src/stringstream.pxd | 1 + odps/src/types_c.pxd | 2 +- odps/src/types_c.pyx | 47 +- odps/src/utils_c.pxd | 2 +- odps/src/utils_c.pyx | 39 +- odps/superset_odps.py | 137 +- odps/tempobj.py | 205 ++- odps/tests/core.py | 122 +- odps/tests/dictconfig.py | 318 ++-- odps/tests/test_accounts.py | 79 +- odps/tests/test_config.py | 52 +- odps/tests/test_crc.py | 6 +- odps/tests/test_dag.py | 78 +- odps/tests/test_errors.py | 55 +- odps/tests/test_inter.py | 35 +- odps/tests/test_serializers.py | 186 ++- odps/tests/test_sqlalchemy_odps.py | 299 ++-- odps/tests/test_superset_odps.py | 22 +- odps/tests/test_tempobjs.py | 132 +- odps/tests/test_types.py | 380 +++-- odps/tests/test_unixsocket.py | 3 +- odps/tests/test_utils.py | 193 ++- odps/tunnel/__init__.py | 10 +- odps/tunnel/base.py | 30 +- odps/tunnel/checksum.py | 28 +- odps/tunnel/checksum_c.pxd | 17 +- odps/tunnel/checksum_c.pyx | 18 +- odps/tunnel/errors.py | 20 +- odps/tunnel/hasher.py | 24 +- odps/tunnel/hasher_c.pxd | 6 +- odps/tunnel/hasher_c.pyx | 12 +- odps/tunnel/instancetunnel.py | 224 ++- odps/tunnel/io/reader.py | 330 +++- odps/tunnel/io/reader_c.pxd | 13 +- odps/tunnel/io/reader_c.pyx | 110 +- odps/tunnel/io/stream.py | 118 +- odps/tunnel/io/types.py | 80 +- odps/tunnel/io/writer.py | 86 +- odps/tunnel/io/writer_c.pxd | 29 +- odps/tunnel/io/writer_c.pyx | 53 +- odps/tunnel/pb/decoder.py | 14 +- odps/tunnel/pb/decoder_c.pxd | 2 +- odps/tunnel/pb/decoder_c.pyx | 9 +- odps/tunnel/pb/encoder.py | 16 +- odps/tunnel/pb/encoder_c.pxd | 21 +- odps/tunnel/pb/encoder_c.pyx | 21 +- odps/tunnel/pb/errors.py | 1 - odps/tunnel/pb/input_stream.py | 33 +- odps/tunnel/pb/output_stream.py | 41 +- odps/tunnel/pb/util_c.pxi | 10 +- odps/tunnel/pb/wire_format.py | 33 +- odps/tunnel/pdio/block_decoder_c.pxd | 54 - odps/tunnel/pdio/block_decoder_c.pyx | 207 --- odps/tunnel/pdio/block_encoder_c.pxd | 45 - odps/tunnel/pdio/block_encoder_c.pyx | 206 --- odps/tunnel/pdio/pdreader_c.pxd | 63 - odps/tunnel/pdio/pdreader_c.pyx | 390 ----- odps/tunnel/pdio/pdwriter.py | 45 - odps/tunnel/pdio/pdwriter_c.pxd | 74 - odps/tunnel/pdio/pdwriter_c.pyx | 446 ----- odps/tunnel/pdio/util_c.pxd | 29 - odps/tunnel/tabletunnel.py | 547 ++++--- odps/tunnel/tests/test_arrow_tabletunnel.py | 164 +- odps/tunnel/tests/test_hasher.py | 98 +- odps/tunnel/tests/test_instancetunnel.py | 155 +- odps/tunnel/tests/test_pb.py | 63 +- odps/tunnel/tests/test_pdio.py | 
258 --- odps/tunnel/tests/test_streamio.py | 62 +- odps/tunnel/tests/test_tabletunnel.py | 562 +++++-- odps/tunnel/tests/test_volumetunnel.py | 24 +- odps/tunnel/volumetunnel.py | 536 +++--- odps/types.py | 604 ++++--- odps/udf/__init__.py | 5 +- odps/udf/runtime.py | 50 +- odps/udf/tests/test_executioncontext.py | 4 +- odps/udf/tests/test_resource.py | 10 +- odps/udf/tests/test_runners.py | 105 ++ odps/udf/tests/test_simple_run.py | 14 +- odps/udf/tests/test_types_py2.py | 43 + odps/udf/tests/test_usercounter.py | 61 +- odps/udf/tests/udf_examples.py | 43 +- odps/udf/tools/runners.py | 280 ++-- odps/udf/tools/utils.py | 39 +- odps/ui/__init__.py | 4 +- odps/ui/common.py | 22 +- odps/ui/progress.py | 170 +- odps/ui/tests/base.py | 56 +- odps/ui/tests/test_ui.py | 17 +- odps/utils.py | 361 ++-- odps_scripts/pyodps_pack.py | 456 ++++-- odps_scripts/pyou.py | 86 +- pyproject.toml | 32 + setup.cfg | 69 + setup.py | 288 ++-- 298 files changed, 18891 insertions(+), 13259 deletions(-) create mode 100644 bin/copyright.py create mode 100644 bin/string_fixer.py delete mode 100644 cupid/io/table/pd.py create mode 100644 docs/source/norm_zh.py delete mode 100644 odps/lab_extension/.eslintignore delete mode 100644 odps/lab_extension/.eslintrc.js delete mode 100644 odps/lab_extension/.gitignore delete mode 100644 odps/lab_extension/.prettierignore delete mode 100644 odps/lab_extension/.prettierrc delete mode 100644 odps/lab_extension/LICENSE delete mode 100644 odps/lab_extension/MANIFEST.in delete mode 100644 odps/lab_extension/README.md delete mode 100644 odps/lab_extension/install.json delete mode 100644 odps/lab_extension/package.json delete mode 100644 odps/lab_extension/pyodps-lab-extension/_version.py delete mode 100644 odps/lab_extension/pyproject.toml delete mode 100644 odps/lab_extension/setup.py delete mode 100644 odps/lab_extension/src/Container/index.ts delete mode 100644 odps/lab_extension/src/DataVisualization/RenderTableSchema.tsx delete mode 100644 odps/lab_extension/src/Editor/ConfigBuilder/EditorConfigBuilder.ts delete mode 100644 odps/lab_extension/src/Editor/ConfigBuilder/LSPEditorConfigBuilder.ts delete mode 100644 odps/lab_extension/src/Editor/ConfigBuilder/MonacoEditorConfigBuilder.ts delete mode 100644 odps/lab_extension/src/Editor/SqlEditor.tsx delete mode 100644 odps/lab_extension/src/Editor/Template.ts delete mode 100644 odps/lab_extension/src/Register/RegisterDataVisualization.ts delete mode 100644 odps/lab_extension/src/Register/RegisterSelectCellType.ts delete mode 100644 odps/lab_extension/src/Register/RegisterSqlEditorReverser.ts delete mode 100644 odps/lab_extension/src/Utils/injectCDN.ts delete mode 100644 odps/lab_extension/src/Utils/isInner.ts delete mode 100644 odps/lab_extension/src/global.d.ts delete mode 100644 odps/lab_extension/src/index.ts delete mode 100644 odps/lab_extension/style/base.css delete mode 100644 odps/lab_extension/style/index.css delete mode 100644 odps/lab_extension/style/index.js delete mode 100644 odps/lab_extension/tsconfig.json create mode 100644 odps/models/quota.py create mode 100644 odps/models/quotas.py delete mode 100644 odps/models/tasks.py rename odps/{tunnel/pdio => models/tasks}/__init__.py (66%) create mode 100644 odps/models/tasks/copy.py create mode 100644 odps/models/tasks/core.py create mode 100644 odps/models/tasks/maxframe.py create mode 100644 odps/models/tasks/merge.py rename odps/{lab_extension/pyodps-lab-extension/__init__.py => models/tasks/misc.py} (50%) create mode 100644 odps/models/tasks/sql.py rename 
odps/{tunnel/pdio/errno.py => models/tasks/tests/__init__.py} (72%) create mode 100644 odps/models/tasks/tests/test_merge.py rename odps/models/{ => tasks}/tests/test_tasks.py (62%) create mode 100644 odps/models/tests/test_quotas.py delete mode 100644 odps/tunnel/pdio/block_decoder_c.pxd delete mode 100644 odps/tunnel/pdio/block_decoder_c.pyx delete mode 100644 odps/tunnel/pdio/block_encoder_c.pxd delete mode 100644 odps/tunnel/pdio/block_encoder_c.pyx delete mode 100644 odps/tunnel/pdio/pdreader_c.pxd delete mode 100644 odps/tunnel/pdio/pdreader_c.pyx delete mode 100644 odps/tunnel/pdio/pdwriter.py delete mode 100644 odps/tunnel/pdio/pdwriter_c.pxd delete mode 100644 odps/tunnel/pdio/pdwriter_c.pyx delete mode 100644 odps/tunnel/pdio/util_c.pxd delete mode 100644 odps/tunnel/tests/test_pdio.py create mode 100644 odps/udf/tests/test_runners.py create mode 100644 odps/udf/tests/test_types_py2.py create mode 100644 setup.cfg diff --git a/License b/License index 3c350891..26f99759 100644 --- a/License +++ b/License @@ -233,7 +233,7 @@ BSD 3-Clause BSD 2-Clause ------------ -- python-tblib:1.3.2 +- python-tblib:3.0.0 MIT License diff --git a/MANIFEST.in b/MANIFEST.in index dd3a1f1a..e87f9a54 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,4 @@ recursive-include odps/static *.* -recursive-include odps/internal/static *.* prune odps/static/ui/node_modules include requirements.txt global-include odps/**/*.yml diff --git a/benchmarks/perf_storage_api_arrow.py b/benchmarks/perf_storage_api_arrow.py index 06048f60..d15a1333 100644 --- a/benchmarks/perf_storage_api_arrow.py +++ b/benchmarks/perf_storage_api_arrow.py @@ -14,12 +14,13 @@ import logging import sys -import time import threading +import time import pytest from odps.apis.storage_api.conftest import storage_api_client # noqa: F401 + if sys.version_info[0] == 3: from odps.apis.storage_api import * else: @@ -87,7 +88,10 @@ def test_read_thread(storage_api_client): global global_total_record read_performance_threads = [] for i in range(0, thread_num): - read_performance_thread = threading.Thread(target=read_performance, args=[storage_api_client,]) + read_performance_thread = threading.Thread( + target=read_performance, + args=[storage_api_client], + ) read_performance_threads.append(read_performance_thread) start = time.time() @@ -104,7 +108,10 @@ def test_read_thread(storage_api_client): time.sleep(1) now = time.time() now_count = global_total_record - logger.info("index: %d, read, %f records per second" % (count, (now_count - start_count) / (now - start))) + logger.info( + "index: %d, read, %f records per second" + % (count, (now_count - start_count) / (now - start)) + ) if judge and cal_count < 5: cal_total_count += (now_count - start_count) / (now - start) diff --git a/benchmarks/perf_tabletunnel.py b/benchmarks/perf_tabletunnel.py index d40e0dbb..bd12151b 100644 --- a/benchmarks/perf_tabletunnel.py +++ b/benchmarks/perf_tabletunnel.py @@ -15,6 +15,7 @@ # limitations under the License. 
from __future__ import print_function + import cProfile import json import os @@ -26,12 +27,14 @@ if bool(json.loads(os.getenv("FORCE_PY", "0"))): from odps import options + options.force_py = True +from datetime import datetime + from odps.compat import Decimal from odps.conftest import odps, tunnel # noqa: F401 from odps.models import TableSchema -from datetime import datetime # remember to reset False before committing ENABLE_PROFILE = bool(json.loads(os.getenv("ENABLE_PROFILE", "0"))) @@ -40,14 +43,16 @@ COMPRESS_DATA = True BUFFER_SIZE = 1024 * 1024 DATA_AMOUNT = 100000 -STRING_LITERAL = "Soft kitty, warm kitty, little ball of fur; happy kitty, sleepy kitty, purr, purr" +STRING_LITERAL = ( + "Soft kitty, warm kitty, little ball of fur; happy kitty, sleepy kitty, purr, purr" +) NUMERIC_ONLY = bool(json.loads(os.getenv("NUMERIC_ONLY", "0"))) @pytest.fixture def schema(): - fields = ['a', 'b', 'c', 'd', 'e', 'f'] - types = ['bigint', 'double', 'datetime', 'boolean', 'string', 'decimal'] + fields = ["a", "b", "c", "d", "e", "f"] + types = ["bigint", "double", "datetime", "boolean", "string", "decimal"] return TableSchema.from_lists(fields, types) @@ -61,18 +66,18 @@ def profiled(): finally: if ENABLE_PROFILE: if DUMP_PROFILE: - pr.dump_stats('profile.out') + pr.dump_stats("profile.out") p = Stats(pr) p.strip_dirs() - p.sort_stats('time') + p.sort_stats("time") p.print_stats(40) - p.print_callees('types.py:846\(validate_value', 20) - p.print_callees('types.py:828\(_validate_primitive_value', 20) - p.print_callees('tabletunnel.py:185\(write', 20) + p.print_callees("types.py:846\(validate_value", 20) + p.print_callees("types.py:828\(_validate_primitive_value", 20) + p.print_callees("tabletunnel.py:185\(write", 20) def test_write(odps, schema, tunnel): - table_name = 'pyodps_test_tunnel_write_performance' + table_name = "pyodps_test_tunnel_write_performance" odps.create_table(table_name, schema, if_not_exists=True) ss = tunnel.create_upload_session(table_name) r = ss.new_record() @@ -80,33 +85,35 @@ def test_write(odps, schema, tunnel): start = time.time() with ss.open_record_writer(0) as writer, profiled(): for i in range(DATA_AMOUNT): - r[0] = 2**63-1 + r[0] = 2**63 - 1 r[1] = 0.0001 r[2] = datetime(2015, 11, 11) if not NUMERIC_ONLY else None r[3] = True r[4] = STRING_LITERAL if not NUMERIC_ONLY else None - r[5] = Decimal('3.15') if not NUMERIC_ONLY else None + r[5] = Decimal("3.15") if not NUMERIC_ONLY else None writer.write(r) n_bytes = writer.n_bytes - print(n_bytes, 'bytes', float(n_bytes) / 1024 / 1024 / (time.time() - start), 'MiB/s') + print( + n_bytes, "bytes", float(n_bytes) / 1024 / 1024 / (time.time() - start), "MiB/s" + ) ss.commit([0]) odps.delete_table(table_name, if_exists=True) def test_read(odps, schema, tunnel): - table_name = 'pyodps_test_tunnel_read_performance' + table_name = "pyodps_test_tunnel_read_performance" odps.delete_table(table_name, if_exists=True) t = odps.create_table(table_name, schema) def gen_data(): for i in range(DATA_AMOUNT): r = t.new_record() - r[0] = 2 ** 63 - 1 + r[0] = 2**63 - 1 r[1] = 0.0001 r[2] = datetime(2015, 11, 11) if not NUMERIC_ONLY else None r[3] = True r[4] = STRING_LITERAL if not NUMERIC_ONLY else None - r[5] = Decimal('3.15') if not NUMERIC_ONLY else None + r[5] = Decimal("3.15") if not NUMERIC_ONLY else None yield r odps.write_table(t, gen_data()) @@ -119,28 +126,34 @@ def gen_data(): for _ in reader: cnt += 1 n_bytes = reader.n_bytes - print(n_bytes, 'bytes', float(n_bytes) / 1024 / 1024 / (time.time() - start), 'MiB/s') + print( + 
n_bytes, "bytes", float(n_bytes) / 1024 / 1024 / (time.time() - start), "MiB/s" + ) assert DATA_AMOUNT == cnt odps.delete_table(table_name, if_exists=True) def test_buffered_write(odps, schema, tunnel): - table_name = 'test_tunnel_bufferred_write' + table_name = "test_tunnel_bufferred_write" odps.create_table(table_name, schema, if_not_exists=True) ss = tunnel.create_upload_session(table_name) r = ss.new_record() start = time.time() - with ss.open_record_writer(buffer_size=BUFFER_SIZE, compress=COMPRESS_DATA) as writer: + with ss.open_record_writer( + buffer_size=BUFFER_SIZE, compress=COMPRESS_DATA + ) as writer: for i in range(DATA_AMOUNT): - r[0] = 2**63-1 + r[0] = 2**63 - 1 r[1] = 0.0001 r[2] = datetime(2015, 11, 11) if not NUMERIC_ONLY else None r[3] = True r[4] = STRING_LITERAL if not NUMERIC_ONLY else None - r[5] = Decimal('3.15') if not NUMERIC_ONLY else None + r[5] = Decimal("3.15") if not NUMERIC_ONLY else None writer.write(r) n_bytes = writer.n_bytes - print(n_bytes, 'bytes', float(n_bytes) / 1024 / 1024 / (time.time() - start), 'MiB/s') + print( + n_bytes, "bytes", float(n_bytes) / 1024 / 1024 / (time.time() - start), "MiB/s" + ) ss.commit(writer.get_blocks_written()) odps.delete_table(table_name, if_exists=True) diff --git a/benchmarks/perf_types.py b/benchmarks/perf_types.py index e612eeb8..b28bc1f9 100644 --- a/benchmarks/perf_types.py +++ b/benchmarks/perf_types.py @@ -21,60 +21,64 @@ import pytest -from odps.models import TableSchema, Record +from odps.models import Record, TableSchema COMPRESS_DATA = True BUFFER_SIZE = 1024 * 1024 DATA_AMOUNT = 100000 -STRING_LITERAL = "Soft kitty, warm kitty, little ball of fur; happy kitty, sleepy kitty, purr, purr" +STRING_LITERAL = ( + "Soft kitty, warm kitty, little ball of fur; happy kitty, sleepy kitty, purr, purr" +) @pytest.fixture def schema(): pr = cProfile.Profile() pr.enable() - fields = ['bigint', 'double', 'datetime', 'boolean', 'string', 'decimal'] - types = ['bigint', 'double', 'datetime', 'boolean', 'string', 'decimal'] + fields = ["bigint", "double", "datetime", "boolean", "string", "decimal"] + types = ["bigint", "double", "datetime", "boolean", "string", "decimal"] try: - yield TableSchema.from_lists(fields, types) + schema = TableSchema.from_lists(fields, types) + schema.build_snapshot() + yield schema finally: p = Stats(pr) p.strip_dirs() - p.sort_stats('cumtime') + p.sort_stats("cumtime") p.print_stats(40) def test_set_record_field_bigint(schema): r = Record(schema=schema) for i in range(10**6): - r['bigint'] = 2**63-1 + r["bigint"] = 2**63 - 1 def test_set_record_field_double(schema): r = Record(schema=schema) for i in range(10**6): - r['double'] = 0.0001 + r["double"] = 0.0001 def test_set_record_field_boolean(schema): r = Record(schema=schema) for i in range(10**6): - r['boolean'] = False + r["boolean"] = False def test_set_record_field_string(schema): r = Record(schema=schema) for i in range(10**6): - r['string'] = STRING_LITERAL + r["string"] = STRING_LITERAL def test_write_set_record_field_datetime(schema): r = Record(schema=schema) for i in range(10**6): - r['datetime'] = datetime(2016, 1, 1) + r["datetime"] = datetime(2016, 1, 1) def test_set_record_field_decimal(schema): r = Record(schema=schema) for i in range(10**6): - r['decimal'] = Decimal('1.111111') + r["decimal"] = Decimal("1.111111") diff --git a/bin/copyright.py b/bin/copyright.py new file mode 100644 index 00000000..6380e263 --- /dev/null +++ b/bin/copyright.py @@ -0,0 +1,200 @@ +# from 
https://github.com/sbrunner/hooks/blob/280356fe7906110b1a2275c553f2f40343a0e195/sbrunner_hooks/copyright.py +# with minor modifications. +# licensed under BSD 2-Clause "Simplified" license +# Copyright (c) 2022-2024, Stéphane Brunner +"""Update the copyright header of the files.""" + +import argparse +import datetime +import os.path +import re +import subprocess # nosec +import sys +from typing import TYPE_CHECKING, Tuple + +import yaml + +if TYPE_CHECKING: + StrPattern = re.Pattern[str] +else: + StrPattern = re.Pattern + +CURRENT_YEAR = str(datetime.datetime.now().year) + + +def main() -> None: + """Update the copyright header of the files.""" + args_parser = argparse.ArgumentParser("Update the copyright header of the files") + args_parser.add_argument( + "--config", help="The configuration file", default=".github/copyright.yaml" + ) + args_parser.add_argument( + "--required", action="store_true", help="The copyright is required" + ) + args_parser.add_argument( + "--verbose", "-v", action="store_true", help="Verbose mode" + ) + args_parser.add_argument( + "files", nargs=argparse.REMAINDER, help="The files to update" + ) + args = args_parser.parse_args() + + config = {} + if os.path.exists(args.config): + with open(args.config, encoding="utf-8") as config_file: + config = yaml.load(config_file, Loader=yaml.SafeLoader) + + one_date_re = re.compile( + config.get("one_date_re", r" Copyright (?P[0-9]{4})") + ) + two_date_re = re.compile( + config.get("two_date_re", r" Copyright (?P[0-9]{4})-(?P[0-9]{4})") + ) + one_date_format = config.get("one_date_format", " Copyright {year}") + two_date_format = config.get("two_date_format", " Copyright {from}-{to}") + year_re = re.compile(r"^(?P[0-9]{4})-") + license_file = config.get("license_file", "LICENSE") + + success = True + no_git_log = False + for file_name in args.files: + try: + status_str = subprocess.run( # nosec + ["git", "status", "--porcelain", "--", file_name], + check=True, + encoding="utf-8", + stdout=subprocess.PIPE, + ).stdout + if status_str: + used_year = CURRENT_YEAR + if args.verbose: + print(f"File '{file_name}' is not committed.") + else: + if file_name == license_file: + date_str = subprocess.run( # nosec + ["git", "log", "--pretty=format:%ci", "-1"], + check=True, + encoding="utf-8", + stdout=subprocess.PIPE, + ).stdout + else: + date_str = subprocess.run( # nosec + [ + "git", + "log", + "--follow", + "--pretty=format:%ci", + "--", + file_name, + ], + check=True, + encoding="utf-8", + stdout=subprocess.PIPE, + ).stdout + if not date_str: + if args.verbose: + print(f"No log found with git on '{file_name}'.") + else: + if not no_git_log: + print( + f"No log found with git on '{file_name}' (the next messages will be hidden)." 
+ ) + no_git_log = True + used_year = CURRENT_YEAR + else: + if args.verbose: + print(f"File '{file_name}' was committed on '{date_str}'.") + used_year_match = year_re.search(date_str) + assert used_year_match is not None # nosec + used_year = used_year_match.group("year") + except FileNotFoundError: + if not no_git_log: + print("No Git found.") + no_git_log = True + used_year = CURRENT_YEAR + except subprocess.CalledProcessError as error: + print(f"Error with Git on '{file_name}' ({str(error)}).") + used_year = CURRENT_YEAR + + with open(file_name, "r", encoding="utf-8") as file_obj: + content = file_obj.read() + file_success, content = update_file( + content, + used_year, + one_date_re, + two_date_re, + one_date_format, + two_date_format, + file_name, + args.required, + args.verbose, + ) + if not file_success: + success = False + with open(file_name, "w", encoding="utf-8") as file_obj: + file_obj.write(content) + print(f"Fixing copyright in '{file_name}'") + + if not success: + sys.exit(1) + + +def update_file( + content: str, + last_year: str, + one_date_re: StrPattern, + two_date_re: StrPattern, + one_date_format: str, + two_date_format: str, + filename: str = "", + required: bool = False, + verbose: bool = False, + current_year: str = CURRENT_YEAR, +) -> Tuple[bool, str]: + """Update the copyright header of the file content.""" + two_date_match = two_date_re.search(content) + if two_date_match: + if two_date_match.group("from") == two_date_match.group("to"): + if two_date_match.group("from") == current_year: + return False, two_date_re.sub( + one_date_format.format(**{"year": current_year}), content + ) + return ( + False, + two_date_re.sub( + two_date_format.format( + **{"from": two_date_match.group("from"), "to": current_year} + ), + content, + ), + ) + + if two_date_match.group("to") in (last_year, current_year): + return True, content + + return False, two_date_re.sub( + two_date_format.format( + **{"from": two_date_match.group("from"), "to": current_year} + ), + content, + ) + + one_date_match = one_date_re.search(content) + if one_date_match: + copyright_year = one_date_match.group("year") + + if copyright_year == last_year: + return True, content + + return False, one_date_re.sub( + two_date_format.format(**{"from": copyright_year, "to": current_year}), + content, + ) + + if required or verbose: + print(f"No copyright found on '{filename}'.") + return not required, content + + +if __name__ == "__main__": + main() diff --git a/bin/string_fixer.py b/bin/string_fixer.py new file mode 100644 index 00000000..7cc65083 --- /dev/null +++ b/bin/string_fixer.py @@ -0,0 +1,105 @@ +# from https://github.com/pre-commit/pre-commit-hooks/blob/f27ee318d2388b6e19ddc3e5281b5f09e261bcaf/pre_commit_hooks/string_fixer.py +# with minor modifications. 
+# licensed under MIT license +# Copyright (c) 2014, Anthony Sottile, Ken Struys +from __future__ import annotations + +import argparse +import io +import re +import sys +import tokenize +from typing import Sequence + +if sys.version_info >= (3, 12): # pragma: >=3.12 cover + FSTRING_START = tokenize.FSTRING_START + FSTRING_END = tokenize.FSTRING_END +else: # pragma: <3.12 cover + FSTRING_START = FSTRING_END = -1 + +START_QUOTE_RE = re.compile("^[a-zA-Z]*['\"]") + + +def handle_match(token_text: str, replace_single: bool = False) -> str: + if '"""' in token_text or "'''" in token_text: + return token_text + + match = START_QUOTE_RE.match(token_text) + if match is not None: + meat = token_text[match.end():-1] + if '"' in meat or "'" in meat: + return token_text + elif replace_single: + return match.group().replace("'", '"') + meat + '"' + else: + return match.group().replace('"', "'") + meat + "'" + else: + return token_text + + +def get_line_offsets_by_line_no(src: str) -> list[int]: + # Padded so we can index with line number + offsets = [-1, 0] + for line in src.splitlines(True): + offsets.append(offsets[-1] + len(line)) + return offsets + + +def fix_strings(filename: str, replace_single: bool = False) -> int: + with open(filename, encoding='UTF-8', newline='') as f: + contents = f.read() + line_offsets = get_line_offsets_by_line_no(contents) + + # Basically a mutable string + splitcontents = list(contents) + + fstring_depth = 0 + + # Iterate in reverse so the offsets are always correct + tokens_l = list(tokenize.generate_tokens(io.StringIO(contents).readline)) + tokens = reversed(tokens_l) + for token_type, token_text, (srow, scol), (erow, ecol), _ in tokens: + if token_type == FSTRING_START: # pragma: >=3.12 cover + fstring_depth += 1 + elif token_type == FSTRING_END: # pragma: >=3.12 cover + fstring_depth -= 1 + elif fstring_depth == 0 and token_type == tokenize.STRING: + new_text = handle_match(token_text, replace_single=replace_single) + splitcontents[ + line_offsets[srow] + scol: + line_offsets[erow] + ecol + ] = new_text + + new_contents = ''.join(splitcontents) + if contents != new_contents: + with open(filename, 'w', encoding='UTF-8', newline='') as f: + f.write(new_contents) + return 1 + else: + return 0 + + +def main(argv: Sequence[str] | None = None) -> int: + parser = argparse.ArgumentParser() + parser.add_argument('filenames', nargs='*', help='Filenames to fix') + parser.add_argument( + '--replace-single', + action='store_true', + default=False, + help='Replace single quotes into double quotes', + ) + args = parser.parse_args(argv) + + retv = 0 + + for filename in args.filenames: + return_value = fix_strings(filename, replace_single=args.replace_single) + if return_value != 0: + print(f'Fixing strings in {filename}') + retv |= return_value + + return retv + + +if __name__ == '__main__': + raise SystemExit(main()) diff --git a/cupid/io/table/__init__.py b/cupid/io/table/__init__.py index 90669bfa..d5cbd953 100644 --- a/cupid/io/table/__init__.py +++ b/cupid/io/table/__init__.py @@ -14,8 +14,3 @@ from .core import CupidTableUploadSession, CupidTableDownloadSession, TableSplit from .record import CupidRecordReader, CupidRecordWriter - -try: - from .pd import CupidPandasReader, CupidPandasWriter -except ImportError: - pass diff --git a/cupid/io/table/core.py b/cupid/io/table/core.py index 04ec1ebc..a9e9b013 100644 --- a/cupid/io/table/core.py +++ b/cupid/io/table/core.py @@ -146,14 +146,6 @@ def open_record_reader(self): logger.debug('Obtained schema: %s', schema) return 
context.channel_client.create_record_reader(read_iter, schema) - def open_pandas_reader(self): - from ...runtime import context - context = context() - - read_iter, schema = self._register_reader() - logger.debug('Obtained schema: %s', schema) - return context.channel_client.create_pandas_reader(read_iter, schema) - def open_arrow_file_reader(self): from ...runtime import context import pyarrow as pa @@ -197,9 +189,6 @@ def splits(self): def open_record_reader(self, split_id=0): return self._splits[split_id].open_record_reader() - def open_pandas_reader(self, split_id=0): - return self._splits[split_id].open_pandas_reader() - class BlockWriter(object): __slots__ = '_table_name', '_project_name', '_table_schema', '_partition_spec', '_block_id', '_handle' @@ -275,9 +264,6 @@ def open_arrow_writer(self, partition=None): def open_record_writer(self, partition=None): return self._open_writer(partition=partition or self._partition_spec, create_method='create_record_writer') - def open_pandas_writer(self, partition=None): - return self._open_writer(partition=partition or self._partition_spec, create_method='create_pandas_writer') - def commit(self): channel = SandboxRpcChannel() stub = subprocess_pb.CupidSubProcessService_Stub(channel) diff --git a/cupid/io/table/pd.py b/cupid/io/table/pd.py deleted file mode 100644 index 90262883..00000000 --- a/cupid/io/table/pd.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import ctypes -import threading -import multiprocessing -import logging - -from odps import options -from odps.tunnel.pdio import TunnelPandasReader, BasePandasWriter - -from cupid.errors import SubprocessStreamEOFError - -import numpy as np -del np - -logger = logging.getLogger(__name__) - - -class CupidPandasReader(TunnelPandasReader): - def __init__(self, schema, input_stream, columns=None): - if isinstance(input_stream, tuple): - self._refill_data = input_stream - input_stream = None - else: - self._refill_data = None - - super(CupidPandasReader, self).__init__(schema, input_stream, columns=columns) - - self._input_stream = input_stream - self._table_schema = schema - self._input_columns = columns - self._stream_eof = False - self._closed = False - - def to_forkable(self): - return type(self)(self._table_schema, self._build_refill_data(), self._input_columns) - - def __repr__(self): - cls = type(self) - if self._refill_data is not None: - return '<%s.%s (slave) at 0x%x>' % (cls.__module__, cls.__name__, id(self)) - else: - return '<%s.%s at 0x%x>' % (cls.__module__, cls.__name__, id(self)) - - def _build_refill_data(self): - if self._input_stream is None: - return self._refill_data - if self._refill_data is not None: - return self._refill_data - - from multiprocessing.sharedctypes import RawArray - req_queue = multiprocessing.Queue() - rep_queue = multiprocessing.Queue() - buf = RawArray(ctypes.c_char, options.cupid.mp_buffer_size) - - def _mp_thread(): - try: - while True: - req_body = req_queue.get(timeout=60) - if req_body is None: - return - left_size, bound = req_body - try: - buf[:left_size] = buf[bound - left_size:bound] - read_size = self._input_stream.readinto(buf, left_size) - except SubprocessStreamEOFError: - return - rep_queue.put(read_size) - finally: - rep_queue.put(-1) - self.close() - - stream_thread = threading.Thread(target=_mp_thread) - stream_thread.daemon = True - stream_thread.start() - self._refill_data = (buf, req_queue, rep_queue) - return self._refill_data - - def refill_cache(self): - if self._refill_data is None: - return super(CupidPandasReader, self).refill_cache() - if self._stream_eof or self._closed: - return 0 - - buf, req_queue, rep_queue = self._refill_data - left_size = self.mem_cache_bound - self.row_mem_ptr - req_queue.put((left_size, self.mem_cache_bound)) - read_size = rep_queue.get(timeout=60) - if read_size <= 0: - self._stream_eof = True - self.close() - return 0 - self.reset_positions(buf, read_size + left_size) - return read_size - - def close(self): - super(CupidPandasReader, self).close() - if self._input_stream is None and self._refill_data: - buf, req_queue, rep_queue = self._refill_data - req_queue.put(None) - self._closed = True - - -class CupidPandasWriter(BasePandasWriter): - def __init__(self, schema, output_stream): - super(CupidPandasWriter, self).__init__(schema, output_stream) - self._stream = output_stream - self._block_id = None - self._partition_spec = None - self._table_schema = schema - - @property - def block_id(self): - return self._block_id - - @property - def partition_spec(self): - return self._partition_spec - - def write_stream(self, data, length): - self._stream.write(data, length) - - def close(self): - super(CupidPandasWriter, self).close() - - # sync by get result - result = self._stream.result() - logger.debug('Result fetched on writer close: %s', result) - - self._stream.close() diff --git a/cupid/runtime/ctypes_libs.py b/cupid/runtime/ctypes_libs.py index a7dc4697..56a8e7d1 100644 --- 
a/cupid/runtime/ctypes_libs.py +++ b/cupid/runtime/ctypes_libs.py @@ -474,25 +474,3 @@ def create_arrow_writer(self, label): create_table_reader = create_record_reader create_table_writer = create_record_writer - - def create_pandas_reader(self, label, schema, columns=None): - try: - from ..io.table import CupidPandasReader - except ImportError: - return None - - params = json.dumps(dict(type='ReadByLabel', label=label)) - stream = self.create_file_reader('createTableInputStream', params.encode()) - reader = CupidPandasReader(schema, stream, columns=columns) - return reader - - def create_pandas_writer(self, label, schema): - try: - from ..io.table import CupidPandasWriter - except ImportError: - return None - - params = json.dumps(dict(type='WriteByLabel', label=label)) - stream = self.create_file_writer('createTableOutputStream', params.encode()) - writer = CupidPandasWriter(schema, stream) - return writer diff --git a/docs/Makefile b/docs/Makefile index 8fd58eba..c62d0023 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -161,6 +161,7 @@ info: gettext: $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale $(SPHINXINTL) update -p $(BUILDDIR)/locale $(I18NSPHINXLANGS) + python $(SOURCEDIR)/norm_zh.py @echo @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." diff --git a/docs/source/api-def.rst b/docs/source/api-def.rst index a261a1c7..f137d128 100644 --- a/docs/source/api-def.rst +++ b/docs/source/api-def.rst @@ -5,6 +5,7 @@ Definitions .. autoclass:: odps.ODPS :members: + :exclude-members: attach_session, create_session, default_session .. autoclass:: odps.models.Project :members: diff --git a/docs/source/base-schemas.rst b/docs/source/base-schemas.rst index 925f6aa7..8e76f42d 100644 --- a/docs/source/base-schemas.rst +++ b/docs/source/base-schemas.rst @@ -74,6 +74,13 @@ Schema 基本操作 for table in o.list_tables(schema='test_schema'): print(table) +下列方法给出了如何从 ``test_schema`` 获取表 ``dual`` 并输出表结构: + +.. code-block:: python + + table = o.get_table('dual', schema='test_schema') + print(table.table_schema) + 在执行 SQL 时,可以指定默认 Schema: .. code-block:: python @@ -81,13 +88,18 @@ Schema 基本操作 o.execute_sql("SELECT * FROM dual", default_schema="test_schema") 对于表而言,如果项目空间没有启用 Schema,``get_table`` 方法对于 ``x.y`` 形式的表名,默认按照 -``project.table`` 处理。如果当前租户开启了 ``odps.namespace.schema`` 配置,``get_table`` -会将 ``x.y`` 作为 ``schema.table`` 处理,否则依然按照 ``project.table`` 处理。如果租户上 -没有配置该选项,可以配置 ``options.always_enable_schema = True``,此后所有 ``x.y`` +``project.table`` 处理。如果当前租户开启了\ `租户级语法开关 `_\ ,\ +``get_table`` 会将 ``x.y`` 作为 ``schema.table`` 处理,否则依然按照 ``project.table`` +处理。如果租户上没有配置该选项,可以配置 ``options.enable_schema = True``,此后所有 ``x.y`` 都将被作为 ``schema.table`` 处理: .. code-block:: python from odps import options - options.always_enable_schema = True + options.enable_schema = True print(o.get_table("myschema.mytable")) + +.. note:: + + ``options.enable_schema`` 自 PyODPS 0.12.0 开始支持,低版本 PyODPS 需要使用 + ``options.always_enable_schema``。 diff --git a/docs/source/base-sql.rst b/docs/source/base-sql.rst index fe1b42b3..df1465b7 100644 --- a/docs/source/base-sql.rst +++ b/docs/source/base-sql.rst @@ -120,19 +120,23 @@ MCQA Instance,你需要自行等待 Instance 完成。需要注意的是,该 .. code-block:: python - >>> o.execute_sql('select * from pyodps_iris', hints={'odps.sql.mapper.split.size': 16}) + >>> hints = {'odps.stage.mapper.split.size': 16, 'odps.sql.reducer.instances': 1024} + >>> o.execute_sql('select * from pyodps_iris', hints=hints) 我们可以对于全局配置设置sql.settings后,每次运行时则都会添加相关的运行时参数。 .. 
code-block:: python >>> from odps import options - >>> options.sql.settings = {'odps.sql.mapper.split.size': 16} + >>> options.sql.settings = { + >>> 'odps.stage.mapper.split.size': 16, + >>> 'odps.sql.reducer.instances': 1024, + >>> } >>> o.execute_sql('select * from pyodps_iris') # 会根据全局配置添加hints .. _read_sql_exec_result: -读取SQL执行结果 +读取 SQL 执行结果 --------------- 运行 SQL 的 instance 能够直接执行 ``open_reader`` 的操作,一种情况是SQL返回了结构化的数据。 @@ -168,13 +172,13 @@ MCQA Instance,你需要自行等待 Instance 完成。需要注意的是,该 PyODPS 默认不限制能够从 Instance 读取的数据规模,但 Project Owner 可能在 MaxCompute Project 上增加保护设置以限制对 Instance 结果的读取,此时只能使用受限读取模式读取数据,在此模式下可读取的行数受到 Project 配置限制,通常为 10000 行。如果 -PyODPS 检测到读取 Instance 数据被限制,且 `options.tunnel.limit_instance_tunnel` 未设置,会自动启用受限读取模式。 -如果你的 Project 被保护,想要手动启用受限读取模式,可以为 `open_reader` 方法增加 `limit=True` 选项,或者设置 -`options.tunnel.limit_instance_tunnel = True` 。 +PyODPS 检测到读取 Instance 数据被限制,且 ``options.tunnel.limit_instance_tunnel`` 未设置,会自动启用受限读取模式。 +如果你的 Project 被保护,想要手动启用受限读取模式,可以为 ``open_reader`` 方法增加 ``limit=True`` 选项,或者设置 +``options.tunnel.limit_instance_tunnel = True`` 。 -在部分环境中,例如 DataWorks,`options.tunnel.limit_instance_tunnel` 可能默认被置为 True。此时,如果需要读取所有数据,需要为 -`open_reader` 增加参数 `tunnel=True, limit=False` 。需要注意的是,如果 Project 本身被保护,这两个参数 **不能** -解除保护,此时应联系 Project Owner 开放相应的读权限。 +在部分环境中,例如 DataWorks,``options.tunnel.limit_instance_tunnel`` 可能默认被置为 True。此时,如果需要读取\ +所有数据,需要为 ``open_reader`` 增加参数 `tunnel=True, limit=False` 。需要注意的是,如果 Project 本身被保护,\ +这两个参数\ **不能**\ 解除保护,此时应联系 Project Owner 开放相应的读权限。 如果你所使用的 MaxCompute 只能支持旧 Result 接口,同时你需要读取所有数据,可将 SQL 结果写入另一张表后用读表接口读取 (可能受到 Project 安全设置的限制)。 @@ -190,7 +194,7 @@ PyODPS 检测到读取 Instance 数据被限制,且 `options.tunnel.limit_inst .. _sql_to_pandas_mp: -如果需要使用多核加速读取速度,可以通过 `n_process` 指定使用进程数: +如果需要使用多核加速读取速度,可以通过 ``n_process`` 指定使用进程数: .. code-block:: python @@ -202,9 +206,29 @@ PyODPS 检测到读取 Instance 数据被限制,且 `options.tunnel.limit_inst .. note:: - 目前 Instance 结果暂不支持使用 Arrow 格式读取。 + 从 2024 年年末开始,MaxCompute 服务将支持离线 SQL 任务 ``open_reader`` 使用与表类似的 Arrow + 接口,MCQA 作业暂不支持。在此之前,使用 ``Instance.open_reader(arrow=True)`` 读取数据将报错。 -设置alias +从 PyODPS 0.12.0 开始,你也可以直接调用 Instance 上的 ``to_pandas`` 方法直接将数据转换为 pandas。\ +你可以指定转换为 pandas 的起始行号和行数,若不指定则读取所有数据。该方法也支持 ``limit`` 参数,具体定义\ +与 ``open_reader`` 方法相同。该方法默认会使用 Arrow 格式读取,并转换为 pandas。如果 Arrow 格式不被\ +支持,将会回退到 Record 接口。 + +.. code-block:: python + + >>> inst = o.execute_sql('select * from dual') + >>> pd_df = inst.to_pandas(start=10, count=20) + +与表类似,从 PyODPS 0.12.0 开始,你也可以使用 Instance 上的 ``iter_pandas`` 方法按多个批次读取 +pandas DataFrame,参数与 ``Table.iter_pandas`` 类似。 + +.. code-block:: python + + >>> inst = o.execute_sql('select * from dual') + >>> for batch in inst.iter_pandas(start=0, count=1000, batch_size=100): + >>> print(batch) + +设置 alias ------------ 有时在运行时,比如某个UDF引用的资源是动态变化的,我们可以alias旧的资源名到新的资源,这样免去了重新删除并重新创建UDF的麻烦。 diff --git a/docs/source/base-sqlalchemy.rst b/docs/source/base-sqlalchemy.rst index d031d0a3..32f5c5c9 100644 --- a/docs/source/base-sqlalchemy.rst +++ b/docs/source/base-sqlalchemy.rst @@ -10,7 +10,7 @@ PyODPS 支持集成 SQLAlchemy,可以使用 SQLAlchemy 查询 MaxCompute 数 创建连接 ----------- -创建连接可以在连接串中指定 ``access_id``、``access_key`` 和 ``project`` 等。 +创建连接可以在连接字符串中指定 ``access_id``、``access_key`` 和 ``project`` 等。 .. 
code-block:: python @@ -19,23 +19,20 @@ PyODPS 支持集成 SQLAlchemy,可以使用 SQLAlchemy 查询 MaxCompute 数 # 确保 ALIBABA_CLOUD_ACCESS_KEY_ID 环境变量设置为用户 Access Key ID, # ALIBABA_CLOUD_ACCESS_KEY_SECRET 环境变量设置为用户 Access Key Secret, - # 不建议直接使用 Access Key ID / Access Key Secret 字符串 + # 不建议直接使用 Access Key ID / Access Key Secret 字符串,下同 conn_string = 'odps://%s:%s@' % ( os.getenv('ALIBABA_CLOUD_ACCESS_KEY_ID'), os.getenv('ALIBABA_CLOUD_ACCESS_KEY_SECRET'), ) engine = create_engine(conn_string) -要在连接串中指定 ``endpoint``,可以按如下方式: +要在连接字符串中指定 ``endpoint``,可以按如下方式: .. code-block:: python import os from sqlalchemy import create_engine - # 确保 ALIBABA_CLOUD_ACCESS_KEY_ID 环境变量设置为用户 Access Key ID, - # ALIBABA_CLOUD_ACCESS_KEY_SECRET 环境变量设置为用户 Access Key Secret, - # 不建议直接使用 Access Key ID / Access Key Secret 字符串 conn_string = 'odps://%s:%s@/?endpoint=' % ( os.getenv('ALIBABA_CLOUD_ACCESS_KEY_ID'), os.getenv('ALIBABA_CLOUD_ACCESS_KEY_SECRET'), @@ -44,7 +41,7 @@ PyODPS 支持集成 SQLAlchemy,可以使用 SQLAlchemy 查询 MaxCompute 数 这里把 ```` 等替换成相应的账号。 -对于已有的 ODPS 对象 ``o`` ,调用 ``o.to_global()`` 设为全局账号后,在连接串中就不需要指定了。 +对于已有的 ODPS 对象 ``o`` ,调用 ``o.to_global()`` 设为全局账号后,在连接字符串中就不需要指定了。 .. code-block:: python @@ -66,9 +63,6 @@ PyODPS 支持集成 SQLAlchemy,可以使用 SQLAlchemy 查询 MaxCompute 数 from odps import options from sqlalchemy import create_engine - # 确保 ALIBABA_CLOUD_ACCESS_KEY_ID 环境变量设置为用户 Access Key ID, - # ALIBABA_CLOUD_ACCESS_KEY_SECRET 环境变量设置为用户 Access Key Secret, - # 不建议直接使用 Access Key ID / Access Key Secret 字符串 conn_string = 'odps://%s:%s@/?endpoint=' % ( os.getenv('ALIBABA_CLOUD_ACCESS_KEY_ID'), os.getenv('ALIBABA_CLOUD_ACCESS_KEY_SECRET'), @@ -83,9 +77,6 @@ PyODPS 支持集成 SQLAlchemy,可以使用 SQLAlchemy 查询 MaxCompute 数 import os from sqlalchemy import create_engine - # 确保 ALIBABA_CLOUD_ACCESS_KEY_ID 环境变量设置为用户 Access Key ID, - # ALIBABA_CLOUD_ACCESS_KEY_SECRET 环境变量设置为用户 Access Key Secret, - # 不建议直接使用 Access Key ID / Access Key Secret 字符串 conn_string = 'odps://%s:%s@/?endpoint=&odps.sql.hive.compatible=true' % ( os.getenv('ALIBABA_CLOUD_ACCESS_KEY_ID'), os.getenv('ALIBABA_CLOUD_ACCESS_KEY_SECRET'), @@ -94,6 +85,22 @@ PyODPS 支持集成 SQLAlchemy,可以使用 SQLAlchemy 查询 MaxCompute 数 使用上述方式时,每个 engine 对象都会拥有不同的选项。 +部分商业智能引擎(例如 Apache Superset)可能会频繁列举 MaxCompute 表等对象,这可能会带来较大的延迟。\ +如果你在数据分析过程中对新增的 MaxCompute 对象不敏感,在 PyODPS 0.12.0 及以上版本中可以考虑为连接字符串\ +增加 ``cache_names=true`` 选项以启用对象名缓存,并可指定缓存超时的时间 ``cache_seconds=<超时秒数>`` +(默认为 24 * 3600)。下面的例子开启缓存并将缓存超时时间设定为 1200 秒。 + +.. code-block:: python + + import os + from sqlalchemy import create_engine + + conn_string = 'odps://%s:%s@/?endpoint=&cache_names=true&cache_seconds=1200' % ( + os.getenv('ALIBABA_CLOUD_ACCESS_KEY_ID'), + os.getenv('ALIBABA_CLOUD_ACCESS_KEY_SECRET'), + ) + engine = create_engine(conn_string) + 调用 SQLAlchemy 接口 ---------------------- diff --git a/docs/source/base-tables.rst b/docs/source/base-tables.rst index 672d209f..60fd09cf 100644 --- a/docs/source/base-tables.rst +++ b/docs/source/base-tables.rst @@ -184,7 +184,7 @@ PyODPS 0.11.5 及后续版本中,可以为 ``list_tables`` 添加 ``extended=T 读写数据 -------- -行记录Record +行记录 Record ~~~~~~~~~~~~~ Record表示表的一行记录,我们在 Table 对象上调用 new_record 就可以创建一个新的 Record。 @@ -251,30 +251,76 @@ Record表示表的一行记录,我们在 Table 对象上调用 new_record 就 >>> for record in o.read_table('test_table', partition='pt=test,pt2=test2'): >>> # 处理一条记录 -直接读取成 Pandas DataFrame: +从 0.11.2 开始,PyODPS 支持使用 `https://arrow.apache.org/ `_ 格式读写数据,该格式可以以更高\ +效率与 pandas 等格式互相转换。安装 pyarrow 后,在调用 ``open_reader`` 时增加 ``arrow=True`` 参数,即可按 +`https://arrow.apache.org/docs/python/data.html#record-batches `_ +格式读取表内容。 .. 
code-block:: python - >>> with t.open_reader(partition='pt=test,pt2=test2') as reader: + >>> with t.open_reader(partition='pt=test,pt2=test2', arrow=True) as reader: + >>> count = reader.count + >>> for batch in reader: # 可以执行多次,直到将所有 RecordBatch 读完 + >>> # 处理一个 RecordBatch,例如转换为 Pandas + >>> print(batch.to_pandas()) + +你也可以直接调用 reader 上的 ``to_pandas`` 方法直接从 reader 获取 pandas DataFrame。 +读取时,可以指定起始行号(从0开始)和行数。如果不指定,则默认读取所有数据。 + +.. code-block:: python + + >>> with t.open_reader(partition='pt=test,pt2=test2', arrow=True) as reader: + >>> # 指定起始行号和行数 + >>> pd_df = reader.to_pandas(start=10, count=20) + >>> # 如不指定,则读取所有数据 >>> pd_df = reader.to_pandas() .. _table_to_pandas_mp: -利用多进程加速读取: +你可以利用多进程加速读取 Pandas DataFrame: .. code-block:: python >>> import multiprocessing >>> n_process = multiprocessing.cpu_count() - >>> with t.open_reader(partition='pt=test,pt2=test2') as reader: + >>> with t.open_reader(partition='pt=test,pt2=test2', arrow=True) as reader: >>> pd_df = reader.to_pandas(n_process=n_process) +为方便读取数据为 pandas,从 PyODPS 0.12.0 开始,Table 和 Partition 对象支持直接调用 ``to_pandas`` +方法。 + +.. code-block:: python + + >>> # 将表读取为 pandas DataFrame + >>> pd_df = table.to_pandas(start=10, count=20) + >>> # 通过2个进程读取所有数据 + >>> pd_df = table.to_pandas(n_process=2) + >>> # 将分区读取为 pandas + >>> pd_df = partitioned_table.to_pandas(partition="pt=test", start=10, count=20) + +与此同时,从 PyODPS 0.12.0 开始,你也可以使用 ``iter_pandas`` 方法从一张表或分区按多个批次读取 pandas +DataFrame,并通过 ``batch_size`` 参数指定每次读取的 DataFrame 批次大小,该大小默认值为 +``options.tunnel.read_row_batch_size`` 指定,默认为 1024。 + +.. code-block:: python + + >>> # 以默认 batch_size 读取所有数据 + >>> for batch in table.iter_pandas(): + >>> print(batch) + >>> # 以 batch_size==100 读取前 1000 行数据 + >>> for batch in table.iter_pandas(batch_size=100, start=0, count=1000): + >>> print(batch) + .. note:: - ``open_reader`` 或者 ``read_table`` 方法仅支持读取单个分区。如果需要读取多个分区的值,例如\ - 读取所有符合 ``dt>20230119`` 这样条件的分区,需要使用 ``iterate_partitions`` 方法,详见 + ``open_reader``、``read_table`` 以及 ``to_pandas`` 方法仅支持读取单个分区。如果需要读取多个分区\ + 的值,例如读取所有符合 ``dt>20230119`` 这样条件的分区,需要使用 ``iterate_partitions`` 方法,详见 :ref:`遍历表分区 ` 章节。 +导出数据是否包含分区列的值由输出格式决定。Record 格式数据默认包含分区列的值,而 Arrow 格式默认不包含。\ +从 PyODPS 0.12.0 开始,你可以通过指定 ``append_partitions=True`` 显示引入分区列的值,通过 +``append_partitions=False`` 将分区列排除在结果之外。 + .. _table_write: 向表写数据 @@ -394,6 +440,58 @@ open_writer 创建的 Writer 对象通过 multiprocessing 标准库传递到需 # 等待子进程中的执行完成 [f.get() for f in futures] +从 0.11.2 开始,PyODPS 支持使用 `https://arrow.apache.org/ `_ 格式读写数据,该格式可以以更高\ +效率与 pandas 等格式互相转换。安装 pyarrow 后,在调用 ``open_writer`` 时增加 ``arrow=True`` 参数,即可按 +`https://arrow.apache.org/docs/python/data.html#record-batches `_ +格式写入表内容。PyODPS 也支持直接写入 pandas DataFrame,支持自动转换为 Arrow RecordBatch。 + +.. code-block:: python + + >>> import pandas as pd + >>> import pyarrow as pa + >>> + >>> with t.open_writer(partition='pt=test', create_partition=True, arrow=True) as writer: + >>> records = [[111, 'aaa', True], + >>> [222, 'bbb', False], + >>> [333, 'ccc', True], + >>> [444, '中文', False]] + >>> df = pd.DataFrame(records, columns=["int_val", "str_val", "bool_val"]) + >>> # 写入 RecordBatch + >>> batch = pa.RecordBatch.from_pandas(df) + >>> writer.write(batch) + >>> # 也可以直接写入 Pandas DataFrame + >>> writer.write(df) + +为方便写入 pandas DataFrame,从 0.12.0 开始,PyODPS 支持直接通过 ``write_table`` 方法写入 pandas DataFrame。\ +如果写入数据前对应表不存在,可以增加 ``create_table=True`` 参数以自动创建表。 + +.. 
code-block:: python + + >>> import pandas as pd + >>> df = pd.DataFrame([ + >>> [111, 'aaa', True], + >>> [222, 'bbb', False], + >>> [333, 'ccc', True], + >>> [444, '中文', False] + >>> ], columns=['num_col', 'str_col', 'bool_col']) + >>> # 如果表 test_table 不存在,将会自动创建 + >>> o.write_table('test_table', df, partition='pt=test', create_table=True, create_partition=True) + +从 PyODPS 0.12.0 开始,``write_table`` 方法也支持动态分区,可通过 ``partitions`` 参数传入需要作为分区的列名,\ +并指定 ``create_partition=True``,相应的分区将会自动创建。 + +.. code-block:: python + + >>> import pandas as pd + >>> df = pd.DataFrame([ + >>> [111, 'aaa', True, 'p1'], + >>> [222, 'bbb', False, 'p1'], + >>> [333, 'ccc', True, 'p2'], + >>> [444, '中文', False, 'p2'] + >>> ], columns=['num_col', 'str_col', 'bool_col', 'pt']) + >>> # 如果分区 pt=p1 或 pt=p2 不存在,将会自动创建。 + >>> o.write_table('test_part_table', df, partitions=['pt'], create_partition=True) + 压缩选项 ~~~~~~~~ 为加快数据上传 / 下载速度,你可以在上传 / 下载数据时设置压缩选项。具体地,可以创建一个 ``CompressOption`` @@ -424,44 +522,6 @@ open_writer 创建的 Writer 对象通过 multiprocessing 标准库传递到需 with table.open_writer(compress_algo="zlib") as writer: # 写入数据,此处从略 -.. _table_arrow_io: - -使用 Arrow 格式读写数据 -~~~~~~~~~~~~~~~~~~~~~~~~ -`Apache Arrow `_ 是一种跨语言的通用数据读写格式,支持在各种不同平台间进行数据交换。\ -自2021年起, MaxCompute 支持使用 Arrow 格式读取表数据,PyODPS 则从 0.11.2 版本开始支持该功能。具体地,如果在 -Python 环境中安装 pyarrow 后,在调用 ``open_reader`` 或者 ``open_writer`` 时增加 ``arrow=True`` 参数,即可读写 -`Arrow RecordBatch `_ 。 - -按 RecordBatch 读取表内容: - -.. code-block:: python - - >>> reader = t.open_reader(partition='pt=test', arrow=True) - >>> count = reader.count - >>> for batch in reader: # 可以执行多次,直到将所有 RecordBatch 读完 - >>> # 处理一个 RecordBatch,例如转换为 Pandas - >>> print(batch.to_pandas()) - -写入 RecordBatch: - -.. code-block:: python - - >>> import pandas as pd - >>> import pyarrow as pa - >>> - >>> with t.open_writer(partition='pt=test', create_partition=True, arrow=True) as writer: - >>> records = [[111, 'aaa', True], - >>> [222, 'bbb', False], - >>> [333, 'ccc', True], - >>> [444, '中文', False]] - >>> df = pd.DataFrame(records, columns=["int_val", "str_val", "bool_val"]) - >>> # 写入 RecordBatch - >>> batch = pa.RecordBatch.from_pandas(df) - >>> writer.write(batch) - >>> # 也可以直接写入 Pandas DataFrame - >>> writer.write(df) - 删除表 ------- diff --git a/docs/source/conf.py b/docs/source/conf.py index e67b4363..1119480c 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -12,14 +12,15 @@ # All configuration values have a default; values that are commented out # serve to show the default. -import sys +import atexit +import codecs import os +import re import shutil -import atexit +import sys import tempfile import textwrap -import codecs -import re + try: from sphinx.directives import Include except ImportError: @@ -82,6 +83,7 @@ # # The short X.Y version. from odps import __version__ + version = __version__ # The full version, including alpha/beta/rc tags. 
release = __version__ diff --git a/docs/source/faq.rst b/docs/source/faq.rst index c203e0bf..77617587 100644 --- a/docs/source/faq.rst +++ b/docs/source/faq.rst @@ -116,4 +116,4 @@ PyODPS DataFrame 不支持遍历每行数据。这样设计的原因是由于 Py ``df.memory_usage(deep=True).sum()`` 获得的大小更接近实际内存使用,具体可参考 `这篇 Pandas 文档 `_ 。 -为减小读取数据时的内存开销,可以考虑使用 Arrow 格式,具体可以参考 :ref:`这里 `。 +为减小读取数据时的内存开销,可以考虑使用 Arrow 格式,具体可以参考 :ref:`这里 `。 diff --git a/docs/source/locale/en/LC_MESSAGES/api-def.po b/docs/source/locale/en/LC_MESSAGES/api-def.po index 47c221aa..de7d6961 100644 --- a/docs/source/locale/en/LC_MESSAGES/api-def.po +++ b/docs/source/locale/en/LC_MESSAGES/api-def.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: PyODPS 0.7.16\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2024-07-18 15:20+0800\n" +"POT-Creation-Date: 2024-08-31 12:41+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -47,24 +47,23 @@ msgid "" "automatically." msgstr "" -#: odps.core.ODPS odps.core.ODPS.as_account odps.core.ODPS.attach_session -#: odps.core.ODPS.copy_offline_model odps.core.ODPS.create_external_volume -#: odps.core.ODPS.create_fs_volume odps.core.ODPS.create_function -#: odps.core.ODPS.create_parted_volume odps.core.ODPS.create_resource -#: odps.core.ODPS.create_role odps.core.ODPS.create_schema -#: odps.core.ODPS.create_session odps.core.ODPS.create_table +#: odps.core.ODPS odps.core.ODPS.as_account odps.core.ODPS.copy_offline_model +#: odps.core.ODPS.create_external_volume odps.core.ODPS.create_fs_volume +#: odps.core.ODPS.create_function odps.core.ODPS.create_parted_volume +#: odps.core.ODPS.create_resource odps.core.ODPS.create_role +#: odps.core.ODPS.create_schema odps.core.ODPS.create_table #: odps.core.ODPS.create_user odps.core.ODPS.create_volume_directory -#: odps.core.ODPS.delete_function odps.core.ODPS.delete_offline_model -#: odps.core.ODPS.delete_resource odps.core.ODPS.delete_role -#: odps.core.ODPS.delete_schema odps.core.ODPS.delete_table -#: odps.core.ODPS.delete_user odps.core.ODPS.delete_volume +#: odps.core.ODPS.delete_function odps.core.ODPS.delete_materialized_view +#: odps.core.ODPS.delete_offline_model odps.core.ODPS.delete_resource +#: odps.core.ODPS.delete_role odps.core.ODPS.delete_schema +#: odps.core.ODPS.delete_table odps.core.ODPS.delete_user +#: odps.core.ODPS.delete_view odps.core.ODPS.delete_volume #: odps.core.ODPS.delete_volume_file odps.core.ODPS.delete_volume_partition -#: odps.core.ODPS.delete_xflow odps.core.ODPS.execute_archive_table -#: odps.core.ODPS.execute_merge_files odps.core.ODPS.execute_security_query +#: odps.core.ODPS.delete_xflow odps.core.ODPS.execute_security_query #: odps.core.ODPS.execute_sql odps.core.ODPS.execute_sql_cost -#: odps.core.ODPS.execute_sql_interactive odps.core.ODPS.execute_xflow -#: odps.core.ODPS.exist_function odps.core.ODPS.exist_instance -#: odps.core.ODPS.exist_offline_model odps.core.ODPS.exist_project +#: odps.core.ODPS.execute_xflow odps.core.ODPS.exist_function +#: odps.core.ODPS.exist_instance odps.core.ODPS.exist_offline_model +#: odps.core.ODPS.exist_project odps.core.ODPS.exist_quota #: odps.core.ODPS.exist_resource odps.core.ODPS.exist_role #: odps.core.ODPS.exist_schema odps.core.ODPS.exist_table #: odps.core.ODPS.exist_user odps.core.ODPS.exist_volume @@ -72,28 +71,28 @@ msgstr "" #: odps.core.ODPS.get_function odps.core.ODPS.get_instance #: odps.core.ODPS.get_logview_address odps.core.ODPS.get_offline_model #: odps.core.ODPS.get_project odps.core.ODPS.get_project_policy -#: 
odps.core.ODPS.get_resource odps.core.ODPS.get_role_policy -#: odps.core.ODPS.get_schema odps.core.ODPS.get_security_option -#: odps.core.ODPS.get_security_options odps.core.ODPS.get_table -#: odps.core.ODPS.get_volume odps.core.ODPS.get_volume_file -#: odps.core.ODPS.get_volume_partition odps.core.ODPS.get_xflow -#: odps.core.ODPS.get_xflow_results odps.core.ODPS.get_xflow_sub_instances +#: odps.core.ODPS.get_quota odps.core.ODPS.get_resource +#: odps.core.ODPS.get_role_policy odps.core.ODPS.get_schema +#: odps.core.ODPS.get_security_option odps.core.ODPS.get_security_options +#: odps.core.ODPS.get_table odps.core.ODPS.get_volume +#: odps.core.ODPS.get_volume_file odps.core.ODPS.get_volume_partition +#: odps.core.ODPS.get_xflow odps.core.ODPS.get_xflow_results +#: odps.core.ODPS.get_xflow_sub_instances #: odps.core.ODPS.iter_xflow_sub_instances odps.core.ODPS.list_functions #: odps.core.ODPS.list_instance_queueing_infos odps.core.ODPS.list_instances #: odps.core.ODPS.list_offline_models odps.core.ODPS.list_projects -#: odps.core.ODPS.list_resources odps.core.ODPS.list_role_users -#: odps.core.ODPS.list_roles odps.core.ODPS.list_schemas -#: odps.core.ODPS.list_tables odps.core.ODPS.list_user_roles -#: odps.core.ODPS.list_users odps.core.ODPS.list_volume_files -#: odps.core.ODPS.list_volume_partitions odps.core.ODPS.list_volumes -#: odps.core.ODPS.list_xflows odps.core.ODPS.move_volume_file -#: odps.core.ODPS.open_resource odps.core.ODPS.open_volume_reader -#: odps.core.ODPS.open_volume_writer odps.core.ODPS.read_table -#: odps.core.ODPS.run_archive_table odps.core.ODPS.run_merge_files +#: odps.core.ODPS.list_quotas odps.core.ODPS.list_resources +#: odps.core.ODPS.list_role_users odps.core.ODPS.list_roles +#: odps.core.ODPS.list_schemas odps.core.ODPS.list_tables +#: odps.core.ODPS.list_user_roles odps.core.ODPS.list_users +#: odps.core.ODPS.list_volume_files odps.core.ODPS.list_volume_partitions +#: odps.core.ODPS.list_volumes odps.core.ODPS.list_xflows +#: odps.core.ODPS.move_volume_file odps.core.ODPS.open_resource +#: odps.core.ODPS.open_volume_reader odps.core.ODPS.open_volume_writer #: odps.core.ODPS.run_security_query odps.core.ODPS.run_sql #: odps.core.ODPS.run_xflow odps.core.ODPS.set_project_policy #: odps.core.ODPS.set_role_policy odps.core.ODPS.set_security_option -#: odps.core.ODPS.stop_instance odps.core.ODPS.write_table +#: odps.core.ODPS.stop_instance #: odps.internal.core.InternalODPSMixin.execute_kube_task #: odps.internal.core.InternalODPSMixin.execute_ps #: odps.internal.core.InternalODPSMixin.execute_ps_extract_model @@ -117,13 +116,17 @@ msgstr "" #: odps.models.instance.Instance.get_task_result #: odps.models.instance.Instance.get_task_summary #: odps.models.instance.Instance.get_worker_log +#: odps.models.instance.Instance.iter_pandas #: odps.models.instance.Instance.open_reader #: odps.models.instance.Instance.put_task_info +#: odps.models.instance.Instance.to_pandas #: odps.models.instance.Instance.wait_for_completion #: odps.models.instance.Instance.wait_for_success #: odps.models.ml.offlinemodel.OfflineModel.copy #: odps.models.partition.Partition.drop odps.models.partition.Partition.head +#: odps.models.partition.Partition.iter_pandas #: odps.models.partition.Partition.open_reader +#: odps.models.partition.Partition.to_pandas #: odps.models.resource.FileResource.open #: odps.models.resource.FileResource.read #: odps.models.resource.FileResource.readline @@ -133,15 +136,25 @@ msgstr "" #: odps.models.resource.FileResource.write #: 
odps.models.resource.FileResource.writelines #: odps.models.resource.TableResource.update +#: odps.models.session.SessionMethods.execute_sql_interactive +#: odps.models.session.SessionMethods.run_sql_interactive #: odps.models.table.Table.create_partition #: odps.models.table.Table.delete_partition odps.models.table.Table.drop #: odps.models.table.Table.exist_partition #: odps.models.table.Table.exist_partitions odps.models.table.Table.get_ddl #: odps.models.table.Table.get_max_partition #: odps.models.table.Table.get_partition odps.models.table.Table.head +#: odps.models.table.Table.iter_pandas #: odps.models.table.Table.iterate_partitions #: odps.models.table.Table.new_record odps.models.table.Table.open_reader -#: odps.models.table.Table.open_writer odps.models.table.Table.truncate +#: odps.models.table.Table.open_writer odps.models.table.Table.to_pandas +#: odps.models.table.Table.truncate +#: odps.models.tableio.TableIOMethods.read_table +#: odps.models.tableio.TableIOMethods.write_table +#: odps.models.tasks.maxframe.MaxFrameTask.run_pythonpack +#: odps.models.tasks.merge.MergeTask.run_archive_table +#: odps.models.tasks.merge.MergeTask.run_freeze_command +#: odps.models.tasks.merge.MergeTask.run_merge_files #: odps.models.worker.Worker.get_log of msgid "Parameters" msgstr "" @@ -180,7 +193,6 @@ msgstr "" #: odps.core.ODPS.execute_sql odps.core.ODPS.execute_sql_cost #: odps.core.ODPS.list_volume_files odps.core.ODPS.open_resource #: odps.core.ODPS.open_volume_reader odps.core.ODPS.open_volume_writer -#: odps.core.ODPS.read_table odps.core.ODPS.write_table #: odps.internal.core.InternalODPSMixin.execute_kube_task #: odps.models.instance.Instance #: odps.models.instance.Instance.Task.TaskProgress @@ -190,7 +202,8 @@ msgstr "" #: odps.models.resource.FileResource.open odps.models.table.Table #: odps.models.table.Table.new_record odps.models.table.Table.open_reader #: odps.models.table.Table.open_writer odps.models.table.TableSchema -#: odps.types.Record of +#: odps.models.tableio.TableIOMethods.read_table +#: odps.models.tableio.TableIOMethods.write_table odps.types.Record of msgid "Example" msgstr "" @@ -210,48 +223,45 @@ msgstr "" msgid "new account object, if `access_id` and `secret_access_key` not supplied" msgstr "" -#: odps.core.ODPS.as_account odps.core.ODPS.attach_session -#: odps.core.ODPS.create_external_volume odps.core.ODPS.create_fs_volume -#: odps.core.ODPS.create_function odps.core.ODPS.create_parted_volume -#: odps.core.ODPS.create_resource odps.core.ODPS.create_role -#: odps.core.ODPS.create_schema odps.core.ODPS.create_session +#: odps.core.ODPS.as_account odps.core.ODPS.create_external_volume +#: odps.core.ODPS.create_fs_volume odps.core.ODPS.create_function +#: odps.core.ODPS.create_parted_volume odps.core.ODPS.create_resource +#: odps.core.ODPS.create_role odps.core.ODPS.create_schema #: odps.core.ODPS.create_table odps.core.ODPS.create_user -#: odps.core.ODPS.create_volume_directory odps.core.ODPS.default_session -#: odps.core.ODPS.delete_function odps.core.ODPS.delete_offline_model +#: odps.core.ODPS.create_volume_directory odps.core.ODPS.delete_function +#: odps.core.ODPS.delete_materialized_view odps.core.ODPS.delete_offline_model #: odps.core.ODPS.delete_resource odps.core.ODPS.delete_table -#: odps.core.ODPS.delete_volume odps.core.ODPS.delete_volume_file -#: odps.core.ODPS.delete_xflow odps.core.ODPS.execute_archive_table -#: odps.core.ODPS.execute_merge_files odps.core.ODPS.execute_security_query -#: odps.core.ODPS.execute_sql odps.core.ODPS.execute_sql_cost 
-#: odps.core.ODPS.execute_sql_interactive odps.core.ODPS.execute_xflow +#: odps.core.ODPS.delete_view odps.core.ODPS.delete_volume +#: odps.core.ODPS.delete_volume_file odps.core.ODPS.delete_xflow +#: odps.core.ODPS.execute_security_query odps.core.ODPS.execute_sql +#: odps.core.ODPS.execute_sql_cost odps.core.ODPS.execute_xflow #: odps.core.ODPS.exist_function odps.core.ODPS.exist_instance #: odps.core.ODPS.exist_offline_model odps.core.ODPS.exist_project -#: odps.core.ODPS.exist_resource odps.core.ODPS.exist_schema -#: odps.core.ODPS.exist_table odps.core.ODPS.exist_volume -#: odps.core.ODPS.exist_xflow odps.core.ODPS.get_function -#: odps.core.ODPS.get_instance odps.core.ODPS.get_logview_address -#: odps.core.ODPS.get_offline_model odps.core.ODPS.get_project -#: odps.core.ODPS.get_project_policy odps.core.ODPS.get_resource -#: odps.core.ODPS.get_role_policy odps.core.ODPS.get_schema -#: odps.core.ODPS.get_security_option odps.core.ODPS.get_security_options -#: odps.core.ODPS.get_table odps.core.ODPS.get_volume -#: odps.core.ODPS.get_volume_file odps.core.ODPS.get_volume_partition -#: odps.core.ODPS.get_xflow odps.core.ODPS.get_xflow_results -#: odps.core.ODPS.get_xflow_sub_instances +#: odps.core.ODPS.exist_quota odps.core.ODPS.exist_resource +#: odps.core.ODPS.exist_schema odps.core.ODPS.exist_table +#: odps.core.ODPS.exist_volume odps.core.ODPS.exist_xflow +#: odps.core.ODPS.get_function odps.core.ODPS.get_instance +#: odps.core.ODPS.get_logview_address odps.core.ODPS.get_offline_model +#: odps.core.ODPS.get_project odps.core.ODPS.get_project_policy +#: odps.core.ODPS.get_resource odps.core.ODPS.get_role_policy +#: odps.core.ODPS.get_schema odps.core.ODPS.get_security_option +#: odps.core.ODPS.get_security_options odps.core.ODPS.get_table +#: odps.core.ODPS.get_volume odps.core.ODPS.get_volume_file +#: odps.core.ODPS.get_volume_partition odps.core.ODPS.get_xflow +#: odps.core.ODPS.get_xflow_results odps.core.ODPS.get_xflow_sub_instances #: odps.core.ODPS.iter_xflow_sub_instances odps.core.ODPS.list_functions #: odps.core.ODPS.list_instance_queueing_infos odps.core.ODPS.list_instances #: odps.core.ODPS.list_offline_models odps.core.ODPS.list_projects -#: odps.core.ODPS.list_resources odps.core.ODPS.list_role_users -#: odps.core.ODPS.list_roles odps.core.ODPS.list_schemas -#: odps.core.ODPS.list_tables odps.core.ODPS.list_user_roles -#: odps.core.ODPS.list_users odps.core.ODPS.list_volume_files -#: odps.core.ODPS.list_volume_partitions odps.core.ODPS.list_volumes -#: odps.core.ODPS.list_xflows odps.core.ODPS.move_volume_file -#: odps.core.ODPS.open_resource odps.core.ODPS.read_table -#: odps.core.ODPS.run_archive_table odps.core.ODPS.run_merge_files +#: odps.core.ODPS.list_quotas odps.core.ODPS.list_resources +#: odps.core.ODPS.list_role_users odps.core.ODPS.list_roles +#: odps.core.ODPS.list_schemas odps.core.ODPS.list_tables +#: odps.core.ODPS.list_user_roles odps.core.ODPS.list_users +#: odps.core.ODPS.list_volume_files odps.core.ODPS.list_volume_partitions +#: odps.core.ODPS.list_volumes odps.core.ODPS.list_xflows +#: odps.core.ODPS.move_volume_file odps.core.ODPS.open_resource #: odps.core.ODPS.run_security_query odps.core.ODPS.run_sql #: odps.core.ODPS.run_xflow odps.core.ODPS.set_project_policy -#: odps.core.ODPS.stop_instance odps.core.ODPS.write_table +#: odps.core.ODPS.stop_instance #: odps.internal.core.InternalODPSMixin.execute_kube_task #: odps.internal.core.InternalODPSMixin.execute_ps #: odps.internal.core.InternalODPSMixin.execute_ps_extract_model @@ -302,35 +312,24 @@ 
msgstr "" #: odps.models.resource.FileResource.write #: odps.models.resource.FileResource.writelines #: odps.models.resource.TableResource.update +#: odps.models.session.SessionMethods.execute_sql_interactive +#: odps.models.session.SessionMethods.run_sql_interactive #: odps.models.table.Table.create_partition odps.models.table.Table.drop #: odps.models.table.Table.exist_partitions odps.models.table.Table.get_ddl #: odps.models.table.Table.get_max_partition #: odps.models.table.Table.get_partition odps.models.table.Table.head #: odps.models.table.Table.new_record odps.models.table.Table.open_reader #: odps.models.table.Table.open_writer odps.models.table.Table.to_df -#: odps.models.table.Table.truncate odps.models.worker.Worker.get_log of +#: odps.models.table.Table.truncate +#: odps.models.tableio.TableIOMethods.read_table +#: odps.models.tableio.TableIOMethods.write_table +#: odps.models.tasks.merge.MergeTask.run_archive_table +#: odps.models.tasks.merge.MergeTask.run_freeze_command +#: odps.models.tasks.merge.MergeTask.run_merge_files +#: odps.models.worker.Worker.get_log of msgid "Returns" msgstr "" -#: odps.core.ODPS.attach_session:1 of -msgid "Attach to an existing session." -msgstr "" - -#: odps.core.ODPS.attach_session:3 of -msgid "The session name." -msgstr "" - -#: odps.core.ODPS.attach_session:4 odps.core.ODPS.create_session:8 of -msgid "" -"The created sqlrt task name. If not provided, the default value is used. " -"Mostly doesn't matter, default works." -msgstr "" - -#: odps.core.ODPS.attach_session:6 odps.core.ODPS.create_session:12 -#: odps.core.ODPS.default_session:3 of -msgid "A SessionInstance you may execute select tasks within." -msgstr "" - #: odps.core.ODPS.copy_offline_model:1 #: odps.models.ml.offlinemodel.OfflineModel.copy:1 of msgid "Copy current model into a new location." 
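The reference churn in this hunk reflects that interactive SQL helpers are now documented under ``odps.models.session.SessionMethods`` while remaining callable from the ODPS entry object. A minimal usage sketch, assuming an existing entry object ``o`` and a placeholder table name ``my_table``:

.. code-block:: python

    >>> # execute_sql_interactive falls back to offline mode by default (fallback=True)
    >>> instance = o.execute_sql_interactive(
    >>>     'select * from my_table', hints={'odps.sql.allow.fullscan': 'true'}
    >>> )
    >>> with instance.open_reader() as reader:
    >>>     for record in reader:
    >>>         print(record)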
@@ -371,28 +370,28 @@ msgstr "" #: odps.core.ODPS.create_resource:13 odps.core.ODPS.create_role:4 #: odps.core.ODPS.create_schema:4 odps.core.ODPS.create_table:6 #: odps.core.ODPS.create_user:4 odps.core.ODPS.delete_function:4 +#: odps.core.ODPS.delete_materialized_view:4 #: odps.core.ODPS.delete_offline_model:5 odps.core.ODPS.delete_resource:4 #: odps.core.ODPS.delete_role:4 odps.core.ODPS.delete_schema:4 #: odps.core.ODPS.delete_table:4 odps.core.ODPS.delete_user:4 -#: odps.core.ODPS.delete_volume:4 odps.core.ODPS.delete_volume_partition:5 -#: odps.core.ODPS.delete_xflow:4 odps.core.ODPS.execute_archive_table:5 -#: odps.core.ODPS.execute_merge_files:5 odps.core.ODPS.execute_security_query:5 -#: odps.core.ODPS.execute_sql:5 odps.core.ODPS.execute_sql_cost:3 -#: odps.core.ODPS.execute_xflow:9 odps.core.ODPS.exist_function:4 -#: odps.core.ODPS.exist_instance:4 odps.core.ODPS.exist_offline_model:4 -#: odps.core.ODPS.exist_resource:6 odps.core.ODPS.exist_role:4 -#: odps.core.ODPS.exist_schema:4 odps.core.ODPS.exist_table:4 -#: odps.core.ODPS.exist_user:4 odps.core.ODPS.exist_volume:4 -#: odps.core.ODPS.exist_volume_partition:5 odps.core.ODPS.exist_xflow:4 -#: odps.core.ODPS.get_function:4 odps.core.ODPS.get_instance:4 -#: odps.core.ODPS.get_logview_address:5 odps.core.ODPS.get_offline_model:4 -#: odps.core.ODPS.get_project:3 odps.core.ODPS.get_project_policy:3 -#: odps.core.ODPS.get_resource:4 odps.core.ODPS.get_role_policy:4 -#: odps.core.ODPS.get_schema:4 odps.core.ODPS.get_security_option:4 -#: odps.core.ODPS.get_security_options:3 odps.core.ODPS.get_table:4 -#: odps.core.ODPS.get_volume:4 odps.core.ODPS.get_volume_partition:5 -#: odps.core.ODPS.get_xflow:4 odps.core.ODPS.get_xflow_results:5 -#: odps.core.ODPS.get_xflow_sub_instances:5 +#: odps.core.ODPS.delete_view:4 odps.core.ODPS.delete_volume:4 +#: odps.core.ODPS.delete_volume_partition:5 odps.core.ODPS.delete_xflow:4 +#: odps.core.ODPS.execute_security_query:5 odps.core.ODPS.execute_sql:4 +#: odps.core.ODPS.execute_sql_cost:3 odps.core.ODPS.execute_xflow:9 +#: odps.core.ODPS.exist_function:4 odps.core.ODPS.exist_instance:4 +#: odps.core.ODPS.exist_offline_model:4 odps.core.ODPS.exist_resource:6 +#: odps.core.ODPS.exist_role:4 odps.core.ODPS.exist_schema:4 +#: odps.core.ODPS.exist_table:4 odps.core.ODPS.exist_user:4 +#: odps.core.ODPS.exist_volume:4 odps.core.ODPS.exist_volume_partition:5 +#: odps.core.ODPS.exist_xflow:4 odps.core.ODPS.get_function:4 +#: odps.core.ODPS.get_instance:4 odps.core.ODPS.get_logview_address:5 +#: odps.core.ODPS.get_offline_model:4 odps.core.ODPS.get_project:3 +#: odps.core.ODPS.get_project_policy:3 odps.core.ODPS.get_resource:4 +#: odps.core.ODPS.get_role_policy:4 odps.core.ODPS.get_schema:4 +#: odps.core.ODPS.get_security_option:4 odps.core.ODPS.get_security_options:3 +#: odps.core.ODPS.get_table:4 odps.core.ODPS.get_volume:4 +#: odps.core.ODPS.get_volume_partition:5 odps.core.ODPS.get_xflow:4 +#: odps.core.ODPS.get_xflow_results:5 odps.core.ODPS.get_xflow_sub_instances:5 #: odps.core.ODPS.iter_xflow_sub_instances:6 odps.core.ODPS.list_functions:3 #: odps.core.ODPS.list_instance_queueing_infos:3 #: odps.core.ODPS.list_instances:4 odps.core.ODPS.list_offline_models:3 @@ -403,11 +402,9 @@ msgstr "" #: odps.core.ODPS.list_volume_partitions:4 odps.core.ODPS.list_volumes:3 #: odps.core.ODPS.list_xflows:3 odps.core.ODPS.open_resource:23 #: odps.core.ODPS.open_volume_reader:7 odps.core.ODPS.open_volume_writer:11 -#: odps.core.ODPS.read_table:8 odps.core.ODPS.run_archive_table:5 -#: odps.core.ODPS.run_merge_files:5 
odps.core.ODPS.run_security_query:6 -#: odps.core.ODPS.run_sql:5 odps.core.ODPS.run_xflow:9 -#: odps.core.ODPS.set_project_policy:4 odps.core.ODPS.set_role_policy:5 -#: odps.core.ODPS.stop_instance:4 odps.core.ODPS.write_table:6 +#: odps.core.ODPS.run_security_query:6 odps.core.ODPS.run_sql:4 +#: odps.core.ODPS.run_xflow:9 odps.core.ODPS.set_project_policy:4 +#: odps.core.ODPS.set_role_policy:5 odps.core.ODPS.stop_instance:4 #: odps.internal.core.InternalODPSMixin.execute_kube_task:4 #: odps.internal.core.InternalODPSMixin.execute_ps:7 #: odps.internal.core.InternalODPSMixin.execute_ps_extract_model:11 @@ -415,7 +412,13 @@ msgstr "" #: odps.internal.core.InternalODPSMixin.run_kube_task:4 #: odps.internal.core.InternalODPSMixin.run_ps:7 #: odps.internal.core.InternalODPSMixin.run_ps_extract_model:11 -#: odps.internal.core.InternalODPSMixin.run_ps_train:20 of +#: odps.internal.core.InternalODPSMixin.run_ps_train:20 +#: odps.models.tableio.TableIOMethods.read_table:8 +#: odps.models.tableio.TableIOMethods.write_table:7 +#: odps.models.tasks.maxframe.MaxFrameTask.run_pythonpack:41 +#: odps.models.tasks.merge.MergeTask.run_archive_table:5 +#: odps.models.tasks.merge.MergeTask.run_freeze_command:6 +#: odps.models.tasks.merge.MergeTask.run_merge_files:5 of msgid "project name, if not provided, will be the default project" msgstr "" @@ -423,27 +426,42 @@ msgstr "" #: odps.core.ODPS.create_function:5 odps.core.ODPS.create_parted_volume:5 #: odps.core.ODPS.create_resource:14 odps.core.ODPS.create_table:8 #: odps.core.ODPS.create_volume_directory:6 odps.core.ODPS.delete_function:5 -#: odps.core.ODPS.delete_resource:5 odps.core.ODPS.delete_table:6 +#: odps.core.ODPS.delete_materialized_view:7 odps.core.ODPS.delete_resource:5 +#: odps.core.ODPS.delete_table:6 odps.core.ODPS.delete_view:6 #: odps.core.ODPS.delete_volume:5 odps.core.ODPS.delete_volume_file:7 #: odps.core.ODPS.delete_volume_partition:6 -#: odps.core.ODPS.execute_merge_files:6 odps.core.ODPS.execute_security_query:6 -#: odps.core.ODPS.exist_function:5 odps.core.ODPS.exist_resource:4 -#: odps.core.ODPS.exist_table:5 odps.core.ODPS.exist_volume:5 -#: odps.core.ODPS.exist_volume_partition:6 odps.core.ODPS.get_function:5 -#: odps.core.ODPS.get_resource:5 odps.core.ODPS.get_schema:3 -#: odps.core.ODPS.get_table:5 odps.core.ODPS.get_volume:5 -#: odps.core.ODPS.get_volume_file:6 odps.core.ODPS.get_volume_partition:6 -#: odps.core.ODPS.list_functions:6 odps.core.ODPS.list_resources:6 -#: odps.core.ODPS.list_tables:8 odps.core.ODPS.list_volume_files:7 -#: odps.core.ODPS.list_volume_partitions:5 odps.core.ODPS.list_volumes:4 -#: odps.core.ODPS.move_volume_file:7 odps.core.ODPS.open_resource:24 -#: odps.core.ODPS.open_volume_reader:8 odps.core.ODPS.open_volume_writer:12 -#: odps.core.ODPS.read_table:9 odps.core.ODPS.run_merge_files:6 -#: odps.core.ODPS.run_security_query:7 odps.core.ODPS.write_table:7 of +#: odps.core.ODPS.execute_security_query:6 odps.core.ODPS.exist_function:5 +#: odps.core.ODPS.exist_resource:4 odps.core.ODPS.exist_table:5 +#: odps.core.ODPS.exist_volume:5 odps.core.ODPS.exist_volume_partition:6 +#: odps.core.ODPS.get_function:5 odps.core.ODPS.get_resource:5 +#: odps.core.ODPS.get_schema:3 odps.core.ODPS.get_table:5 +#: odps.core.ODPS.get_volume:5 odps.core.ODPS.get_volume_file:6 +#: odps.core.ODPS.get_volume_partition:6 odps.core.ODPS.list_functions:6 +#: odps.core.ODPS.list_resources:6 odps.core.ODPS.list_tables:8 +#: odps.core.ODPS.list_volume_files:7 odps.core.ODPS.list_volume_partitions:5 +#: odps.core.ODPS.list_volumes:4 
odps.core.ODPS.move_volume_file:7 +#: odps.core.ODPS.open_resource:24 odps.core.ODPS.open_volume_reader:8 +#: odps.core.ODPS.open_volume_writer:12 odps.core.ODPS.run_security_query:7 +#: odps.models.tableio.TableIOMethods.read_table:9 +#: odps.models.tableio.TableIOMethods.write_table:8 +#: odps.models.tasks.maxframe.MaxFrameTask.run_pythonpack:42 +#: odps.models.tasks.merge.MergeTask.run_merge_files:6 of msgid "schema name, if not provided, will be the default schema" msgstr "" -#: odps.core.ODPS.create_external_volume:6 odps.core.ODPS.create_fs_volume:6 +#: odps.core.ODPS.create_external_volume:6 of +msgid "location of OSS dir, should be oss://endpoint/bucket/path" +msgstr "" + +#: odps.core.ODPS.create_external_volume:7 of +msgid "role arn of the account hosting the OSS bucket" +msgstr "" + +#: odps.core.ODPS.create_external_volume:8 of +msgid "if True, will create directory automatically" +msgstr "" + +#: odps.core.ODPS.create_external_volume:9 odps.core.ODPS.create_fs_volume:6 #: odps.core.ODPS.create_parted_volume:6 of msgid "volume" msgstr "" @@ -451,25 +469,23 @@ msgstr "" #: odps.core.ODPS.create_external_volume odps.core.ODPS.create_fs_volume #: odps.core.ODPS.create_function odps.core.ODPS.create_parted_volume #: odps.core.ODPS.create_resource odps.core.ODPS.create_table -#: odps.core.ODPS.execute_archive_table odps.core.ODPS.execute_merge_files #: odps.core.ODPS.execute_sql odps.core.ODPS.execute_sql_cost #: odps.core.ODPS.execute_xflow odps.core.ODPS.exist_function #: odps.core.ODPS.exist_instance odps.core.ODPS.exist_offline_model -#: odps.core.ODPS.exist_project odps.core.ODPS.exist_resource -#: odps.core.ODPS.exist_schema odps.core.ODPS.exist_table -#: odps.core.ODPS.exist_volume odps.core.ODPS.exist_xflow -#: odps.core.ODPS.get_instance odps.core.ODPS.get_logview_address -#: odps.core.ODPS.get_offline_model odps.core.ODPS.get_project -#: odps.core.ODPS.get_resource odps.core.ODPS.get_table -#: odps.core.ODPS.get_volume odps.core.ODPS.get_volume_partition -#: odps.core.ODPS.get_xflow odps.core.ODPS.get_xflow_results -#: odps.core.ODPS.list_functions odps.core.ODPS.list_instance_queueing_infos -#: odps.core.ODPS.list_instances odps.core.ODPS.list_offline_models -#: odps.core.ODPS.list_projects odps.core.ODPS.list_resources -#: odps.core.ODPS.list_tables odps.core.ODPS.list_volume_files -#: odps.core.ODPS.list_volume_partitions odps.core.ODPS.list_volumes -#: odps.core.ODPS.list_xflows odps.core.ODPS.read_table -#: odps.core.ODPS.run_archive_table odps.core.ODPS.run_merge_files +#: odps.core.ODPS.exist_project odps.core.ODPS.exist_quota +#: odps.core.ODPS.exist_resource odps.core.ODPS.exist_schema +#: odps.core.ODPS.exist_table odps.core.ODPS.exist_volume +#: odps.core.ODPS.exist_xflow odps.core.ODPS.get_instance +#: odps.core.ODPS.get_logview_address odps.core.ODPS.get_offline_model +#: odps.core.ODPS.get_project odps.core.ODPS.get_resource +#: odps.core.ODPS.get_table odps.core.ODPS.get_volume +#: odps.core.ODPS.get_volume_partition odps.core.ODPS.get_xflow +#: odps.core.ODPS.get_xflow_results odps.core.ODPS.list_functions +#: odps.core.ODPS.list_instance_queueing_infos odps.core.ODPS.list_instances +#: odps.core.ODPS.list_offline_models odps.core.ODPS.list_projects +#: odps.core.ODPS.list_resources odps.core.ODPS.list_tables +#: odps.core.ODPS.list_volume_files odps.core.ODPS.list_volume_partitions +#: odps.core.ODPS.list_volumes odps.core.ODPS.list_xflows #: odps.core.ODPS.run_sql odps.core.ODPS.run_xflow #: odps.internal.core.InternalODPSMixin.execute_kube_task #: 
odps.internal.core.InternalODPSMixin.execute_ps @@ -499,12 +515,16 @@ msgstr "" #: odps.models.resource.FileResource.readlines #: odps.models.table.Table.create_partition #: odps.models.table.Table.get_partition odps.models.table.Table.head -#: odps.models.table.Table.new_record of +#: odps.models.table.Table.new_record +#: odps.models.tableio.TableIOMethods.read_table +#: odps.models.tasks.merge.MergeTask.run_archive_table +#: odps.models.tasks.merge.MergeTask.run_freeze_command +#: odps.models.tasks.merge.MergeTask.run_merge_files of msgid "Return type" msgstr "" -#: odps.core.ODPS.create_external_volume:7 -#: odps.core.ODPS.create_external_volume:9 odps.core.ODPS.create_fs_volume:7 +#: odps.core.ODPS.create_external_volume:10 +#: odps.core.ODPS.create_external_volume:12 odps.core.ODPS.create_fs_volume:7 #: odps.core.ODPS.create_fs_volume:9 of msgid ":class:`odps.models.FSVolume`" msgstr "" @@ -696,7 +716,8 @@ msgid "schema name" msgstr "" #: odps.core.ODPS.create_schema:5 odps.core.ODPS.create_table:17 -#: odps.core.ODPS.delete_schema:5 odps.core.ODPS.delete_table:8 of +#: odps.core.ODPS.delete_materialized_view:9 odps.core.ODPS.delete_schema:5 +#: odps.core.ODPS.delete_table:8 odps.core.ODPS.delete_view:8 of msgid "if True, will run asynchronously" msgstr "" @@ -704,34 +725,6 @@ msgstr "" msgid "if async_ is True, return instance, otherwise return Schema object." msgstr "" -#: odps.core.ODPS.create_session:1 of -msgid "Create session." -msgstr "" - -#: odps.core.ODPS.create_session:3 of -msgid "How much workers assigned to the session." -msgstr "" - -#: odps.core.ODPS.create_session:4 of -msgid "How much memory each worker consumes." -msgstr "" - -#: odps.core.ODPS.create_session:5 of -msgid "The session name. Not specifying to use its ID as name." -msgstr "" - -#: odps.core.ODPS.create_session:6 of -msgid "" -"format \"00-24\", allocated workers will be reduced during this time. Not" -" specifying to disable this." -msgstr "" - -#: odps.core.ODPS.create_session:10 of -msgid "" -"Extra hints provided to the session. Parameters of this method will " -"override certain hints." -msgstr "" - #: odps.core.ODPS.create_table:1 of msgid "Create a table by given schema and other optional parameters." msgstr "" @@ -768,7 +761,8 @@ msgstr "" msgid "hub lifecycle" msgstr "" -#: odps.core.ODPS.create_table:13 odps.core.ODPS.delete_table:7 of +#: odps.core.ODPS.create_table:13 odps.core.ODPS.delete_materialized_view:8 +#: odps.core.ODPS.delete_table:7 odps.core.ODPS.delete_view:7 of msgid "hints for the task" msgstr "" @@ -834,23 +828,18 @@ msgstr "" msgid "directory object." msgstr "" -#: odps.core.ODPS.default_session:1 of -msgid "Attach to the default session of your project." -msgstr "" - #: odps.core.ODPS.delete_function:1 of msgid "Delete a function by given name." 
msgstr "" #: odps.core.ODPS.delete_function:7 odps.core.ODPS.delete_offline_model:6 -#: odps.core.ODPS.delete_resource:7 odps.core.ODPS.delete_volume:6 +#: odps.core.ODPS.delete_resource:7 odps.core.ODPS.delete_volume:8 #: odps.core.ODPS.delete_xflow:5 odps.core.ODPS.stop_instance:5 -#: odps.core.ODPS.write_table:18 #: odps.mars_extension.oscar.core.persist_mars_dataframe:17 #: odps.models.function.Function.drop:3 odps.models.function.Function.update:3 #: odps.models.instance.Instance.stop:3 -#: odps.models.instance.Instance.wait_for_completion:7 -#: odps.models.instance.Instance.wait_for_success:7 +#: odps.models.instance.Instance.wait_for_completion:9 +#: odps.models.instance.Instance.wait_for_success:9 #: odps.models.partition.Partition.drop:5 #: odps.models.resource.FileResource.close:3 #: odps.models.resource.FileResource.flush:4 @@ -858,10 +847,30 @@ msgstr "" #: odps.models.resource.FileResource.truncate:6 #: odps.models.resource.FileResource.write:4 #: odps.models.resource.FileResource.writelines:4 -#: odps.models.table.Table.drop:6 odps.models.table.Table.truncate:6 of +#: odps.models.table.Table.drop:6 odps.models.table.Table.truncate:6 +#: odps.models.tableio.TableIOMethods.write_table:21 of msgid "None" msgstr "" +#: odps.core.ODPS.delete_materialized_view:1 of +msgid "Delete the materialized view with given name" +msgstr "" + +#: odps.core.ODPS.delete_materialized_view:3 of +msgid "materialized view name" +msgstr "" + +#: odps.core.ODPS.delete_materialized_view:5 of +msgid "" +"will not raise errors when the materialized view does not exist, default " +"False" +msgstr "" + +#: odps.core.ODPS.delete_materialized_view:10 odps.core.ODPS.delete_table:9 +#: odps.core.ODPS.delete_view:9 of +msgid "None if not async else odps instance" +msgstr "" + #: odps.core.ODPS.delete_offline_model:1 of msgid "Delete the offline model by given name." msgstr "" @@ -899,18 +908,34 @@ msgstr "" msgid "will not raise errors when the table does not exist, default False" msgstr "" -#: odps.core.ODPS.delete_table:9 of -msgid "None if not async else odps instance" -msgstr "" - #: odps.core.ODPS.delete_user:1 of msgid "Delete a user from the project" msgstr "" +#: odps.core.ODPS.delete_view:1 of +msgid "Delete the view with given name" +msgstr "" + +#: odps.core.ODPS.delete_view:3 of +msgid "view name" +msgstr "" + +#: odps.core.ODPS.delete_view:5 of +msgid "will not raise errors when the view does not exist, default False" +msgstr "" + #: odps.core.ODPS.delete_volume:1 of msgid "Delete volume by given name." msgstr "" +#: odps.core.ODPS.delete_volume:6 of +msgid "if True, directory created by external volume will be deleted" +msgstr "" + +#: odps.core.ODPS.delete_volume:7 of +msgid "if True, directory deletion should be recursive" +msgstr "" + #: odps.core.ODPS.delete_volume_file:1 of msgid "Delete a file / directory object under a file system volume." msgstr "" @@ -938,26 +963,28 @@ msgstr "" msgid "xflow name" msgstr "" -#: odps.core.ODPS.execute_archive_table:1 of +#: odps.models.tasks.merge.MergeTask.run_archive_table:1 +#: odps.models.tasks.merge.MergeTask.run_freeze_command:1 of msgid "Execute a task to archive tables and wait for termination." 
msgstr "" -#: odps.core.ODPS.execute_archive_table:3 odps.core.ODPS.run_archive_table:3 of +#: odps.models.tasks.merge.MergeTask.run_archive_table:3 +#: odps.models.tasks.merge.MergeTask.run_freeze_command:3 of msgid "name of the table to archive" msgstr "" -#: odps.core.ODPS.execute_archive_table:4 odps.core.ODPS.run_archive_table:4 of +#: odps.models.tasks.merge.MergeTask.run_archive_table:4 +#: odps.models.tasks.merge.MergeTask.run_freeze_command:4 of msgid "partition to archive" msgstr "" -#: odps.core.ODPS.execute_archive_table:6 odps.core.ODPS.run_archive_table:6 of +#: odps.models.tasks.merge.MergeTask.run_archive_table:6 +#: odps.models.tasks.merge.MergeTask.run_freeze_command:7 of msgid "settings for table archive task." msgstr "" -#: odps.core.ODPS.execute_archive_table:7 odps.core.ODPS.execute_merge_files:8 -#: odps.core.ODPS.execute_sql:6 odps.core.ODPS.execute_xflow:12 -#: odps.core.ODPS.run_archive_table:7 odps.core.ODPS.run_merge_files:8 -#: odps.core.ODPS.run_sql:6 odps.core.ODPS.run_xflow:12 +#: odps.core.ODPS.execute_sql:5 odps.core.ODPS.execute_xflow:12 +#: odps.core.ODPS.run_sql:5 odps.core.ODPS.run_xflow:12 #: odps.internal.core.InternalODPSMixin.execute_kube_task:6 #: odps.internal.core.InternalODPSMixin.execute_ps:9 #: odps.internal.core.InternalODPSMixin.execute_ps_extract_model:13 @@ -965,14 +992,16 @@ msgstr "" #: odps.internal.core.InternalODPSMixin.run_kube_task:6 #: odps.internal.core.InternalODPSMixin.run_ps:9 #: odps.internal.core.InternalODPSMixin.run_ps_extract_model:13 -#: odps.internal.core.InternalODPSMixin.run_ps_train:22 of +#: odps.internal.core.InternalODPSMixin.run_ps_train:22 +#: odps.models.tasks.maxframe.MaxFrameTask.run_pythonpack:48 +#: odps.models.tasks.merge.MergeTask.run_archive_table:7 +#: odps.models.tasks.merge.MergeTask.run_freeze_command:8 +#: odps.models.tasks.merge.MergeTask.run_merge_files:8 of msgid "instance priority, 9 as default" msgstr "" -#: odps.core.ODPS.execute_archive_table:8 odps.core.ODPS.execute_merge_files:11 -#: odps.core.ODPS.execute_sql:11 odps.core.ODPS.execute_xflow:14 -#: odps.core.ODPS.run_archive_table:8 odps.core.ODPS.run_merge_files:11 -#: odps.core.ODPS.run_sql:13 odps.core.ODPS.run_xflow:14 +#: odps.core.ODPS.execute_sql:9 odps.core.ODPS.execute_xflow:14 +#: odps.core.ODPS.run_sql:10 odps.core.ODPS.run_xflow:14 #: odps.internal.core.InternalODPSMixin.execute_kube_task:8 #: odps.internal.core.InternalODPSMixin.execute_ps:10 #: odps.internal.core.InternalODPSMixin.execute_ps_extract_model:14 @@ -980,16 +1009,17 @@ msgstr "" #: odps.internal.core.InternalODPSMixin.run_kube_task:8 #: odps.internal.core.InternalODPSMixin.run_ps:10 #: odps.internal.core.InternalODPSMixin.run_ps_extract_model:14 -#: odps.internal.core.InternalODPSMixin.run_ps_train:24 of +#: odps.internal.core.InternalODPSMixin.run_ps_train:24 +#: odps.models.tasks.merge.MergeTask.run_archive_table:8 +#: odps.models.tasks.merge.MergeTask.run_freeze_command:9 +#: odps.models.tasks.merge.MergeTask.run_merge_files:11 of msgid "instance" msgstr "" -#: odps.core.ODPS.execute_archive_table:9 odps.core.ODPS.execute_merge_files:12 -#: odps.core.ODPS.execute_sql:12 odps.core.ODPS.execute_sql:25 +#: odps.core.ODPS.execute_sql:10 odps.core.ODPS.execute_sql:23 #: odps.core.ODPS.execute_xflow:15 odps.core.ODPS.execute_xflow:17 #: odps.core.ODPS.get_instance:6 odps.core.ODPS.get_instance:9 -#: odps.core.ODPS.run_archive_table:9 odps.core.ODPS.run_merge_files:12 -#: odps.core.ODPS.run_sql:14 odps.core.ODPS.run_sql:16 +#: odps.core.ODPS.run_sql:11 
odps.core.ODPS.run_sql:13 #: odps.core.ODPS.run_xflow:15 odps.core.ODPS.run_xflow:17 #: odps.internal.core.InternalODPSMixin.execute_kube_task:9 #: odps.internal.core.InternalODPSMixin.execute_ps:11 @@ -998,10 +1028,17 @@ msgstr "" #: odps.internal.core.InternalODPSMixin.run_kube_task:9 #: odps.internal.core.InternalODPSMixin.run_ps:11 #: odps.internal.core.InternalODPSMixin.run_ps_extract_model:15 -#: odps.internal.core.InternalODPSMixin.run_ps_train:25 of +#: odps.internal.core.InternalODPSMixin.run_ps_train:25 +#: odps.models.tasks.merge.MergeTask.run_archive_table:9 +#: odps.models.tasks.merge.MergeTask.run_freeze_command:10 +#: odps.models.tasks.merge.MergeTask.run_merge_files:12 of msgid ":class:`odps.models.Instance`" msgstr "" +#: odps.models.tasks.merge.MergeTask.run_freeze_command:5 of +msgid "freeze command to execute, can be freeze or restore" +msgstr "" + #: odps.internal.core.InternalODPSMixin.execute_kube_task:1 of msgid "Submit the kube task contains yaml file to odps cluster." msgstr "" @@ -1010,9 +1047,10 @@ msgstr "" msgid "yaml string or json object." msgstr "" -#: odps.core.ODPS.execute_merge_files:7 odps.core.ODPS.run_merge_files:7 #: odps.internal.core.InternalODPSMixin.execute_kube_task:5 -#: odps.internal.core.InternalODPSMixin.run_kube_task:5 of +#: odps.internal.core.InternalODPSMixin.run_kube_task:5 +#: odps.models.tasks.maxframe.MaxFrameTask.run_pythonpack:47 +#: odps.models.tasks.merge.MergeTask.run_merge_files:7 of msgid "settings for merge task." msgstr "" @@ -1021,24 +1059,25 @@ msgstr "" msgid "task name." msgstr "" -#: odps.core.ODPS.execute_merge_files:1 of +#: odps.models.tasks.merge.MergeTask.run_merge_files:1 of msgid "Execute a task to merge multiple files in tables and wait for termination." msgstr "" -#: odps.core.ODPS.execute_merge_files:3 odps.core.ODPS.run_merge_files:3 of +#: odps.models.tasks.merge.MergeTask.run_merge_files:3 of msgid "name of the table to optimize" msgstr "" -#: odps.core.ODPS.execute_merge_files:4 odps.core.ODPS.run_merge_files:4 of +#: odps.models.tasks.merge.MergeTask.run_merge_files:4 of msgid "partition to optimize" msgstr "" -#: odps.core.ODPS.execute_merge_files:9 odps.core.ODPS.execute_sql:8 -#: odps.core.ODPS.run_merge_files:9 odps.core.ODPS.run_sql:8 of +#: odps.core.ODPS.execute_sql:6 odps.core.ODPS.run_sql:6 +#: odps.models.tasks.maxframe.MaxFrameTask.run_pythonpack:49 +#: odps.models.tasks.merge.MergeTask.run_merge_files:9 of msgid "cluster to run this instance" msgstr "" -#: odps.core.ODPS.execute_merge_files:10 odps.core.ODPS.run_merge_files:10 of +#: odps.models.tasks.merge.MergeTask.run_merge_files:10 of msgid "compact option for transactional table, can be major or minor." msgstr "" @@ -1195,6 +1234,108 @@ msgstr "" msgid "extra configurations to be passed into the parameter server" msgstr "" +#: odps.models.tasks.maxframe.MaxFrameTask.run_pythonpack:2 of +msgid "Execute PythonPack to create an archive with specified Python requirements" +msgstr "" + +#: odps.models.tasks.maxframe.MaxFrameTask.run_pythonpack:2 of +msgid "and returns an Instance object after job execution is finished." +msgstr "" + +#: odps.models.tasks.maxframe.MaxFrameTask.run_pythonpack:4 of +msgid "" +"requirements need to be packed. Every item of the array follows the " +"definition of requirements.txt of PEP 508." +msgstr "" + +#: odps.models.tasks.maxframe.MaxFrameTask.run_pythonpack:6 of +msgid "" +"requirements to be installed before creating package. 
Note that these " +"items are not necessarily been included in the resulting package. Every " +"item of the array follows the definition of requirements.txt of PEP 508." +msgstr "" + +#: odps.models.tasks.maxframe.MaxFrameTask.run_pythonpack:10 of +msgid "" +"requirements to be excluded in the package. Must be names of the packages" +" without version requirements." +msgstr "" + +#: odps.models.tasks.maxframe.MaxFrameTask.run_pythonpack:12 of +msgid "" +"name of the resource to be built. If absent, the submitted job will " +"create a package name itself given ``is_production``. and " +"``schedule_id``." +msgstr "" + +#: odps.models.tasks.maxframe.MaxFrameTask.run_pythonpack:15 of +msgid "" +"if True, will generate a package name with ``pythonpack-prod-`` prefix if" +" ``resource_name`` is absent. Otherwise ``pythonpack-dev-`` will be used." +msgstr "" + +#: odps.models.tasks.maxframe.MaxFrameTask.run_pythonpack:18 of +msgid "" +"ID of the schedule node to be added as part of the resource name if " +"``is_production`` is True and ``resource_name`` is absent." +msgstr "" + +#: odps.models.tasks.maxframe.MaxFrameTask.run_pythonpack:21 of +msgid "" +"if True, rebuild the resource if the resource already exists. Otherwise " +"return directly." +msgstr "" + +#: odps.models.tasks.maxframe.MaxFrameTask.run_pythonpack:23 of +msgid "" +"tag of Python version, for example, ``cp37``. If absent, PyODPS will " +"choose a supported Python version of MaxCompute if " +"``align_python_version`` is False, otherwise version of the current " +"Python interpreter will be used." +msgstr "" + +#: odps.models.tasks.maxframe.MaxFrameTask.run_pythonpack:27 of +msgid "" +"take effect when ``python_tag`` is absent. If True, PyODPS will choose a " +"supported Python version of MaxCompute, otherwise version of the current " +"Python interpreter will be used. Set it to False when you are creating " +"packages for other environments like DataWorks." +msgstr "" + +#: odps.models.tasks.maxframe.MaxFrameTask.run_pythonpack:32 of +msgid "" +"if True, prefer package versions with wheels other than source archives. " +"See ``--prefer-binary`` argument of ``pip install`` for more details." +msgstr "" + +#: odps.models.tasks.maxframe.MaxFrameTask.run_pythonpack:35 of +msgid "" +"if True, will not include dependencies of requirements. See ``--no-deps``" +" argument of ``pip install`` for more details." +msgstr "" + +#: odps.models.tasks.maxframe.MaxFrameTask.run_pythonpack:37 of +msgid "if True, will not collect dependent binary packages. False by default." +msgstr "" + +#: odps.models.tasks.maxframe.MaxFrameTask.run_pythonpack:39 of +msgid "" +"if True, will include pre-released versions of packages. See ``--pre`` " +"argument of ``pip install`` for more details." +msgstr "" + +#: odps.models.tasks.maxframe.MaxFrameTask.run_pythonpack:43 of +msgid "" +"project name of the target archive resource. If not provided, will be the" +" default project" +msgstr "" + +#: odps.models.tasks.maxframe.MaxFrameTask.run_pythonpack:45 of +msgid "" +"schema name of the target archive resource. 
If not provided, will be the " +"default schema" +msgstr "" + #: odps.core.ODPS.execute_security_query:1 of msgid "" "Execute a security query to grant / revoke / query privileges and returns" @@ -1225,42 +1366,49 @@ msgstr "" msgid "SQL statement" msgstr "" -#: odps.core.ODPS.execute_sql:9 odps.core.ODPS.execute_sql_cost:4 -#: odps.core.ODPS.run_sql:9 of +#: odps.core.ODPS.execute_sql:7 odps.core.ODPS.execute_sql_cost:4 +#: odps.core.ODPS.run_sql:7 of msgid "settings for SQL, e.g. `odps.mapred.map.split.size`" msgstr "" +#: odps.core.ODPS.execute_sql:8 odps.core.ODPS.run_sql:9 of +msgid "name of quota to use for SQL job" +msgstr "" + #: odps.core.ODPS.execute_sql_cost:6 #: odps.models.instance.Instance.get_sql_task_cost:7 of msgid "cost info in dict format" msgstr "" -#: odps.core.ODPS.execute_sql_interactive:1 of +#: odps.models.session.SessionMethods.execute_sql_interactive:1 of msgid "" "Run SQL query in interactive mode (a.k.a MaxCompute QueryAcceleration). " "If query is not supported or fails, and fallback is True, will fallback " "to offline mode automatically" msgstr "" -#: odps.core.ODPS.execute_sql_interactive:5 of +#: odps.models.session.SessionMethods.execute_sql_interactive:5 +#: odps.models.session.SessionMethods.run_sql_interactive:4 of msgid "the sql query." msgstr "" -#: odps.core.ODPS.execute_sql_interactive:6 of +#: odps.models.session.SessionMethods.execute_sql_interactive:6 +#: odps.models.session.SessionMethods.run_sql_interactive:5 of msgid "settings for sql query." msgstr "" -#: odps.core.ODPS.execute_sql_interactive:7 of +#: odps.models.session.SessionMethods.execute_sql_interactive:7 of msgid "" "fallback query to non-interactive mode, True by default. Both boolean " "type and policy names separated by commas are acceptable." msgstr "" -#: odps.core.ODPS.execute_sql_interactive:9 of +#: odps.models.session.SessionMethods.execute_sql_interactive:9 of msgid "wait fallback instance to finish, True by default." msgstr "" -#: odps.core.ODPS.execute_sql_interactive:10 of +#: odps.models.session.SessionMethods.execute_sql_interactive:10 +#: odps.models.session.SessionMethods.run_sql_interactive:6 of msgid "instance." msgstr "" @@ -1304,8 +1452,9 @@ msgid "instance id" msgstr "" #: odps.core.ODPS.exist_instance:5 odps.core.ODPS.exist_project:4 -#: odps.core.ODPS.exist_resource:7 odps.core.ODPS.exist_schema:5 -#: odps.core.ODPS.exist_volume:6 odps.core.ODPS.exist_xflow:5 of +#: odps.core.ODPS.exist_quota:4 odps.core.ODPS.exist_resource:7 +#: odps.core.ODPS.exist_schema:5 odps.core.ODPS.exist_volume:6 +#: odps.core.ODPS.exist_xflow:5 of msgid "True if exists or False" msgstr "" @@ -1321,6 +1470,14 @@ msgstr "" msgid "If project name which provided exists or not." msgstr "" +#: odps.core.ODPS.exist_quota:1 of +msgid "If quota name which provided exists or not." +msgstr "" + +#: odps.core.ODPS.exist_quota:3 of +msgid "quota name" +msgstr "" + #: odps.core.ODPS.exist_resource:1 of msgid "If the resource with given name exists or not." msgstr "" @@ -1448,6 +1605,14 @@ msgstr "" msgid "JSON object" msgstr "" +#: odps.core.ODPS.get_quota:1 of +msgid "Get quota by name" +msgstr "" + +#: odps.core.ODPS.get_quota:3 of +msgid "quota name, if not provided, will be the name in ODPS entry" +msgstr "" + #: odps.core.ODPS.get_resource:1 of msgid "Get a resource by given name" msgstr "" @@ -1705,6 +1870,18 @@ msgstr "" msgid "projects in this endpoint." 
msgstr "" +#: odps.core.ODPS.list_quotas:1 of +msgid "List quotas by region id" +msgstr "" + +#: odps.core.ODPS.list_quotas:3 of +msgid "Region ID" +msgstr "" + +#: odps.core.ODPS.list_quotas:4 of +msgid "quotas" +msgstr "" + #: odps.core.ODPS.list_resources:1 of msgid "List all resources of a project." msgstr "" @@ -1948,7 +2125,8 @@ msgid "length limit" msgstr "" #: odps.core.ODPS.open_volume_reader:11 odps.core.ODPS.open_volume_writer:13 -#: odps.core.ODPS.read_table:16 odps.core.ODPS.write_table:13 of +#: odps.models.tableio.TableIOMethods.read_table:13 +#: odps.models.tableio.TableIOMethods.write_table:16 of msgid "the compression algorithm, level and strategy" msgstr "" @@ -2026,69 +2204,75 @@ msgstr "" #: odps.mars_extension.oscar.core.persist_mars_dataframe:15 #: odps.mars_extension.oscar.core.to_mars_dataframe:17 -#: odps.models.table.Table.open_reader:9 odps.models.table.Table.open_writer:12 -#: of +#: odps.models.table.Table.open_reader:10 +#: odps.models.table.Table.open_writer:10 of msgid "name of tunnel quota" msgstr "" -#: odps.core.ODPS.read_table:1 of +#: odps.models.tableio.TableIOMethods.read_table:1 of msgid "Read table's records." msgstr "" -#: odps.core.ODPS.read_table:3 odps.core.ODPS.write_table:3 of +#: odps.models.tableio.TableIOMethods.read_table:3 +#: odps.models.tableio.TableIOMethods.write_table:3 of msgid "table or table name" msgstr "" -#: odps.core.ODPS.read_table:5 of +#: odps.models.tableio.TableIOMethods.read_table:5 of msgid "the records' size, if None will read all records from the table" msgstr "" -#: odps.core.ODPS.read_table:6 of +#: odps.models.tableio.TableIOMethods.read_table:6 of msgid "the record where read starts with" msgstr "" -#: odps.core.ODPS.read_table:7 of +#: odps.models.tableio.TableIOMethods.read_table:7 of msgid "default as 1" msgstr "" -#: odps.core.ODPS.read_table:11 of +#: odps.models.tableio.TableIOMethods.read_table:10 of msgid "the partition of this table to read" msgstr "" -#: odps.core.ODPS.read_table:12 of +#: odps.models.tableio.TableIOMethods.read_table:11 of msgid "the columns' names which are the parts of table's columns" msgstr "" -#: odps.core.ODPS.read_table:14 of +#: odps.models.tableio.TableIOMethods.read_table:12 of msgid "if True, the data will be compressed during downloading" msgstr "" -#: odps.core.ODPS.read_table:18 odps.core.ODPS.write_table:15 of +#: odps.models.tableio.TableIOMethods.read_table:15 +#: odps.models.tableio.TableIOMethods.write_table:18 of msgid "tunnel service URL" msgstr "" -#: odps.core.ODPS.read_table:19 of +#: odps.models.tableio.TableIOMethods.read_table:16 of msgid "" "reading the table will reuse the session which opened last time, if set " "to True will open a new download session, default as False" msgstr "" -#: odps.core.ODPS.read_table:21 odps.models.partition.Partition.head:5 -#: odps.models.table.Table.head:6 of +#: odps.models.partition.Partition.head:5 odps.models.table.Table.head:6 +#: odps.models.tableio.TableIOMethods.read_table:18 of msgid "records" msgstr "" -#: odps.core.ODPS.read_table:31 odps.core.ODPS.write_table:30 #: odps.models.partition.Partition.head:8 odps.models.table.Table.head:9 #: odps.models.table.Table.new_record:6 odps.models.table.Table.new_record:16 -#: of +#: odps.models.tableio.TableIOMethods.read_table:28 +#: odps.models.tableio.TableIOMethods.write_table:59 of msgid ":class:`odps.models.Record`" msgstr "" -#: odps.core.ODPS.run_archive_table:1 of +#: odps.models.tasks.merge.MergeTask.run_archive_table:1 of msgid "Start running a task to archive 
tables." msgstr "" +#: odps.models.tasks.merge.MergeTask.run_freeze_command:1 of +msgid "Start running a task to freeze or restore tables." +msgstr "" + #: odps.internal.core.InternalODPSMixin.run_kube_task:1 of msgid "Start running the kube_task based on the yaml file." msgstr "" @@ -2097,7 +2281,7 @@ msgstr "" msgid "specs of kubernetes objects" msgstr "" -#: odps.core.ODPS.run_merge_files:1 of +#: odps.models.tasks.merge.MergeTask.run_merge_files:1 of msgid "Start running a task to merge multiple files in tables." msgstr "" @@ -2117,6 +2301,12 @@ msgid "" "object." msgstr "" +#: odps.models.tasks.maxframe.MaxFrameTask.run_pythonpack:1 of +msgid "" +"Run PythonPack to create an archive with specified Python requirements " +"and returns an Instance object after job submission." +msgstr "" + #: odps.core.ODPS.run_security_query:1 of msgid "" "Run a security query to grant / revoke / query privileges. If the query " @@ -2129,12 +2319,11 @@ msgstr "" msgid "Run a given SQL statement asynchronously" msgstr "" -#: odps.core.ODPS.run_sql_interactive:1 of +#: odps.models.session.SessionMethods.run_sql_interactive:1 of msgid "" "Run SQL query in interactive mode (a.k.a MaxCompute QueryAcceleration). " "Won't fallback to offline mode automatically if query not supported or " -"fails :param sql: the sql query. :param hints: settings for sql query. " -":return: instance." +"fails" msgstr "" #: odps.core.ODPS.run_xflow:1 of @@ -2231,32 +2420,72 @@ msgstr "" msgid "Get or set tunnel endpoint of the ODPS object" msgstr "" -#: odps.core.ODPS.write_table:1 of +#: odps.models.tableio.TableIOMethods.write_table:1 of msgid "Write records into given table." msgstr "" -#: odps.core.ODPS.write_table:5 of -msgid "if given records only, the block id will be 0 as default." +#: odps.models.tableio.TableIOMethods.write_table:5 of +msgid "" +"records / DataFrame, or block ids and records / DataFrame. If given " +"records or DataFrame only, the block id will be 0 as default." msgstr "" -#: odps.core.ODPS.write_table:9 of +#: odps.models.tableio.TableIOMethods.write_table:9 of msgid "the partition of this table to write" msgstr "" -#: odps.core.ODPS.write_table:10 odps.models.table.Table.open_writer:13 of +#: odps.models.tableio.TableIOMethods.write_table:10 of +msgid "fields representing partitions" +msgstr "" + +#: odps.models.table.Table.open_writer:11 +#: odps.models.tableio.TableIOMethods.write_table:11 of msgid "if True, will overwrite existing data" msgstr "" -#: odps.core.ODPS.write_table:11 of +#: odps.models.tableio.TableIOMethods.write_table:12 of +msgid "if true, the table will be created if not exist" +msgstr "" + +#: odps.models.tableio.TableIOMethods.write_table:13 of +msgid "specify table lifecycle when creating tables" +msgstr "" + +#: odps.models.table.Table.open_writer:6 +#: odps.models.tableio.TableIOMethods.write_table:14 of +msgid "if true, the partition will be created if not exist" +msgstr "" + +#: odps.models.tableio.TableIOMethods.write_table:15 of msgid "if True, the data will be compressed during uploading" msgstr "" -#: odps.core.ODPS.write_table:16 of +#: odps.models.tableio.TableIOMethods.write_table:19 of msgid "" "writing the table will reuse the session which opened last time, if set " "to True will open a new upload session, default as False" msgstr "" +#: odps.models.tableio.TableIOMethods.write_table:25 of +msgid "Write records into a specified table." +msgstr "" + +#: odps.models.tableio.TableIOMethods.write_table:29 of +msgid "Write records into multiple blocks." 
+msgstr "" + +#: odps.models.tableio.TableIOMethods.write_table:33 of +msgid "Write into a given partition." +msgstr "" + +#: odps.models.tableio.TableIOMethods.write_table:37 of +msgid "Write a pandas DataFrame." +msgstr "" + +#: odps.models.tableio.TableIOMethods.write_table:48 of +msgid "Write a dynamic partition." +msgstr "" + #: odps.models.project.Project:1 of msgid "Project is the counterpart of **database** in a RDBMS." msgstr "" @@ -2406,8 +2635,9 @@ msgstr "" msgid "records' size, 10000 at most" msgstr "" -#: odps.models.table.Table.head:4 odps.models.table.Table.open_reader:3 -#: odps.models.table.Table.open_writer:3 of +#: odps.models.table.Table.head:4 odps.models.table.Table.iter_pandas:3 +#: odps.models.table.Table.open_reader:3 odps.models.table.Table.open_writer:3 +#: odps.models.table.Table.to_pandas:3 of msgid "partition of this table" msgstr "" @@ -2415,6 +2645,57 @@ msgstr "" msgid "the columns which is subset of the table columns" msgstr "" +#: odps.models.table.Table.iter_pandas:1 of +msgid "Iterate table data in blocks as pandas DataFrame" +msgstr "" + +#: odps.models.instance.Instance.iter_pandas:4 +#: odps.models.instance.Instance.to_pandas:4 +#: odps.models.partition.Partition.iter_pandas:3 +#: odps.models.partition.Partition.to_pandas:3 +#: odps.models.table.Table.iter_pandas:4 odps.models.table.Table.open_reader:9 +#: odps.models.table.Table.to_pandas:4 of +msgid "columns to read" +msgstr "" + +#: odps.models.instance.Instance.iter_pandas:6 +#: odps.models.partition.Partition.iter_pandas:4 +#: odps.models.table.Table.iter_pandas:5 of +msgid "size of DataFrame batch to read" +msgstr "" + +#: odps.models.instance.Instance.iter_pandas:7 +#: odps.models.instance.Instance.to_pandas:6 +#: odps.models.partition.Partition.iter_pandas:5 +#: odps.models.partition.Partition.to_pandas:4 +#: odps.models.table.Table.iter_pandas:6 odps.models.table.Table.to_pandas:5 of +msgid "start row index from 0" +msgstr "" + +#: odps.models.instance.Instance.iter_pandas:8 +#: odps.models.instance.Instance.to_pandas:7 +#: odps.models.partition.Partition.iter_pandas:6 +#: odps.models.partition.Partition.to_pandas:5 +#: odps.models.table.Table.iter_pandas:7 odps.models.table.Table.to_pandas:6 of +msgid "data count to read" +msgstr "" + +#: odps.models.partition.Partition.iter_pandas:8 +#: odps.models.partition.Partition.to_pandas:8 +#: odps.models.table.Table.iter_pandas:8 odps.models.table.Table.open_reader:19 +#: odps.models.table.Table.to_pandas:8 of +msgid "if True, partition values will be appended to the output" +msgstr "" + +#: odps.models.instance.Instance.iter_pandas:9 +#: odps.models.instance.Instance.to_pandas:9 +#: odps.models.partition.Partition.iter_pandas:7 +#: odps.models.partition.Partition.to_pandas:7 +#: odps.models.table.Table.iter_pandas:10 odps.models.table.Table.to_pandas:10 +#: of +msgid "name of tunnel quota to use" +msgstr "" + #: odps.models.table.Table.iterate_partitions:1 of msgid "Create an iterable object to iterate over partitions." 
msgstr "" @@ -2450,7 +2731,7 @@ msgstr "" #: odps.models.instance.Instance.open_reader:15 #: odps.models.partition.Partition.open_reader:5 -#: odps.models.table.Table.open_reader:6 odps.models.table.Table.open_writer:9 +#: odps.models.table.Table.open_reader:6 odps.models.table.Table.open_writer:7 #: of msgid "the tunnel service URL" msgstr "" @@ -2463,7 +2744,7 @@ msgstr "" msgid "use arrow tunnel to read data" msgstr "" -#: odps.models.table.Table.open_reader:10 of +#: odps.models.table.Table.open_reader:11 of msgid "" "enable async mode to create tunnels, can set True if session creation " "takes a long time." @@ -2471,15 +2752,15 @@ msgstr "" #: odps.models.instance.Instance.open_reader:16 #: odps.models.partition.Partition.open_reader:6 -#: odps.models.table.Table.open_reader:12 -#: odps.models.table.Table.open_writer:14 of +#: odps.models.table.Table.open_reader:13 +#: odps.models.table.Table.open_writer:12 of msgid "compression algorithm, level and strategy" msgstr "" #: odps.models.instance.Instance.open_reader:18 #: odps.models.partition.Partition.open_reader:8 -#: odps.models.table.Table.open_reader:14 -#: odps.models.table.Table.open_writer:16 of +#: odps.models.table.Table.open_reader:15 +#: odps.models.table.Table.open_writer:14 of msgid "" "compression algorithm, work when ``compress_option`` is not provided, can" " be ``zlib``, ``snappy``" @@ -2489,16 +2770,16 @@ msgstr "" #: odps.models.instance.Instance.open_reader:21 #: odps.models.partition.Partition.open_reader:10 #: odps.models.partition.Partition.open_reader:11 -#: odps.models.table.Table.open_reader:16 #: odps.models.table.Table.open_reader:17 -#: odps.models.table.Table.open_writer:18 -#: odps.models.table.Table.open_writer:19 of +#: odps.models.table.Table.open_reader:18 +#: odps.models.table.Table.open_writer:16 +#: odps.models.table.Table.open_writer:17 of msgid "used for ``zlib``, work when ``compress_option`` is not provided" msgstr "" #: odps.models.instance.Instance.open_reader:22 #: odps.models.partition.Partition.open_reader:12 -#: odps.models.table.Table.open_reader:18 of +#: odps.models.table.Table.open_reader:21 of msgid "reader, ``count`` means the full size, ``status`` means the tunnel status" msgstr "" @@ -2510,19 +2791,15 @@ msgstr "" msgid "block ids to open" msgstr "" -#: odps.models.table.Table.open_writer:7 of -msgid "if true, the partition will be created if not exist" -msgstr "" - -#: odps.models.table.Table.open_writer:10 of +#: odps.models.table.Table.open_writer:8 of msgid "use existing upload_id to upload data" msgstr "" -#: odps.models.table.Table.open_writer:11 of +#: odps.models.table.Table.open_writer:9 of msgid "use arrow tunnel to write data" msgstr "" -#: odps.models.table.Table.open_writer:20 of +#: odps.models.table.Table.open_writer:18 of msgid "writer, status means the tunnel writer status" msgstr "" @@ -2538,6 +2815,16 @@ msgstr "" msgid "DataFrame object" msgstr "" +#: odps.models.table.Table.to_pandas:1 of +msgid "Read table data into pandas DataFrame" +msgstr "" + +#: odps.models.instance.Instance.to_pandas:8 +#: odps.models.partition.Partition.to_pandas:6 +#: odps.models.table.Table.to_pandas:7 of +msgid "number of processes to accelerate reading" +msgstr "" + #: odps.models.table.Table.truncate:1 of msgid "truncate this table." 
msgstr "" @@ -2582,6 +2869,11 @@ msgstr "" msgid "Get the head records of a partition" msgstr "" +#: odps.models.partition.Partition.iter_pandas:1 +#: odps.models.partition.Partition.to_pandas:1 of +msgid "Read partition data into pandas DataFrame" +msgstr "" + #: odps.models.partition.Partition.open_reader:1 of msgid "Open the reader to read the entire records from this partition." msgstr "" @@ -2819,6 +3111,18 @@ msgstr "" msgid "True if finished else False" msgstr "" +#: odps.models.instance.Instance.iter_pandas:1 of +msgid "" +"Iterate table data in blocks as pandas DataFrame. The limit argument " +"follows definition of `open_reader` API." +msgstr "" + +#: odps.models.instance.Instance.iter_pandas:5 +#: odps.models.instance.Instance.open_reader:13 +#: odps.models.instance.Instance.to_pandas:5 of +msgid "if True, enable the limitation" +msgstr "" + #: odps.models.instance.Instance.open_reader:1 of msgid "" "Open the reader to read records from the result of the instance. If " @@ -2838,10 +3142,6 @@ msgid "" "will be used and automatic fallback is enabled." msgstr "" -#: odps.models.instance.Instance.open_reader:13 of -msgid "if True, enable the limitation" -msgstr "" - #: odps.models.instance.Instance.put_task_info:1 of msgid "Put information into a task." msgstr "" @@ -2854,6 +3154,12 @@ msgstr "" msgid "Stop this instance." msgstr "" +#: odps.models.instance.Instance.to_pandas:1 of +msgid "" +"Read instance data into pandas DataFrame. The limit argument follows " +"definition of `open_reader` API." +msgstr "" + #: odps.models.instance.Instance.wait_for_completion:1 of msgid "Wait for the instance to complete, and neglect the consequence." msgstr "" @@ -2870,11 +3176,18 @@ msgstr "" msgid "time" msgstr "" +#: odps.models.instance.Instance.wait_for_completion:7 +#: odps.models.instance.Instance.wait_for_success:7 of +msgid "" +"whether to block waiting at server side. Note that this option does not " +"affect client behavior." +msgstr "" + #: odps.models.instance.Instance.wait_for_success:1 of msgid "Wait for instance to complete, and check if the instance is successful." msgstr "" -#: odps.models.instance.Instance.wait_for_success:8 of +#: odps.models.instance.Instance.wait_for_success:10 of msgid ":class:`odps.errors.ODPSError` if the instance failed" msgstr "" diff --git a/docs/source/locale/en/LC_MESSAGES/api-df.po b/docs/source/locale/en/LC_MESSAGES/api-df.po index 19181881..51ce90bf 100644 --- a/docs/source/locale/en/LC_MESSAGES/api-df.po +++ b/docs/source/locale/en/LC_MESSAGES/api-df.po @@ -15,7 +15,7 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: Babel 2.9.1\n" +"Generated-By: Babel 2.12.1\n" #: ../../source/api-df.rst:4 msgid "DataFrame Reference" @@ -549,9 +549,9 @@ msgstr "" #: odps.df.expr.collections.dropna:4 of msgid "" -"can be ‘any’ or ‘all’. If 'any' is specified any NA values are present, " -"drop that label. If 'all' is specified and all values are NA, drop that " -"label." +"can be ‘any’ or ‘all’. If 'any' is specified any NA values are " +"present, drop that label. If 'all' is specified and all values are NA, " +"drop that label." 
msgstr "" #: odps.df.expr.collections.dropna:5 of @@ -928,10 +928,10 @@ msgstr "" #: odps.df.expr.collections.melt:3 of msgid "" "This function is useful to massage a DataFrame into a format where one or" -" more columns are identifier variables (id_vars), while all other " -"columns, considered measured variables (value_vars), are “unpivoted” to " -"the row axis, leaving just two non-identifier columns, ‘variable’ and " -"‘value’." +" more columns are identifier variables (id_vars), while all other columns" +", considered measured variables (value_vars), are “unpivoted” to the " +"row axis, leaving just two non-identifier columns, ‘variable’ and ‘" +"value’." msgstr "" #: odps.df.expr.collections.melt:8 of @@ -946,8 +946,8 @@ msgstr "" #: odps.df.expr.collections.melt:10 of msgid "" -"name to use for the ‘variable’ column. If None it uses frame.columns.name" -" or ‘variable’." +"name to use for the ‘variable’ column. If None it uses frame.columns." +"name or ‘variable’." msgstr "" #: odps.df.expr.collections.melt:11 of @@ -1079,8 +1079,8 @@ msgstr "" #: odps.df.expr.collections.pivot:1 of msgid "" -"Produce ‘pivot’ table based on 3 columns of this DataFrame. Uses unique " -"values from rows / columns and fills with values." +"Produce ‘pivot’ table based on 3 columns of this DataFrame. Uses unique" +" values from rows / columns and fills with values." msgstr "" #: odps.df.expr.collections.pivot:5 of diff --git a/docs/source/locale/en/LC_MESSAGES/api.po b/docs/source/locale/en/LC_MESSAGES/api.po index dbf73ce5..670cb364 100644 --- a/docs/source/locale/en/LC_MESSAGES/api.po +++ b/docs/source/locale/en/LC_MESSAGES/api.po @@ -15,7 +15,7 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: Babel 2.5.3\n" +"Generated-By: Babel 2.12.1\n" #: ../../source/api.rst:5 msgid "API Reference" diff --git a/docs/source/locale/en/LC_MESSAGES/base-dbapi.po b/docs/source/locale/en/LC_MESSAGES/base-dbapi.po index e9b191a9..e08951d8 100644 --- a/docs/source/locale/en/LC_MESSAGES/base-dbapi.po +++ b/docs/source/locale/en/LC_MESSAGES/base-dbapi.po @@ -15,22 +15,24 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: Babel 2.9.1\n" +"Generated-By: Babel 2.12.1\n" #: ../../source/base-dbapi.rst:4 msgid "DBAPI 接口" msgstr "DBAPI Interface" #: ../../source/base-dbapi.rst:6 -msgid "在 PyODPS 0.10.0 中开始支持。事务操作不被 MaxCompute 支持,因而未实现相关接口。" +msgid "" +"在 PyODPS 0.10.0 中开始支持。事务操作不被 MaxCompute 支持,因而未实现相关" +"接口。" msgstr "" -"Supported since PyODPS 0.10.0. As transaction operations are not supported " -"in MaxCompute, related interfaces are not implemented." +"Supported since PyODPS 0.10.0. As transaction operations are not " +"supported in MaxCompute, related interfaces are not implemented." #: ../../source/base-dbapi.rst:8 msgid "" -"PyODPS 支持使用 `Python DBAPI `_ " -"兼容的数据库访问接口访问 MaxCompute。" +"PyODPS 支持使用 `Python DBAPI `_ 兼容" +"的数据库访问接口访问 MaxCompute。" msgstr "" "PyODPS supports accessing MaxCompute data via `Python DBAPI " "`_ compatible interfaces." @@ -40,10 +42,12 @@ msgid "创建连接" msgstr "Create connections" #: ../../source/base-dbapi.rst:13 -msgid "可以通过指定 ``access_id``、``access_key``、``project``和``endpoint`` 来建立连接:" +msgid "" +"可以通过指定 ``access_id``、``access_key``、``project``和``endpoint`` 来" +"建立连接:" msgstr "" -"Connections can be established via ``access_id``, ``access_key``, `project`` " -"and ``endpoint``." 
+"Connections can be established via ``access_id``, ``access_key``, " +"`project`` and ``endpoint``." #: ../../source/base-dbapi.rst:16 msgid "" diff --git a/docs/source/locale/en/LC_MESSAGES/base-functions.po b/docs/source/locale/en/LC_MESSAGES/base-functions.po index f13e7468..c835b333 100644 --- a/docs/source/locale/en/LC_MESSAGES/base-functions.po +++ b/docs/source/locale/en/LC_MESSAGES/base-functions.po @@ -15,7 +15,7 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: Babel 2.9.1\n" +"Generated-By: Babel 2.12.1\n" #: ../../source/base-functions.rst:4 msgid "函数" @@ -23,8 +23,8 @@ msgstr "Functions" #: ../../source/base-functions.rst:6 msgid "" -"ODPS用户可以编写自定义 `函数 `_ " -"用在ODPS SQL中。" +"ODPS用户可以编写自定义 `函数 `_ 用在ODPS SQL中。" msgstr "" "You can write user-defined `functions " "`_ (UDFs) to " @@ -36,8 +36,8 @@ msgstr "Basic operations" #: ../../source/base-functions.rst:11 msgid "" -"可以调用 ODPS 入口对象的 ``list_functions`` 来获取项目空间下的所有函数,``exist_function`` " -"能判断是否存在函数, ``get_function`` 获取函数对象。" +"可以调用 ODPS 入口对象的 ``list_functions`` 来获取项目空间下的所有函数,`" +"`exist_function`` 能判断是否存在函数, ``get_function`` 获取函数对象。" msgstr "" "Use ``list_functions`` as the ODPS object to obtain all functions in the " "project. Use ``exist_function`` to check whether the specified function " @@ -100,3 +100,4 @@ msgstr "" ">>> function.class_type = 'my_udf2.Test'\n" ">>> function.resources = [new_resource, ]\n" ">>> function.update() # update metadata on the function object" + diff --git a/docs/source/locale/en/LC_MESSAGES/base-instances.po b/docs/source/locale/en/LC_MESSAGES/base-instances.po index 515a365d..343640e7 100644 --- a/docs/source/locale/en/LC_MESSAGES/base-instances.po +++ b/docs/source/locale/en/LC_MESSAGES/base-instances.po @@ -23,8 +23,9 @@ msgstr "Instance" #: ../../source/base-instances.rst:6 msgid "" -"Task如SQLTask是ODPS的基本计算单元,当一个Task在执行时会被实例化, 以 `ODPS实例 " -"`_ 的形式存在。" +"Task如SQLTask是ODPS的基本计算单元,当一个Task在执行时会被实例化, 以 `" +"ODPS实例 `_ 的形式" +"存在。" msgstr "" "Tasks such as SQLTask are the basic computing units in MaxCompute. When " "executed, a Task is instantiated as a `MaxCompute instance " @@ -36,8 +37,8 @@ msgstr "Basic operations" #: ../../source/base-instances.rst:12 msgid "" -"可以调用 ``list_instances`` 来获取项目空间下的所有instance, ``exist_instance`` " -"能判断是否存在某instance, ``get_instance`` 能获取实例。" +"可以调用 ``list_instances`` 来获取项目空间下的所有instance, ``exist_" +"instance`` 能判断是否存在某instance, ``get_instance`` 能获取实例。" msgstr "" "You can call ``list_instances`` to retrieve all the instances in the " "project. You can use ``exist_instance`` to determine if an instance " @@ -52,7 +53,9 @@ msgid "" msgstr "" #: ../../source/base-instances.rst:23 -msgid "停止一个instance可以在odps入口使用 ``stop_instance``,或者对 instance 对象调用 ``stop`` 方法:" +msgid "" +"停止一个instance可以在odps入口使用 ``stop_instance``,或者对 instance " +"对象调用 ``stop`` 方法:" msgstr "" "You can call ``stop_instance`` on an odps object to stop an instance, or " "call the ``stop`` method on an instance object." @@ -98,10 +101,13 @@ msgstr "" ">>> print(instance.get_logview_address())" #: ../../source/base-instances.rst:49 -msgid "对于 XFlow 任务,需要枚举其子任务,再获取子任务的 LogView。更多细节可以参考 :ref:`XFlow 和模型 ` 。" +msgid "" +"对于 XFlow 任务,需要枚举其子任务,再获取子任务的 LogView。更多细节可以" +"参考 :ref:`XFlow 和模型 ` 。" msgstr "" "For an XFlow task, you need to enumerate its subtasks and retrieve their " -"LogView as follows. More details can be seen at :ref:`XFlow and models `." +"LogView as follows. More details can be seen at :ref:`XFlow and models " +"`." 
#: ../../source/base-instances.rst:51 #, python-format @@ -120,10 +126,10 @@ msgstr "Instance status" #: ../../source/base-instances.rst:61 msgid "" -"一个instance的状态可以是 ``Running``、``Suspended`` 或者 ``Terminated``,用户可以通过 " -"``status`` 属性来获取状态。 ``is_terminated`` " -"方法返回当前instance是否已经执行完成,``is_successful`` 方法返回当前instance是否正确完成执行, " -"任务处于运行中或者执行失败都会返回False。" +"一个instance的状态可以是 ``Running``、``Suspended`` 或者 ``Terminated``," +"用户可以通过 ``status`` 属性来获取状态。 ``is_terminated`` 方法返回当前" +"instance是否已经执行完成,``is_successful`` 方法返回当前instance是否正确" +"完成执行, 任务处于运行中或者执行失败都会返回False。" msgstr "" "The status of an instance can be ``Running``, ``Suspended`` or " "``Terminated``. You can retrieve the status of an instance by using the " @@ -147,8 +153,9 @@ msgstr "" #: ../../source/base-instances.rst:77 msgid "" -"调用 ``wait_for_completion`` 方法会阻塞直到instance执行完成。 ``wait_for_success`` " -"方法同样会阻塞,不同的是, 如果最终任务执行失败,则会抛出相关异常。" +"调用 ``wait_for_completion`` 方法会阻塞直到instance执行完成。 ``wait_for_" +"success`` 方法同样会阻塞,不同的是, 如果最终任务执行失败,则会抛出相关" +"异常。" msgstr "" "The ``wait_for_completion`` method will block your thread until the " "execution of the current instance has been completed. The " @@ -160,14 +167,18 @@ msgid "子任务操作" msgstr "Subtask operations" #: ../../source/base-instances.rst:83 -msgid "一个Instance真正运行时,可能包含一个或者多个子任务,我们称为Task,要注意这个Task不同于ODPS的计算单元。" +msgid "" +"一个Instance真正运行时,可能包含一个或者多个子任务,我们称为Task,要注意" +"这个Task不同于ODPS的计算单元。" msgstr "" "When an instance is running, it may contain one or several subtasks, " "which are called Tasks. Note that these Tasks are different from the " "computing units in MaxCompute." #: ../../source/base-instances.rst:85 -msgid "我们可以通过 ``get_task_names`` 来获取所有的Task任务,它返回一个所有子任务的名称列表。" +msgid "" +"我们可以通过 ``get_task_names`` 来获取所有的Task任务,它返回一个所有子" +"任务的名称列表。" msgstr "" "You can call ``get_task_names`` to retrieve all Tasks. This method " "returns the Task names in a list type." @@ -180,8 +191,8 @@ msgstr "" #: ../../source/base-instances.rst:92 msgid "" -"拿到Task的名称,我们就可以通过 ``get_task_result`` 来获取这个Task的执行结果。 " -"``get_task_results`` 以字典的形式返回每个Task的执行结果" +"拿到Task的名称,我们就可以通过 ``get_task_result`` 来获取这个Task的执行" +"结果。 ``get_task_results`` 以字典的形式返回每个Task的执行结果" msgstr "" "After getting the Task names, you can use ``get_task_result`` to retrieve" " the execution results of these tasks. The ``get_task_results`` method " @@ -203,7 +214,9 @@ msgid "" msgstr "" #: ../../source/base-instances.rst:106 -msgid "有时候我们需要在任务实例运行时显示所有子任务的运行进程。使用 ``get_task_progress`` 能获得Task当前的运行进度。" +msgid "" +"有时候我们需要在任务实例运行时显示所有子任务的运行进程。使用 ``get_task_" +"progress`` 能获得Task当前的运行进度。" msgstr "" "You can use ``get_task_progress`` to retrieve the running progress of a " "Task." 
@@ -218,3 +231,4 @@ msgid "" ">>> time.sleep(10)\n" "20160519101349613gzbzufck2 2016-05-19 18:14:03 M1_Stg1_job0:0/1/1[100%]" msgstr "" + diff --git a/docs/source/locale/en/LC_MESSAGES/base-models.po b/docs/source/locale/en/LC_MESSAGES/base-models.po index b5579b1a..377c09fe 100644 --- a/docs/source/locale/en/LC_MESSAGES/base-models.po +++ b/docs/source/locale/en/LC_MESSAGES/base-models.po @@ -15,7 +15,7 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: Babel 2.9.1\n" +"Generated-By: Babel 2.12.1\n" #: ../../source/base-models.rst:4 msgid "XFlow 和模型" @@ -26,7 +26,9 @@ msgid "XFlow" msgstr "XFlow" #: ../../source/base-models.rst:9 -msgid "XFlow 是 ODPS 对算法包的封装,使用 PyODPS 可以执行 XFlow。对于下面的 PAI 命令:" +msgid "" +"XFlow 是 ODPS 对算法包的封装,使用 PyODPS 可以执行 XFlow。对于下面的 PAI " +"命令:" msgstr "" "XFlow is a MaxCompute algorithm package. You can use PyODPS to execute " "XFlow tasks. For the following PAI command:" @@ -78,8 +80,8 @@ msgstr "" #: ../../source/base-models.rst:33 msgid "" -"这两个方法都会返回一个 Instance 对象。由于 XFlow 的一个 Instance 包含若干个子 " -"Instance,需要使用下面的方法来获得每个 Instance 的 LogView:" +"这两个方法都会返回一个 Instance 对象。由于 XFlow 的一个 Instance 包含" +"若干个子 Instance,需要使用下面的方法来获得每个 Instance 的 LogView:" msgstr "" "Both methods return an Instance object. An XFlow instance contains " "several sub-instances. You can obtain the LogView of each Instance by " @@ -96,11 +98,12 @@ msgstr "" #: ../../source/base-models.rst:41 msgid "" "需要注意的是,``get_xflow_sub_instances`` 返回的是 Instance 当前的子 " -"Instance,可能会随时间变化,因而可能需要定时查询。 为简化这一步骤,可以使用 ``iter_xflow_sub_instances " -"方法``。该方法返回一个迭代器,会阻塞执行直至发现新的子 Instance 或者主 Instance 结束。同时需要注意的是, " -"``iter_xflow_sub_instances`` 默认不会检查 Instance 是否报错,建议在循环结束时手动检查 Instance " -"是否报错,以免遗漏可能的问题,或者增加 ``check=True`` 参数在 ``iter_xflow_sub_instances`` " -"退出时自动检查:" +"Instance,可能会随时间变化,因而可能需要定时查询。 为简化这一步骤,可以" +"使用 ``iter_xflow_sub_instances 方法``。该方法返回一个迭代器,会阻塞执行" +"直至发现新的子 Instance 或者主 Instance 结束。同时需要注意的是, ``iter_" +"xflow_sub_instances`` 默认不会检查 Instance 是否报错,建议在循环结束时" +"手动检查 Instance 是否报错,以免遗漏可能的问题,或者增加 ``check=True`` " +"参数在 ``iter_xflow_sub_instances`` 退出时自动检查:" msgstr "" "Note that ``get_xflow_sub_instances`` returns the current sub-instances " "of an Instance object, which may change over time. Periodic queries may " @@ -148,7 +151,8 @@ msgid "" ">>> inst = o.run_xflow('AlgoName', 'algo_public',\n" " parameters={'param1': 'param_value1', 'param2': " "'param_value2', ...})\n" -">>> # 增加 check=True,在循环结束时自动检查报错。如果循环中 break,instance 错误不会被抛出\n" +">>> # 增加 check=True,在循环结束时自动检查报错。如果循环中 break," +"instance 错误不会被抛出\n" ">>> for sub_inst_name, sub_inst in o.iter_xflow_sub_instances(inst, " "check=True):\n" ">>> print('%s: %s' % (sub_inst_name, sub_inst.get_logview_address()))" @@ -163,7 +167,9 @@ msgstr "" ">>> print('%s: %s' % (sub_inst_name, sub_inst.get_logview_address()))" #: ../../source/base-models.rst:68 -msgid "在调用 run_xflow 或者 execute_xflow 时,也可以指定运行参数,指定的方法与 SQL 类似:" +msgid "" +"在调用 run_xflow 或者 execute_xflow 时,也可以指定运行参数,指定的方法与 " +"SQL 类似:" msgstr "" "You can specify runtime parameters when calling run_xflow or " "execute_xflow. 
This process is similar to executing SQL statements:" @@ -203,12 +209,12 @@ msgstr "" #: ../../source/base-models.rst:92 msgid "" -"PAI 命令的文档可以参考 `这份文档 " -"`_ 里列出的各个\"组件参考\"章节。" +"PAI 命令的文档可以参考 `这份文档 `_ 里列出的各个\"组件参考\"章节。" msgstr "" "Details about PAI commands can be found in chapters about different " -"components linked in `this page `_ 。" #: ../../source/base-models.rst:95 @@ -217,8 +223,8 @@ msgstr "Offline models" #: ../../source/base-models.rst:97 msgid "" -"离线模型是 XFlow 分类 / 回归算法输出的模型。用户可以使用 PyODPS ML 或直接使用 odps.run_xflow " -"创建一个离线模型,例如下面使用 run_xflow 的例子:" +"离线模型是 XFlow 分类 / 回归算法输出的模型。用户可以使用 PyODPS ML 或直接" +"使用 odps.run_xflow 创建一个离线模型,例如下面使用 run_xflow 的例子:" msgstr "" "Offline models are outputs of XFlow classification or regression " "algorithms. You can directly call odps.run_xflow to create an offline " diff --git a/docs/source/locale/en/LC_MESSAGES/base-projects.po b/docs/source/locale/en/LC_MESSAGES/base-projects.po index a18ac1c2..ca61e6bd 100644 --- a/docs/source/locale/en/LC_MESSAGES/base-projects.po +++ b/docs/source/locale/en/LC_MESSAGES/base-projects.po @@ -15,7 +15,7 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: Babel 2.9.1\n" +"Generated-By: Babel 2.12.1\n" #: ../../source/base-projects.rst:4 msgid "项目空间" @@ -23,8 +23,8 @@ msgstr "Projects" #: ../../source/base-projects.rst:6 msgid "" -"`项目空间 `_ " -"是ODPS的基本组织单元, 有点类似于Database的概念。" +"`项目空间 `_ 是ODPS的" +"基本组织单元, 有点类似于Database的概念。" msgstr "" "A `project `_ " "is a basic organizational unit of MaxCompute, which is similar to a " diff --git a/docs/source/locale/en/LC_MESSAGES/base-resources.po b/docs/source/locale/en/LC_MESSAGES/base-resources.po index 82d4afbd..3eb6426d 100644 --- a/docs/source/locale/en/LC_MESSAGES/base-resources.po +++ b/docs/source/locale/en/LC_MESSAGES/base-resources.po @@ -15,7 +15,7 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: Babel 2.9.1\n" +"Generated-By: Babel 2.12.1\n" #: ../../source/base-resources.rst:4 msgid "资源" @@ -23,14 +23,16 @@ msgstr "Resources" #: ../../source/base-resources.rst:6 msgid "" -"`资源 `_ " -"在ODPS上常用在UDF和MapReduce中。" +"`资源 `_ 在ODPS上常用" +"在UDF和MapReduce中。" msgstr "" "`Resources `_ " "commonly apply to UDF and MapReduce on MaxCompute." #: ../../source/base-resources.rst:8 -msgid "在PyODPS中,主要支持两种资源类型,一种是文件,另一种是表。它们的基本操作(列举和删除)相同,但创建和修改方法略有差异,下面分别说明。" +msgid "" +"在PyODPS中,主要支持两种资源类型,一种是文件,另一种是表。它们的基本操作" +"(列举和删除)相同,但创建和修改方法略有差异,下面分别说明。" msgstr "" "PyODPS mainly supports two resource types, namely, file resources and " "table resources. They share same iteration and deletion operations, while" @@ -44,8 +46,9 @@ msgstr "Basic operations" #: ../../source/base-resources.rst:13 msgid "" -"列出所有资源还是可以使用 ``list_resources``,判断资源是否存在使用 ``exist_resource``。 删除资源时,可以调用" -" ``delete_resource``,或者直接对于Resource对象调用 ``drop`` 方法。" +"列出所有资源还是可以使用 ``list_resources``,判断资源是否存在使用 ``exist" +"_resource``。 删除资源时,可以调用 ``delete_resource``,或者直接对于" +"Resource对象调用 ``drop`` 方法。" msgstr "" "You can use ``list_resources`` to list all resources and use " "``exist_resource`` to check whether a resource exists. 
You can call " @@ -84,8 +87,8 @@ msgstr "" #: ../../source/base-resources.rst:36 msgid "" -"删除给定资源,可以使用 ODPS 入口对象的 ``delete_resource`` 方法,也可以使用 ``Resource`` 对象自己的 " -"``drop`` 方法。" +"删除给定资源,可以使用 ODPS 入口对象的 ``delete_resource`` 方法,也可以" +"使用 ``Resource`` 对象自己的 ``drop`` 方法。" msgstr "" "To delete certain resources, you may use ``delete_resource`` method of " "ODPS entrance object, or use ``drop`` method of the ``Resource`` object." @@ -117,7 +120,9 @@ msgid "创建文件资源" msgstr "Create a file resource" #: ../../source/base-resources.rst:54 -msgid "创建文件资源可以通过给定资源名、文件类型、以及一个file-like的对象(或者是字符串对象)来创建,比如" +msgid "" +"创建文件资源可以通过给定资源名、文件类型、以及一个file-like的对象(或者是" +"字符串对象)来创建,比如" msgstr "" "You can create a file resource by specifying the resource name, file " "type, and a file-like object (or a string object), as shown in the " @@ -125,14 +130,16 @@ msgstr "" #: ../../source/base-resources.rst:56 msgid "" -"# 使用 file-like 的对象创建文件资源,注意压缩包等文件需要用二进制模式读取\n" +"# 使用 file-like 的对象创建文件资源,注意压缩包等文件需要用二进制模式读取" +"\n" "resource = o.create_resource('test_file_resource', 'file', " "fileobj=open('/to/path/file', 'rb'))\n" "# 使用字符串\n" "resource = o.create_resource('test_py_resource', 'py', fileobj='import " "this')" msgstr "" -"# File-like objects as file content. Use binary mode to read source file.\n" +"# File-like objects as file content. Use binary mode to read source file." +"\n" "resource = o.create_resource('test_file_resource', 'file', " "fileobj=open('/to/path/file')) \n" "# Strings as file content.\n" @@ -150,15 +157,18 @@ msgid "" msgstr "" #: ../../source/base-resources.rst:72 -msgid "在 fileobj 参数中传入字符串,创建的资源内容为 **字符串本身** 而非字符串代表的路径指向的文件。" +msgid "" +"在 fileobj 参数中传入字符串,创建的资源内容为 **字符串本身** 而非字符串" +"代表的路径指向的文件。" msgstr "" "When ``fileobj`` is a string, the content of the created resource is the " "string itself, not the content of the file the string point to." #: ../../source/base-resources.rst:74 msgid "" -"如果文件过大(例如大小超过 64MB),PyODPS 可能会使用分块上传模式,而这不被旧版 MaxCompute 部署所支持。 如需在旧版 " -"MaxCompute 中上传大文件,请配置 ``options.upload_resource_in_chunks = False`` 。" +"如果文件过大(例如大小超过 64MB),PyODPS 可能会使用分块上传模式,而这不" +"被旧版 MaxCompute 部署所支持。 如需在旧版 MaxCompute 中上传大文件,请配置" +" ``options.upload_resource_in_chunks = False`` 。" msgstr "" "If the size of file to upload is over certain size (for instance, 64MB), " "PyODPS might upload the file in parts, which is not supported in old " @@ -171,8 +181,9 @@ msgstr "Read and modify a file resource" #: ../../source/base-resources.rst:79 msgid "" -"对文件资源调用 ``open`` 方法,或者在 MaxCompute 入口调用 ``open_resource`` 都能打开一个资源, " -"打开后的对象会是 file-like 的对象。 类似于Python内置的 ``open`` 方法,文件资源也支持打开的模式。我们看例子:" +"对文件资源调用 ``open`` 方法,或者在 MaxCompute 入口调用 ``open_resource`" +"` 都能打开一个资源, 打开后的对象会是 file-like 的对象。 类似于Python内置" +"的 ``open`` 方法,文件资源也支持打开的模式。我们看例子:" msgstr "" "You can call the ``open`` method for a file resource or call " "``open_resource`` at the MaxCompute entry to open a file resource. 
The " @@ -187,8 +198,8 @@ msgid "" ">>> lines = fp.readlines() # 读成多行\n" ">>> fp.write('Hello World') # 报错,读模式下无法写资源\n" ">>>\n" -">>> with o.open_resource('test_file_resource', mode='r+') as fp: # " -"读写模式打开\n" +">>> with o.open_resource('test_file_resource', mode='r+') as fp: # 读写" +"模式打开\n" ">>> fp.read()\n" ">>> fp.tell() # 当前位置\n" ">>> fp.seek(10)\n" @@ -248,8 +259,9 @@ msgstr "" #: ../../source/base-resources.rst:109 msgid "" -"同时,PyODPS中,文件资源支持以二进制模式打开,打开如说一些压缩文件等等就需要以这种模式, 因此 ``rb`` " -"就是指以二进制读模式打开文件,``r+b`` 是指以二进制读写模式打开。" +"同时,PyODPS中,文件资源支持以二进制模式打开,打开如说一些压缩文件等等就" +"需要以这种模式, 因此 ``rb`` 就是指以二进制读模式打开文件,``r+b`` 是指以" +"二进制读写模式打开。" msgstr "" "In PyODPS, file resources can be opened in binary mode. For example, some" " compressed files must be opened in binary mode. ``rb`` indicates opening" @@ -258,15 +270,16 @@ msgstr "" #: ../../source/base-resources.rst:112 msgid "" -"对于较大的文件资源,可以使用流式方式读写文件,使用方法为在调用 ``open_resource`` 时增加一个 ``stream=True`` " -"选项:" +"对于较大的文件资源,可以使用流式方式读写文件,使用方法为在调用 ``open_" +"resource`` 时增加一个 ``stream=True`` 选项:" msgstr "" "For large file resources, you may read or write them in streams by adding" " a ``stream=True`` argument in ``open_resource`` calls." #: ../../source/base-resources.rst:115 msgid "" -">>> with o.open_resource('test_file_resource', mode='w') as fp: # 写模式打开\n" +">>> with o.open_resource('test_file_resource', mode='w') as fp: # 写模式" +"打开\n" ">>> fp.writelines(['Hello\\n', 'World\\n']) # 写入多行\n" ">>> fp.write('Hello World')\n" ">>> fp.flush() # 手动调用会将更新提交到 MaxCompute\n" @@ -290,7 +303,9 @@ msgstr "" ">>> lines = fp.readlines() # read multiple lines" #: ../../source/base-resources.rst:127 -msgid "当 ``stream=True`` 时,只支持 ``r`` , ``rb`` , ``w`` , ``wb`` 四种模式。" +msgid "" +"当 ``stream=True`` 时,只支持 ``r`` , ``rb`` , ``w`` , ``wb`` 四种模式" +"。" msgstr "" "When ``stream=True`` is specified, only ``r``, ``rb``, ``w`` and ``wb`` " "are supported in ``mode``." diff --git a/docs/source/locale/en/LC_MESSAGES/base-schemas.po b/docs/source/locale/en/LC_MESSAGES/base-schemas.po index 6d161e12..6aa5e03d 100644 --- a/docs/source/locale/en/LC_MESSAGES/base-schemas.po +++ b/docs/source/locale/en/LC_MESSAGES/base-schemas.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: PyODPS 0.11.3\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2023-08-02 20:29+0800\n" +"POT-Creation-Date: 2024-08-29 18:44+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -23,9 +23,9 @@ msgstr "" #: ../../source/base-schemas.rst:8 msgid "" -"Schema 属于 MaxCompute 的公测功能,需要通过 `新功能测试申请 " -"`_ 开通。使用 Schema 需要 " -"PyODPS 0.11.3 以上版本。" +"Schema 属于 MaxCompute 的公测功能,需要通过 `新功能测试申请 `_ 开通。使用 Schema 需要 PyODPS " +"0.11.3 以上版本。" msgstr "" "Schema is a beta function of MaxCompute. You need to `apply for a trial " "of new features `_ 是 " -"MaxCompute 介于项目和表 / 资源 / 函数之间的概念,对表 / 资源 / 函数进行进一步归类。" +"MaxCompute 介于项目和表 / 资源 / 函数之间的概念,对表 / 资源 / 函数进行" +"进一步归类。" msgstr "" "`Schema `_ is a " "concept between projects and objects like tables, resources or functions." @@ -48,7 +49,8 @@ msgstr "Basic operations" #: ../../source/base-schemas.rst:16 msgid "你可以使用 ``exist_schema`` 判断 Schema 对象是否存在:" msgstr "" -"You may use ``exist_schema`` to check if the schema with specific name exists." +"You may use ``exist_schema`` to check if the schema with specific name " +"exists." 
#: ../../source/base-schemas.rst:18 msgid "print(o.exist_schema(\"test_schema\"))" @@ -74,8 +76,7 @@ msgstr "" #: ../../source/base-schemas.rst:35 msgid "使用 ``get_schema`` 获得一个 Schema 对象并打印 Schema Owner:" -msgstr "" -"Use ``get_schema`` to obtain a schema object and print its owner." +msgstr "Use ``get_schema`` to obtain a schema object and print its owner." #: ../../source/base-schemas.rst:37 msgid "" @@ -86,7 +87,8 @@ msgstr "" #: ../../source/base-schemas.rst:42 msgid "使用 ``list_schema`` 列举所有 Schema 对象并打印名称:" msgstr "" -"Use ``list_schema`` to list all schemas in s project and print their names." +"Use ``list_schema`` to list all schemas in s project and print their " +"names." #: ../../source/base-schemas.rst:44 msgid "" @@ -100,8 +102,9 @@ msgstr "Handling objects in Schema" #: ../../source/base-schemas.rst:51 msgid "" -"在开启 Schema 后,MaxCompute 入口对象默认操作的 MaxCompute 对象都位于名为 ``DEFAULT`` 的 Schema" -" 下。为操作其他 Schema 下的对象,需要在创建入口对象时指定 Schema,例如:" +"在开启 Schema 后,MaxCompute 入口对象默认操作的 MaxCompute 对象都位于名为" +" ``DEFAULT`` 的 Schema 下。为操作其他 Schema 下的对象,需要在创建入口对象" +"时指定 Schema,例如:" msgstr "" "After schemas are enabled, calls on your MaxCompute entrance only affects" " objects in the schema named ``DEFAULT`` by default. To handle objects in" @@ -139,7 +142,9 @@ msgstr "" ")" #: ../../source/base-schemas.rst:69 -msgid "也可以为不同对象的操作方法指定 ``schema`` 参数。例如,下面的方法列举了 ``test_schema`` 下所有的表:" +msgid "" +"也可以为不同对象的操作方法指定 ``schema`` 参数。例如,下面的方法列举了 ``" +"test_schema`` 下所有的表:" msgstr "" "You can also specify names of schemas when handling MaxCompute objects. " "For instance, the code below lists all tables under the schema " @@ -152,35 +157,58 @@ msgid "" msgstr "" #: ../../source/base-schemas.rst:77 +msgid "下列方法给出了如何从 ``test_schema`` 获取表 ``dual`` 并输出表结构:" +msgstr "" +"The code below gets a table named ``dual`` under schema named " +"``test_schema``and outputs its structure." + +#: ../../source/base-schemas.rst:79 +msgid "" +"table = o.get_table('dual', schema='test_schema')\n" +"print(table.table_schema)" +msgstr "" + +#: ../../source/base-schemas.rst:84 msgid "在执行 SQL 时,可以指定默认 Schema:" msgstr "" "You can also specify name of the default schema when executing SQL " "statements." -#: ../../source/base-schemas.rst:79 +#: ../../source/base-schemas.rst:86 msgid "o.execute_sql(\"SELECT * FROM dual\", default_schema=\"test_schema\")" msgstr "" -#: ../../source/base-schemas.rst:83 +#: ../../source/base-schemas.rst:90 msgid "" -"对于表而言,如果项目空间没有启用 Schema,``get_table`` 方法对于 ``x.y`` 形式的表名,默认按照 " -"``project.table`` 处理。如果当前租户开启了 ``odps.namespace.schema`` 配置,``get_table``" -" 会将 ``x.y`` 作为 ``schema.table`` 处理,否则依然按照 ``project.table`` 处理。如果租户上 " -"没有配置该选项,可以配置 ``options.always_enable_schema = True``,此后所有 ``x.y`` 都将被作为 " -"``schema.table`` 处理:" +"对于表而言,如果项目空间没有启用 Schema,``get_table`` 方法对于 ``x.y`` " +"形式的表名,默认按照 ``project.table`` 处理。如果当前租户开启了\\ `租户级" +"语法开关 `_\\ ,\\ ``get_table`` 会将 ``x.y`` 作为 ``schema.table`` " +"处理,否则依然按照 ``project.table`` 处理。如果租户上没有配置该选项,可以" +"配置 ``options.enable_schema = True``,此后所有 ``x.y`` 都将被作为 ``" +"schema.table`` 处理:" msgstr "" "For tables, if schema is not enabled in project, ``get_table`` will " -"handle ``x.y`` as ``project.table``. When ``odps.namespace.schema`` is " -"enabled for current tenant, ``get_table`` will handle ``x.y`` as " -"``schema.table``, or it will be still handled as ``project.table``. If " -"the option is not specified, you may configure " -"``options.always_enable_schema = True`` in your Python code and then all " -"table names like ``x.y`` will be handled as ``schema.table``." 
- -#: ../../source/base-schemas.rst:89 +"handle ``x.y`` as ``project.table``. When `tenant-level information " +"schema syntax `_ is enabled for current tenant, ``get_table`` will " +"handle ``x.y`` as ``schema.table``, or it will be still handled as " +"``project.table``. If the option is not specified, you may configure " +"``options.enable_schema = True`` in your Python code and then all table " +"names like ``x.y`` will be handled as ``schema.table``." + +#: ../../source/base-schemas.rst:96 msgid "" "from odps import options\n" -"options.always_enable_schema = True\n" +"options.enable_schema = True\n" "print(o.get_table(\"myschema.mytable\"))" msgstr "" +#: ../../source/base-schemas.rst:104 +msgid "" +"``options.enable_schema`` 自 PyODPS 0.12.0 开始支持,低版本 PyODPS 需要" +"使用 ``options.always_enable_schema``。" +msgstr "" +"``options.enable_schema`` is supported since PyODPS 0.12.0. " +"``options.always_enable_schema`` should be used in lower versions." + diff --git a/docs/source/locale/en/LC_MESSAGES/base-sql.po b/docs/source/locale/en/LC_MESSAGES/base-sql.po index a6ff7936..3a933f01 100644 --- a/docs/source/locale/en/LC_MESSAGES/base-sql.po +++ b/docs/source/locale/en/LC_MESSAGES/base-sql.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: PyODPS 0.7.16\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2024-01-24 15:30+0800\n" +"POT-Creation-Date: 2024-08-31 12:41+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -23,9 +23,9 @@ msgstr "SQL" #: ../../source/base-sql.rst:6 msgid "" -"PyODPS支持ODPS SQL的查询,并可以读取执行的结果。 ``execute_sql`` / " -"``execute_sql_interactive`` / ``run_sql`` / ``run_sql_interactive`` " -"方法的返回值是 :ref:`运行实例 ` 。" +"PyODPS支持ODPS SQL的查询,并可以读取执行的结果。 ``execute_sql`` / ``" +"execute_sql_interactive`` / ``run_sql`` / ``run_sql_interactive`` 方法的" +"返回值是 :ref:`运行实例 ` 。" msgstr "" "PyODPS supports MaxCompute SQL queries and provides methods to read SQL " "results. All ``execute_sql`` , ``execute_sql_interactive`` , ``run_sql`` " @@ -33,9 +33,10 @@ msgstr "" #: ../../source/base-sql.rst:11 msgid "" -"并非所有在 ODPS Console 中可以执行的命令都是 ODPS 可以接受的 SQL 语句。 在调用非 DDL / DML " -"语句时,请使用其他方法,例如 GRANT / REVOKE 等语句请使用 ``run_security_query`` 方法,PAI 命令请使用 " -"``run_xflow`` 或 ``execute_xflow`` 方法。" +"并非所有在 ODPS Console 中可以执行的命令都是 ODPS 可以接受的 SQL 语句。 " +"在调用非 DDL / DML 语句时,请使用其他方法,例如 GRANT / REVOKE 等语句请" +"使用 ``run_security_query`` 方法,PAI 命令请使用 ``run_xflow`` 或 ``" +"execute_xflow`` 方法。" msgstr "" "The commands that are executable in the MaxCompute Console may not be " "executed as SQL statements in MaxCompute. Use other methods to execute " @@ -49,8 +50,9 @@ msgstr "Execute SQL statements" #: ../../source/base-sql.rst:20 msgid "" -"你可以使用 ``execute_sql`` 方法以同步方式执行 SQL。调用时,该方法会阻塞直至 SQL 执行完成,并返回一个 Instance " -"实例。如果 SQL 执行报错,该方法会抛出以 ``odps.errors.ODPSError`` 为基类的错误。" +"你可以使用 ``execute_sql`` 方法以同步方式执行 SQL。调用时,该方法会阻塞" +"直至 SQL 执行完成,并返回一个 Instance 实例。如果 SQL 执行报错,该方法会" +"抛出以 ``odps.errors.ODPSError`` 为基类的错误。" msgstr "" "You can use ``execute_sql`` to run SQL and wait for finish. The method " "will block until execution is finished and returns an instance object. If" @@ -58,16 +60,19 @@ msgstr "" "on ``odps.errors.ODPSError`` class." 
#: ../../source/base-sql.rst:23 -msgid ">>> o.execute_sql('select * from dual') # 同步的方式执行,会阻塞直到SQL执行完成" +msgid "" +">>> o.execute_sql('select * from dual') # 同步的方式执行,会阻塞直到SQL" +"执行完成" msgstr "" ">>> o.execute_sql('select * from dual') # synchronous way, will block " "till SQL statement finishes execution\n" #: ../../source/base-sql.rst:27 msgid "" -"你也可以使用非阻塞方式异步执行 SQL。调用时,该方法在将 SQL 提交到 MaxCompute 后即返回 Instance 实例。你需要使用 " -"``wait_for_success`` 方法等待该 SQL 执行完成。同样地,如果 instance 出现错误, " -"``wait_for_success`` 会抛出以 ``odps.errors.ODPSError`` 为基类的错误。" +"你也可以使用非阻塞方式异步执行 SQL。调用时,该方法在将 SQL 提交到 " +"MaxCompute 后即返回 Instance 实例。你需要使用 ``wait_for_success`` 方法" +"等待该 SQL 执行完成。同样地,如果 instance 出现错误, ``wait_for_success`" +"` 会抛出以 ``odps.errors.ODPSError`` 为基类的错误。" msgstr "" "You can also use non-blocking method to run SQL. The method will submit " "your SQL statement to MaxCompute and return the corresponding Instance " @@ -88,8 +93,8 @@ msgstr "" #: ../../source/base-sql.rst:37 msgid "" -"关于如何操作 run_sql / execute_sql 返回的 Instance 实例,可以参考 :ref:`运行实例 `" -" 。" +"关于如何操作 run_sql / execute_sql 返回的 Instance 实例,可以参考 :ref:`" +"运行实例 ` 。" msgstr "" "You can take a look at :ref:`instances ` for more information " "on instance objects returned by run_sql or execute_sql method." @@ -101,8 +106,9 @@ msgstr "Execute SQL with MCQA acceleration" #: ../../source/base-sql.rst:41 msgid "" "`MCQA `_ 是 " -"MaxCompute 提供的查询加速功能, 支持使用独立资源池对中小规模数据进行加速。PyODPS 从 0.11.4.1 开始支持以下列方式通过 " -"MCQA 执行 SQL ,同时需要 MaxCompute 具备 MCQA 的支持。" +"MaxCompute 提供的查询加速功能, 支持使用独立资源池对中小规模数据进行加速" +"。PyODPS 从 0.11.4.1 开始支持以下列方式通过 MCQA 执行 SQL ,同时需要 " +"MaxCompute 具备 MCQA 的支持。" msgstr "" "`MCQA `_ is the " "query acceleration service provided by MaxCompute, which supports " @@ -113,8 +119,9 @@ msgstr "" #: ../../source/base-sql.rst:45 msgid "" -"你可以使用 ``execute_sql_interactive`` 通过 MCQA 执行 SQL 并返回 MCQA Instance。如果 " -"MCQA 无法执行相应的 SQL ,会自动回退到传统模式。此时,函数返回的 Instance 为回退后的 Instance。" +"你可以使用 ``execute_sql_interactive`` 通过 MCQA 执行 SQL 并返回 MCQA " +"Instance。如果 MCQA 无法执行相应的 SQL ,会自动回退到传统模式。此时,函数" +"返回的 Instance 为回退后的 Instance。" msgstr "" "You can use ``execute_sql_interactive`` to execute SQL with MCQA and " "return MCQA Instance object. If MCQA does not support your SQL statement," @@ -127,15 +134,15 @@ msgstr "" #: ../../source/base-sql.rst:52 msgid "" -"如果不希望回退,可以指定参数 ``fallback=False``。也可以指定为回退策略(或回退策略的组合,使用逗号分隔的字符串)。 " -"可用的策略名如下。默认策略为 ``all`` (即 " -"``generic,unsupported,upgrading,noresource,timeout`` )。" +"如果不希望回退,可以指定参数 ``fallback=False``。也可以指定为回退策略(或" +"回退策略的组合,使用逗号分隔的字符串)。 可用的策略名如下。默认策略为 ``" +"all`` (即 ``generic,unsupported,upgrading,noresource,timeout`` )。" msgstr "" "If you don't want the method to fallback automatically, you can specify " "``fallback=False``. You can also specify a fallback policy or combination" " of fallback policies with a comma-separated string. Available policy " -"names are listed below. The default policy is ``all``, which is an alias" -" of a combination of ``generic,unsupported,upgrading,noresource,timeout``." +"names are listed below. The default policy is ``all``, which is an alias " +"of a combination of ``generic,unsupported,upgrading,noresource,timeout``." 
#: ../../source/base-sql.rst:55 msgid "``generic`` :指定时,表示发生未知错误时回退到传统模式。" @@ -181,9 +188,10 @@ msgstr "" #: ../../source/base-sql.rst:67 msgid "" -"你也可以使用 ``run_sql_interactive`` 通过 MCQA 异步执行 SQL。类似 " -"``run_sql``,该方法会在提交任务后即返回 MCQA Instance,你需要自行等待 Instance " -"完成。需要注意的是,该方法不会自动回退。当执行失败时,你需要自行重试或执行 ``execute_sql``。" +"你也可以使用 ``run_sql_interactive`` 通过 MCQA 异步执行 SQL。类似 ``run_" +"sql``,该方法会在提交任务后即返回 MCQA Instance,你需要自行等待 Instance " +"完成。需要注意的是,该方法不会自动回退。当执行失败时,你需要自行重试或" +"执行 ``execute_sql``。" msgstr "" "You can also use ``run_sql_interactive`` to run SQL with MCQA. The method" " returns MCQA Instance once your SQL is submitted to the cluster, and you" @@ -193,7 +201,8 @@ msgstr "" #: ../../source/base-sql.rst:71 msgid "" -">>> instance = o.run_sql_interactive('select * from dual') # 异步的方式执行\n" +">>> instance = o.run_sql_interactive('select * from dual') # 异步的方式" +"执行\n" ">>> print(instance.get_logview_address()) # 获取logview地址\n" ">>> instance.wait_for_success() # 阻塞直到完成" msgstr "" @@ -207,7 +216,9 @@ msgid "设置时区" msgstr "Set timezone" #: ../../source/base-sql.rst:81 -msgid "有时我们希望对于查询出来的时间数据显示为特定时区下的时间,可以通过 ``options.local_timezone`` 设置客户端的时区。" +msgid "" +"有时我们希望对于查询出来的时间数据显示为特定时区下的时间,可以通过 ``" +"options.local_timezone`` 设置客户端的时区。" msgstr "" "Sometimes we want to display the queried time data with a correct " "timezone. We can set it via ``options.local_timezone``." @@ -260,8 +271,9 @@ msgstr "" #: ../../source/base-sql.rst:112 msgid "" -"设置 ``options.local_timezone`` 后,PyODPS 会根据它的值自动设置 ``odps.sql.timezone``。 " -"两者的值不同可能导致服务端和客户端时间不一致,因此不应再手动设置 ``odps.sql.timezone``。" +"设置 ``options.local_timezone`` 后,PyODPS 会根据它的值自动设置 ``odps." +"sql.timezone``。 两者的值不同可能导致服务端和客户端时间不一致,因此不应再" +"手动设置 ``odps.sql.timezone``。" msgstr "" "After setting ``options.local_timezone``, PyODPS will set " "``odps.sql.timezone`` according to it automatically. The difference of " @@ -274,9 +286,9 @@ msgstr "Set runtime parameters" #: ../../source/base-sql.rst:118 msgid "" -"有时,我们在运行时,需要设置运行时参数,我们可以通过设置 ``hints`` 参数,参数类型是 dict。该参数对 ``execute_sql``" -" / ``execute_sql_interactive`` / ``run_sql`` / ``run_sql_interactive`` " -"均有效。" +"有时,我们在运行时,需要设置运行时参数,我们可以通过设置 ``hints`` 参数," +"参数类型是 dict。该参数对 ``execute_sql`` / ``execute_sql_interactive`` /" +" ``run_sql`` / ``run_sql_interactive`` 均有效。" msgstr "" "You can use the ``hints`` parameter to set runtime parameters. The " "parameter is a dict type which is supported for ``execute_sql``, " @@ -284,38 +296,52 @@ msgstr "" #: ../../source/base-sql.rst:121 msgid "" -">>> o.execute_sql('select * from pyodps_iris', " -"hints={'odps.sql.mapper.split.size': 16})" +">>> hints = {'odps.stage.mapper.split.size': 16, " +"'odps.sql.reducer.instances': 1024}\n" +">>> o.execute_sql('select * from pyodps_iris', hints=hints)" msgstr "" +">>> hints = {'odps.stage.mapper.split.size': 16, " +"'odps.sql.reducer.instances': 1024}\n" +">>> o.execute_sql('select * from pyodps_iris', hints=hints)" -#: ../../source/base-sql.rst:125 -msgid "我们可以对于全局配置设置sql.settings后,每次运行时则都会添加相关的运行时参数。" +#: ../../source/base-sql.rst:126 +msgid "" +"我们可以对于全局配置设置sql.settings后,每次运行时则都会添加相关的运行时" +"参数。" msgstr "" "You can set sql.settings globally. The relevant runtime parameters are " "automatically added during each execution." 
-#: ../../source/base-sql.rst:127 +#: ../../source/base-sql.rst:128 msgid "" ">>> from odps import options\n" -">>> options.sql.settings = {'odps.sql.mapper.split.size': 16}\n" +">>> options.sql.settings = {\n" +">>> 'odps.stage.mapper.split.size': 16,\n" +">>> 'odps.sql.reducer.instances': 1024,\n" +">>> }\n" ">>> o.execute_sql('select * from pyodps_iris') # 会根据全局配置添加hints" msgstr "" ">>> from odps import options\n" -">>> options.sql.settings = {'odps.sql.mapper.split.size': 16}\n" +">>> options.sql.settings = {\n" +">>> 'odps.stage.mapper.split.size': 16,\n" +">>> 'odps.sql.reducer.instances': 1024,\n" +">>> }\n" ">>> o.execute_sql('select * from pyodps_iris') # global hints configured" " in options.sql.settings will be added" -#: ../../source/base-sql.rst:136 -msgid "读取SQL执行结果" +#: ../../source/base-sql.rst:140 +msgid "读取 SQL 执行结果" msgstr "View SQL results" -#: ../../source/base-sql.rst:138 -msgid "运行 SQL 的 instance 能够直接执行 ``open_reader`` 的操作,一种情况是SQL返回了结构化的数据。" +#: ../../source/base-sql.rst:142 +msgid "" +"运行 SQL 的 instance 能够直接执行 ``open_reader`` 的操作,一种情况是SQL" +"返回了结构化的数据。" msgstr "" "You can execute the ``open_reader`` method to retrieve SQL execution " "results. In the following example, structured data is returned. " -#: ../../source/base-sql.rst:140 +#: ../../source/base-sql.rst:144 msgid "" ">>> with o.execute_sql('select * from dual').open_reader() as reader:\n" ">>> for record in reader:\n" @@ -325,25 +351,29 @@ msgstr "" ">>> for record in reader:\n" ">>> # process every record" -#: ../../source/base-sql.rst:146 -msgid "另一种情况是 SQL 可能执行的比如 ``desc``,这时通过 ``reader.raw`` 属性取到原始的SQL执行结果。" +#: ../../source/base-sql.rst:150 +msgid "" +"另一种情况是 SQL 可能执行的比如 ``desc``,这时通过 ``reader.raw`` 属性取" +"到原始的SQL执行结果。" msgstr "" "When commands such as ``desc`` are executed, you can use the " "``reader.raw`` attribute to get the original execution results. " -#: ../../source/base-sql.rst:148 +#: ../../source/base-sql.rst:152 msgid "" ">>> with o.execute_sql('desc dual').open_reader() as reader:\n" ">>> print(reader.raw)" msgstr "" -#: ../../source/base-sql.rst:153 +#: ../../source/base-sql.rst:157 msgid "" -"如果 `options.tunnel.use_instance_tunnel == True`,在调用 open_reader 时,PyODPS " -"会默认调用 Instance Tunnel, 否则会调用旧的 Result 接口。如果你使用了版本较低的 MaxCompute 服务,或者调用 " -"Instance Tunnel 出现了问题,PyODPS 会给出警告并自动降级到旧的 Result 接口,可根据警告信息判断导致降级的原因。如果 " -"Instance Tunnel 的结果不合预期, 请将该选项设为 `False`。在调用 open_reader 时,也可以使用 " -"``tunnel`` 参数来指定使用何种结果接口,例如" +"如果 `options.tunnel.use_instance_tunnel == True`,在调用 open_reader 时" +",PyODPS 会默认调用 Instance Tunnel, 否则会调用旧的 Result 接口。如果你" +"使用了版本较低的 MaxCompute 服务,或者调用 Instance Tunnel 出现了问题," +"PyODPS 会给出警告并自动降级到旧的 Result 接口,可根据警告信息判断导致降级" +"的原因。如果 Instance Tunnel 的结果不合预期, 请将该选项设为 `False`。在" +"调用 open_reader 时,也可以使用 ``tunnel`` 参数来指定使用何种结果接口," +"例如" msgstr "" "If `options.tunnel.use_instance_tunnel` is set to `True` when open_reader" " has been executed, PyODPS calls Instance Tunnel by default. If " @@ -356,7 +386,7 @@ msgstr "" "open_reader, you can also use the ``tunnel`` parameter to specify which " "result interface to use. 
For example:" -#: ../../source/base-sql.rst:158 +#: ../../source/base-sql.rst:162 msgid "" ">>> # 使用 Instance Tunnel\n" ">>> with o.execute_sql('select * from dual').open_reader(tunnel=True) as " @@ -380,14 +410,16 @@ msgstr "" ">>> for record in reader:\n" ">>> # process every record" -#: ../../source/base-sql.rst:169 +#: ../../source/base-sql.rst:173 msgid "" -"PyODPS 默认不限制能够从 Instance 读取的数据规模,但 Project Owner 可能在 MaxCompute Project " -"上增加保护设置以限制对 Instance 结果的读取,此时只能使用受限读取模式读取数据,在此模式下可读取的行数受到 Project " -"配置限制,通常为 10000 行。如果 PyODPS 检测到读取 Instance 数据被限制,且 " -"`options.tunnel.limit_instance_tunnel` 未设置,会自动启用受限读取模式。 如果你的 Project " -"被保护,想要手动启用受限读取模式,可以为 `open_reader` 方法增加 `limit=True` 选项,或者设置 " -"`options.tunnel.limit_instance_tunnel = True` 。" +"PyODPS 默认不限制能够从 Instance 读取的数据规模,但 Project Owner 可能在 " +"MaxCompute Project 上增加保护设置以限制对 Instance 结果的读取,此时只能" +"使用受限读取模式读取数据,在此模式下可读取的行数受到 Project 配置限制," +"通常为 10000 行。如果 PyODPS 检测到读取 Instance 数据被限制,且 ``options" +".tunnel.limit_instance_tunnel`` 未设置,会自动启用受限读取模式。 如果你的" +" Project 被保护,想要手动启用受限读取模式,可以为 ``open_reader`` 方法" +"增加 ``limit=True`` 选项,或者设置 ``options.tunnel.limit_instance_tunnel" +" = True`` 。" msgstr "" "By default, PyODPS does not limit the size of data that can be read from " "an Instance. However, project owners might add protection configuration " @@ -395,41 +427,44 @@ msgstr "" "be permitted to read under limit mode which limits number of rows to read" " given configuration in the project, which is 10000 rows. If PyODPS " "detects the existence of read limit while " -"`options.tunnel.limit_instance_tunnel` is not set, limit mode is " +"``options.tunnel.limit_instance_tunnel`` is not set, limit mode is " "automatically enabled and number of downloadable records is limited. If " "your project is protected and want to enable limit mode manually, you can" -" add `limit=True` option to `open_reader`, or set " -"`options.tunnel.limit_instance_tunnel` to `True`." +" add ``limit=True`` option to ``open_reader``, or set " +"``options.tunnel.limit_instance_tunnel = True``." -#: ../../source/base-sql.rst:175 +#: ../../source/base-sql.rst:179 msgid "" -"在部分环境中,例如 DataWorks,`options.tunnel.limit_instance_tunnel` 可能默认被置为 " -"True。此时,如果需要读取所有数据,需要为 `open_reader` 增加参数 `tunnel=True, limit=False` " -"。需要注意的是,如果 Project 本身被保护,这两个参数 **不能** 解除保护,此时应联系 Project Owner 开放相应的读权限。" +"在部分环境中,例如 DataWorks,``options.tunnel.limit_instance_tunnel`` " +"可能默认被置为 True。此时,如果需要读取\\ 所有数据,需要为 ``open_reader`" +"` 增加参数 `tunnel=True, limit=False` 。需要注意的是,如果 Project 本身被" +"保护,\\ 这两个参数\\ **不能**\\ 解除保护,此时应联系 Project Owner 开放" +"相应的读权限。" msgstr "" "In some environments, for instance, " -"`options.tunnel.limit_instance_tunnel` might be set to True for " +"``options.tunnel.limit_instance_tunnel`` might be set to True for " "compatibility. In those environments, if you want to read all data, you " -"need to add arguments `tunnel=True, limit=False` for `open_reader` " +"need to add arguments `tunnel=True, limit=False` for ``open_reader`` " "method. Note that these two arguments will **NOT** lift read limitation " "on your project. If you still meet read limitations, please ask your " "project owner to grant read privileges for you." 
-#: ../../source/base-sql.rst:179 +#: ../../source/base-sql.rst:183 msgid "" -"如果你所使用的 MaxCompute 只能支持旧 Result 接口,同时你需要读取所有数据,可将 SQL 结果写入另一张表后用读表接口读取 " -"(可能受到 Project 安全设置的限制)。" +"如果你所使用的 MaxCompute 只能支持旧 Result 接口,同时你需要读取所有数据" +",可将 SQL 结果写入另一张表后用读表接口读取 (可能受到 Project 安全设置的" +"限制)。" msgstr "" "If the MaxCompute version you are using only supports the old Result " "interface, and you need to read all data, you can export the SQL results " "to another table and use these methods to read data. This may be limited " "by project security settings." -#: ../../source/base-sql.rst:182 +#: ../../source/base-sql.rst:186 msgid "同时,PyODPS 支持直接将运行结果数据读成 pandas DataFrame。" msgstr "PyODPS also supports reading data as pandas DataFrames." -#: ../../source/base-sql.rst:184 +#: ../../source/base-sql.rst:188 msgid "" ">>> # 直接使用 reader 的 to_pandas 方法\n" ">>> with o.execute_sql('select * from dual').open_reader(tunnel=True) as " @@ -443,13 +478,13 @@ msgstr "" ">>> # type of pd_df is pandas DataFrame\n" ">>> pd_df = reader.to_pandas()" -#: ../../source/base-sql.rst:193 -msgid "如果需要使用多核加速读取速度,可以通过 `n_process` 指定使用进程数:" +#: ../../source/base-sql.rst:197 +msgid "如果需要使用多核加速读取速度,可以通过 ``n_process`` 指定使用进程数:" msgstr "" "If you want to accelerate data reading with multiple cores, you can " -"specify `n_process` with number of cores you want to use:" +"specify ``n_process`` with number of cores you want to use:" -#: ../../source/base-sql.rst:195 +#: ../../source/base-sql.rst:199 msgid "" ">>> import multiprocessing\n" ">>> n_process = multiprocessing.cpu_count()\n" @@ -465,22 +500,71 @@ msgstr "" ">>> # n_process should be number of processes to use\n" ">>> pd_df = reader.to_pandas(n_process=n_process)" -#: ../../source/base-sql.rst:205 -msgid "目前 Instance 结果暂不支持使用 Arrow 格式读取。" -msgstr "Currently Arrow format is not supported for instance results yet." +#: ../../source/base-sql.rst:209 +msgid "" +"从 2024 年年末开始,MaxCompute 服务将支持离线 SQL 任务 ``open_reader`` " +"使用与表类似的 Arrow 接口,MCQA 作业暂不支持。在此之前,使用 ``Instance." +"open_reader(arrow=True)`` 读取数据将报错。" +msgstr "" +"It is expected that since late 2024, MaxCompute will support reading " +"results of offline SQL instances into arrow format with " +"``Instance.open_reader`` like tables. MCQA instances do not support this " +"feature by now. Before that time, reading data with " +"``Instance.open_reader(arrow=True)`` will lead to errors." -#: ../../source/base-sql.rst:208 -msgid "设置alias" +#: ../../source/base-sql.rst:212 +msgid "" +"从 PyODPS 0.12.0 开始,你也可以直接调用 Instance 上的 ``to_pandas`` 方法" +"直接将数据转换为 pandas。\\ 你可以指定转换为 pandas 的起始行号和行数,若" +"不指定则读取所有数据。该方法也支持 ``limit`` 参数,具体定义\\ 与 ``open_" +"reader`` 方法相同。该方法默认会使用 Arrow 格式读取,并转换为 pandas。如果" +" Arrow 格式不被\\ 支持,将会回退到 Record 接口。" +msgstr "" +"Since PyODPS 0.12.0, you can call ``to_pandas`` method on Instance to " +"read instance results into pandas format. Start row number and row count " +"can be specified with this method, or all data will be read. ``limit`` " +"argument is also supported with the same definition as ``open_reader``. " +"This method will try using arrow format if available and convert the " +"result into pandas. If arrow format is not supported by service, it will " +"fall back into record format." 
+ +#: ../../source/base-sql.rst:217 +msgid "" +">>> inst = o.execute_sql('select * from dual')\n" +">>> pd_df = inst.to_pandas(start=10, count=20)" +msgstr "" + +#: ../../source/base-sql.rst:222 +msgid "" +"与表类似,从 PyODPS 0.12.0 开始,你也可以使用 Instance 上的 ``iter_pandas" +"`` 方法按多个批次读取 pandas DataFrame,参数与 ``Table.iter_pandas`` 类似" +"。" +msgstr "" +"Similar to tables, since PyODPS 0.12.0, you can use ``iter_pandas`` " +"method of Instance to read pandas DataFrames in multiple batches. The " +"method share similar arguments with ``Table.iter_pandas``." + +#: ../../source/base-sql.rst:225 +msgid "" +">>> inst = o.execute_sql('select * from dual')\n" +">>> for batch in inst.iter_pandas(start=0, count=1000, batch_size=100):\n" +">>> print(batch)" +msgstr "" + +#: ../../source/base-sql.rst:232 +msgid "设置 alias" msgstr "Set alias" -#: ../../source/base-sql.rst:210 -msgid "有时在运行时,比如某个UDF引用的资源是动态变化的,我们可以alias旧的资源名到新的资源,这样免去了重新删除并重新创建UDF的麻烦。" +#: ../../source/base-sql.rst:234 +msgid "" +"有时在运行时,比如某个UDF引用的资源是动态变化的,我们可以alias旧的资源" +"名到新的资源,这样免去了重新删除并重新创建UDF的麻烦。" msgstr "" "Some resources referenced by a UDF are dynamically changing at runtime. " "You can create an alias for the old resource and use it as a new " "resource." -#: ../../source/base-sql.rst:212 +#: ../../source/base-sql.rst:236 msgid "" "from odps.models import TableSchema\n" "\n" @@ -552,11 +636,11 @@ msgstr "" "reader:\n" " print(reader[0][0])" -#: ../../source/base-sql.rst:248 +#: ../../source/base-sql.rst:272 msgid "2" msgstr "" -#: ../../source/base-sql.rst:252 +#: ../../source/base-sql.rst:276 msgid "" "res2 = o.create_resource('test_alias_res2', 'file', file_obj='2')\n" "# 把内容为1的资源alias成内容为2的资源,我们不需要修改UDF或资源\n" @@ -577,35 +661,38 @@ msgstr "" "reader:\n" " print(reader[0][0])" -#: ../../source/base-sql.rst:261 +#: ../../source/base-sql.rst:285 msgid "3" msgstr "" -#: ../../source/base-sql.rst:267 +#: ../../source/base-sql.rst:291 msgid "在交互式环境执行 SQL" msgstr "Execute SQL statements in an interactive environment" -#: ../../source/base-sql.rst:269 +#: ../../source/base-sql.rst:293 msgid "" -"在 ipython 和 jupyter 里支持 :ref:`使用 SQL 插件的方式运行 SQL `,且支持 " -":ref:`参数化查询 `, 详情参阅 :ref:`文档 `。" +"在 ipython 和 jupyter 里支持 :ref:`使用 SQL 插件的方式运行 SQL " +"`,且支持 :ref:`参数化查询 `, 详情参阅 :ref:`文档 `" +"。" msgstr "" "In ipython and jupyter, you can :ref:`use SQL plugins to execute SQL " "statements`. Besides, :ref:`parameterized query` is " "also supported. For details, see :ref:`Documentation`." -#: ../../source/base-sql.rst:275 +#: ../../source/base-sql.rst:299 msgid "设置 biz_id" msgstr "Set biz_id" -#: ../../source/base-sql.rst:277 -msgid "在少数情形下,可能在提交 SQL 时,需要同时提交 biz_id,否则执行会报错。此时,你可以设置全局 options 里的 biz_id。" +#: ../../source/base-sql.rst:301 +msgid "" +"在少数情形下,可能在提交 SQL 时,需要同时提交 biz_id,否则执行会报错。" +"此时,你可以设置全局 options 里的 biz_id。" msgstr "" "In a few cases, it may be necessary to submit biz_id when submitting SQL " "statements. Otherwise an error occurs during execution. You can set the " "biz_id in options globally." 
-#: ../../source/base-sql.rst:279 +#: ../../source/base-sql.rst:303 msgid "" "from odps import options\n" "\n" diff --git a/docs/source/locale/en/LC_MESSAGES/base-sqlalchemy.po b/docs/source/locale/en/LC_MESSAGES/base-sqlalchemy.po index dd0dd95e..9132c759 100644 --- a/docs/source/locale/en/LC_MESSAGES/base-sqlalchemy.po +++ b/docs/source/locale/en/LC_MESSAGES/base-sqlalchemy.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: PyODPS 0.11.1\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2023-07-23 13:21+0800\n" +"POT-Creation-Date: 2024-03-14 14:10+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -36,7 +36,9 @@ msgid "创建连接" msgstr "Create connections" #: ../../source/base-sqlalchemy.rst:13 -msgid "创建连接可以在连接串中指定 ``access_id``、``access_key`` 和 ``project`` 等。" +msgid "" +"创建连接可以在连接字符串中指定 ``access_id``、``access_key`` 和 ``project" +"`` 等。" msgstr "" "You can create MaxCompute connection by specifying ``access_id``, " "``access_key``, ``project`` and other arguments in a connection string." @@ -49,7 +51,7 @@ msgid "" "\n" "# 确保 ALIBABA_CLOUD_ACCESS_KEY_ID 环境变量设置为用户 Access Key ID,\n" "# ALIBABA_CLOUD_ACCESS_KEY_SECRET 环境变量设置为用户 Access Key Secret,\n" -"# 不建议直接使用 Access Key ID / Access Key Secret 字符串\n" +"# 不建议直接使用 Access Key ID / Access Key Secret 字符串,下同\n" "conn_string = 'odps://%s:%s@' % (\n" " os.getenv('ALIBABA_CLOUD_ACCESS_KEY_ID'),\n" " os.getenv('ALIBABA_CLOUD_ACCESS_KEY_SECRET'),\n" @@ -59,12 +61,12 @@ msgstr "" "import os\n" "from sqlalchemy import create_engine\n" "\n" -"# Make sure environment variable ALIBABA_CLOUD_ACCESS_KEY_ID " -"already set to Access Key ID of user\n" -"# while environment variable ALIBABA_CLOUD_ACCESS_KEY_SECRET " -"set to Access Key Secret of user.\n" -"# Not recommended to hardcode Access Key ID or Access Key Secret" -" in your code.\n" +"# Make sure environment variable ALIBABA_CLOUD_ACCESS_KEY_ID already set " +"to Access Key ID of user\n" +"# while environment variable ALIBABA_CLOUD_ACCESS_KEY_SECRET set to " +"Access Key Secret of user.\n" +"# Not recommended to hardcode Access Key ID or Access Key Secret in your " +"code.\n" "conn_string = 'odps://%s:%s@' % (\n" " os.getenv('ALIBABA_CLOUD_ACCESS_KEY_ID'),\n" " os.getenv('ALIBABA_CLOUD_ACCESS_KEY_SECRET'),\n" @@ -72,7 +74,7 @@ msgstr "" "engine = create_engine(conn_string)" #: ../../source/base-sqlalchemy.rst:29 -msgid "要在连接串中指定 ``endpoint``,可以按如下方式:" +msgid "要在连接字符串中指定 ``endpoint``,可以按如下方式:" msgstr "You can use methods below to specify ``endpoint`` in connection strings:" #: ../../source/base-sqlalchemy.rst:31 @@ -81,74 +83,56 @@ msgid "" "import os\n" "from sqlalchemy import create_engine\n" "\n" -"# 确保 ALIBABA_CLOUD_ACCESS_KEY_ID 环境变量设置为用户 Access Key ID,\n" -"# ALIBABA_CLOUD_ACCESS_KEY_SECRET 环境变量设置为用户 Access Key Secret,\n" -"# 不建议直接使用 Access Key ID / Access Key Secret 字符串\n" "conn_string = 'odps://%s:%s@/?endpoint=' % (\n" " os.getenv('ALIBABA_CLOUD_ACCESS_KEY_ID'),\n" " os.getenv('ALIBABA_CLOUD_ACCESS_KEY_SECRET'),\n" ")\n" "engine = create_engine(conn_string)" msgstr "" -"import os\n" -"from sqlalchemy import create_engine\n" -"\n" -"# Make sure environment variable ALIBABA_CLOUD_ACCESS_KEY_ID " -"already set to Access Key ID of user\n" -"# while environment variable ALIBABA_CLOUD_ACCESS_KEY_SECRET " -"set to Access Key Secret of user.\n" -"# Not recommended to hardcode Access Key ID or Access Key Secret" -" in your code.\n" -"conn_string = 'odps://%s:%s@/?endpoint=' % (\n" -" os.getenv('ALIBABA_CLOUD_ACCESS_KEY_ID'),\n" -" 
os.getenv('ALIBABA_CLOUD_ACCESS_KEY_SECRET'),\n" -")\n" -"engine = create_engine(conn_string)" -#: ../../source/base-sqlalchemy.rst:45 +#: ../../source/base-sqlalchemy.rst:42 msgid "这里把 ```` 等替换成相应的账号。" msgstr "" "Replace ```` and other placeholders with real account " "information." -#: ../../source/base-sqlalchemy.rst:47 -msgid "对于已有的 ODPS 对象 ``o`` ,调用 ``o.to_global()`` 设为全局账号后,在连接串中就不需要指定了。" +#: ../../source/base-sqlalchemy.rst:44 +msgid "" +"对于已有的 ODPS 对象 ``o`` ,调用 ``o.to_global()`` 设为全局账号后,在" +"连接字符串中就不需要指定了。" msgstr "" "For existing ODPS entries, after calling ``o.to_global()`` to make " "accounts global, there is no need to specify connection strings in detail" " again." -#: ../../source/base-sqlalchemy.rst:49 +#: ../../source/base-sqlalchemy.rst:46 msgid "" "from sqlalchemy import create_engine\n" "o.to_global() # set ODPS object as global one\n" "engine = create_engine('odps://')" msgstr "" -#: ../../source/base-sqlalchemy.rst:55 +#: ../../source/base-sqlalchemy.rst:52 msgid "接着创建连接。" msgstr "Then connections can be created." -#: ../../source/base-sqlalchemy.rst:57 +#: ../../source/base-sqlalchemy.rst:54 msgid "conn = engine.connect()" msgstr "" -#: ../../source/base-sqlalchemy.rst:61 +#: ../../source/base-sqlalchemy.rst:58 msgid "如果需要为 SQL 作业配置执行选项,可以使用 PyODPS 提供的 ``options`` 对象:" msgstr "" "If you want to set execution settings for SQL tasks, you may still use " "``options`` object provided by PyODPS:" -#: ../../source/base-sqlalchemy.rst:63 +#: ../../source/base-sqlalchemy.rst:60 #, python-format msgid "" "import os\n" "from odps import options\n" "from sqlalchemy import create_engine\n" "\n" -"# 确保 ALIBABA_CLOUD_ACCESS_KEY_ID 环境变量设置为用户 Access Key ID,\n" -"# ALIBABA_CLOUD_ACCESS_KEY_SECRET 环境变量设置为用户 Access Key Secret,\n" -"# 不建议直接使用 Access Key ID / Access Key Secret 字符串\n" "conn_string = 'odps://%s:%s@/?endpoint=' % (\n" " os.getenv('ALIBABA_CLOUD_ACCESS_KEY_ID'),\n" " os.getenv('ALIBABA_CLOUD_ACCESS_KEY_SECRET'),\n" @@ -156,36 +140,17 @@ msgid "" "options.sql.settings = {'odps.sql.hive.compatible': 'true'}\n" "engine = create_engine(conn_string)" msgstr "" -"import os\n" -"from odps import options\n" -"from sqlalchemy import create_engine\n" -"\n" -"# Make sure environment variable ALIBABA_CLOUD_ACCESS_KEY_ID " -"already set to Access Key ID of user\n" -"# while environment variable ALIBABA_CLOUD_ACCESS_KEY_SECRET " -"set to Access Key Secret of user.\n" -"# Not recommended to hardcode Access Key ID or Access Key Secret" -" in your code.\n" -"conn_string = 'odps://%s:%s@/?endpoint=' % (\n" -" os.getenv('ALIBABA_CLOUD_ACCESS_KEY_ID'),\n" -" os.getenv('ALIBABA_CLOUD_ACCESS_KEY_SECRET'),\n" -")\n" -"options.sql.settings = {'odps.sql.hive.compatible': 'true'}\n" -"engine = create_engine(conn_string)" -#: ../../source/base-sqlalchemy.rst:79 +#: ../../source/base-sqlalchemy.rst:73 msgid "也可以直接配置在连接字符串中:" msgstr "Settings can also be configured with connection strings:" -#: ../../source/base-sqlalchemy.rst:81 +#: ../../source/base-sqlalchemy.rst:75 #, python-format msgid "" "import os\n" "from sqlalchemy import create_engine\n" "\n" -"# 确保 ALIBABA_CLOUD_ACCESS_KEY_ID 环境变量设置为用户 Access Key ID,\n" -"# ALIBABA_CLOUD_ACCESS_KEY_SECRET 环境变量设置为用户 Access Key Secret,\n" -"# 不建议直接使用 Access Key ID / Access Key Secret 字符串\n" "conn_string = " "'odps://%s:%s@/?endpoint=&odps.sql.hive.compatible=true'" " % (\n" @@ -194,44 +159,64 @@ msgid "" ")\n" "engine = create_engine(conn_string)" msgstr "" + +#: ../../source/base-sqlalchemy.rst:86 +msgid "使用上述方式时,每个 engine 对象都会拥有不同的选项。" +msgstr "" +"Note that 
when configuring with connection strings, different engines may" +" have different settings." + +#: ../../source/base-sqlalchemy.rst:88 +msgid "" +"部分商业智能引擎(例如 Apache Superset)可能会频繁列举 MaxCompute 表等" +"对象,这可能会带来较大的延迟。\\ 如果你在数据分析过程中对新增的 " +"MaxCompute 对象不敏感,在 PyODPS 0.12.0 及以上版本中可以考虑为连接字符串" +"\\ 增加 ``cache_names=true`` 选项以启用对象名缓存,并可指定缓存超时的时间" +" ``cache_seconds=<超时秒数>`` (默认为 24 * 3600)。下面的例子开启缓存并" +"将缓存超时时间设定为 1200 秒。" +msgstr "" +"Some business intelligence engines (for instance, Apache Superset) might " +"enumerate MaxCompute objects like tables quite frequently and this could " +"lead to big latencies. Since PyODPS 0.12.0, If you do not care about new " +"MaxCompute objects during data analysis, you may add ``cache_names=true``" +" to your connection string to enable caching of the names of these " +"objects and specify timeout seconds of the cache via " +"``cache_seconds=`` whose default value is 24 * 3600. The" +" code below enables caching object names and specifies cache timeout as " +"1200 seconds." + +#: ../../source/base-sqlalchemy.rst:93 +#, python-format +msgid "" "import os\n" "from sqlalchemy import create_engine\n" "\n" -"# Make sure environment variable ALIBABA_CLOUD_ACCESS_KEY_ID " -"already set to Access Key ID of user\n" -"# while environment variable ALIBABA_CLOUD_ACCESS_KEY_SECRET " -"set to Access Key Secret of user.\n" -"# Not recommended to hardcode Access Key ID or Access Key Secret" -" in your code.\n" "conn_string = " -"'odps://%s:%s@/?endpoint=&odps.sql.hive.compatible=true'" +"'odps://%s:%s@/?endpoint=&cache_names=true&cache_seconds=1200'" " % (\n" " os.getenv('ALIBABA_CLOUD_ACCESS_KEY_ID'),\n" " os.getenv('ALIBABA_CLOUD_ACCESS_KEY_SECRET'),\n" ")\n" "engine = create_engine(conn_string)" - -#: ../../source/base-sqlalchemy.rst:95 -msgid "使用上述方式时,每个 engine 对象都会拥有不同的选项。" msgstr "" -"Note that when configuring with connection strings, different engines may" -" have different settings." -#: ../../source/base-sqlalchemy.rst:98 +#: ../../source/base-sqlalchemy.rst:105 msgid "调用 SQLAlchemy 接口" msgstr "Using SQLAlchemy interfaces" -#: ../../source/base-sqlalchemy.rst:100 -msgid "创建了连接之后,就可以正常调用 SQLAlchemy 接口。以下对建表、写入数据、查询分别举例说明。" +#: ../../source/base-sqlalchemy.rst:107 +msgid "" +"创建了连接之后,就可以正常调用 SQLAlchemy 接口。以下对建表、写入数据、" +"查询分别举例说明。" msgstr "" "After establishing connections, you can call SQLAlchemy interfaces as " "usual. Here are examples for creating, writing data and querying." 
-#: ../../source/base-sqlalchemy.rst:103 +#: ../../source/base-sqlalchemy.rst:110 msgid "建表" msgstr "Creating tables" -#: ../../source/base-sqlalchemy.rst:105 +#: ../../source/base-sqlalchemy.rst:112 msgid "" "from sqlalchemy import Table, Column, Integer, String, MetaData\n" "metadata = MetaData()\n" @@ -245,21 +230,21 @@ msgid "" "metadata.create_all(engine)" msgstr "" -#: ../../source/base-sqlalchemy.rst:120 +#: ../../source/base-sqlalchemy.rst:127 msgid "写入数据" msgstr "Writing data" -#: ../../source/base-sqlalchemy.rst:122 +#: ../../source/base-sqlalchemy.rst:129 msgid "" "ins = users.insert().values(id=1, name='jack', fullname='Jack Jones')\n" "conn.execute(ins)" msgstr "" -#: ../../source/base-sqlalchemy.rst:129 +#: ../../source/base-sqlalchemy.rst:136 msgid "查询数据" msgstr "Querying" -#: ../../source/base-sqlalchemy.rst:131 +#: ../../source/base-sqlalchemy.rst:138 msgid "" ">>> from sqlalchemy.sql import select\n" ">>> s = select([users])\n" diff --git a/docs/source/locale/en/LC_MESSAGES/base-tables.po b/docs/source/locale/en/LC_MESSAGES/base-tables.po index 58123a61..2e553c02 100644 --- a/docs/source/locale/en/LC_MESSAGES/base-tables.po +++ b/docs/source/locale/en/LC_MESSAGES/base-tables.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: PyODPS 0.7.16\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2024-07-18 14:27+0800\n" +"POT-Creation-Date: 2024-08-31 13:02+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -22,20 +22,22 @@ msgid "表" msgstr "Tables" #: ../../source/base-tables.rst:6 -msgid "`表 `_ 是ODPS的数据存储单元。" +msgid "" +"`表 `_ 是ODPS的数据" +"存储单元。" msgstr "" "`Tables `_ are" " the data storage unit in MaxCompute." -#: ../../source/base-tables.rst:9 ../../source/base-tables.rst:489 +#: ../../source/base-tables.rst:9 ../../source/base-tables.rst:549 msgid "基本操作" msgstr "Basic operations" #: ../../source/base-tables.rst:13 msgid "" -"本文档中的代码对 PyODPS 0.11.3 及后续版本有效。对早于 0.11.3 版本的 PyODPS,请使用 " -"``odps.models.Schema`` 代替 ``odps.models.TableSchema``,使用 ``schema`` 属性代替 " -"``table_schema`` 属性。" +"本文档中的代码对 PyODPS 0.11.3 及后续版本有效。对早于 0.11.3 版本的 " +"PyODPS,请使用 ``odps.models.Schema`` 代替 ``odps.models.TableSchema``," +"使用 ``schema`` 属性代替 ``table_schema`` 属性。" msgstr "" "Code in this document is only guaranteed to work under PyODPS 0.11.3 and " "later versions. For PyODPS earlier than 0.11.3, please replace class " @@ -66,9 +68,10 @@ msgstr "" #: ../../source/base-tables.rst:30 msgid "" -"通过该方法获取的 Table 对象不会自动加载表名以外的属性,此时获取这些属性(例如 ``table_schema`` 或者 " -"``creation_time``)可能导致额外的请求并造成额外的时间开销。如果需要在列举表的同时读取这些属性,在 PyODPS 0.11.5 " -"及后续版本中,可以为 ``list_tables`` 添加 ``extended=True`` 参数:" +"通过该方法获取的 Table 对象不会自动加载表名以外的属性,此时获取这些属性(" +"例如 ``table_schema`` 或者 ``creation_time``)可能导致额外的请求并造成" +"额外的时间开销。如果需要在列举表的同时读取这些属性,在 PyODPS 0.11.5 及" +"后续版本中,可以为 ``list_tables`` 添加 ``extended=True`` 参数:" msgstr "" "Table objects obtained with code above do not load properties other than " "names. 
If you get properties like ``table_schema`` or ``creation_time``, " @@ -91,8 +94,10 @@ msgstr "" #: ../../source/base-tables.rst:41 msgid "" -"managed_tables = list(o.list_tables(type=\"managed_table\")) # 列举内置表\n" -"external_tables = list(o.list_tables(type=\"external_table\")) # 列举外表\n" +"managed_tables = list(o.list_tables(type=\"managed_table\")) # 列举内置" +"表\n" +"external_tables = list(o.list_tables(type=\"external_table\")) # 列举" +"外表\n" "virtual_views = list(o.list_tables(type=\"virtual_view\")) # 列举视图\n" "materialized_views = list(o.list_tables(type=\"materialized_view\")) # " "列举物化视图" @@ -220,7 +225,9 @@ msgstr "" "[bigint, double]" #: ../../source/base-tables.rst:132 -msgid "第二种方法是使用 ``Schema.from_lists``,这种方法更容易调用,但显然无法直接设置列和分区的注释了。" +msgid "" +"第二种方法是使用 ``Schema.from_lists``,这种方法更容易调用,但显然无法" +"直接设置列和分区的注释了。" msgstr "" "Second, you can use ``Schema.from_lists`` to initialize the table. This " "method is easier, but you cannot directly set the comments of the columns" @@ -249,7 +256,8 @@ msgid "" ">>> table = o.create_table('my_new_table', schema)\n" ">>> table = o.create_table('my_new_table', schema, if_not_exists=True) #" " 只有不存在表时才创建\n" -">>> table = o.create_table('my_new_table', schema, lifecycle=7) # 设置生命周期" +">>> table = o.create_table('my_new_table', schema, lifecycle=7) # 设置" +"生命周期" msgstr "" ">>> table = o.create_table('my_new_table', schema)\n" ">>> table = o.create_table('my_new_table', schema, if_not_exists=True) #" @@ -280,9 +288,10 @@ msgstr "" #: ../../source/base-tables.rst:163 msgid "" -"在未经设置的情况下,创建表时,只允许使用 bigint、double、decimal、string、datetime、boolean、map 和 " -"array 类型。\\ 如果你使用的是位于公共云上的服务,或者支持 tinyint、struct 等新类型,可以设置 " -"``options.sql.use_odps2_extension = True`` 打开这些类型的支持,示例如下:" +"在未经设置的情况下,创建表时,只允许使用 bigint、double、decimal、string" +"、datetime、boolean、map 和 array 类型。\\ 如果你使用的是位于公共云上的" +"服务,或者支持 tinyint、struct 等新类型,可以设置 ``options.sql.use_odps2" +"_extension = True`` 打开这些类型的支持,示例如下:" msgstr "" "By default, you can only use the bigint, double, decimal, string, " "datetime, boolean, map and array types to create a table. If you use " @@ -303,7 +312,9 @@ msgid "同步表更新" msgstr "Synchronize table updates" #: ../../source/base-tables.rst:177 -msgid "有时候,一个表可能被别的程序做了更新,比如schema有了变化。此时可以调用 ``reload`` 方法来更新。" +msgid "" +"有时候,一个表可能被别的程序做了更新,比如schema有了变化。此时可以调用 ``" +"reload`` 方法来更新。" msgstr "" "If a table has been updated by another program and has changes in the " "schema, you can use ``reload`` to synchronize the update." @@ -317,11 +328,13 @@ msgid "读写数据" msgstr "Read and write data" #: ../../source/base-tables.rst:188 -msgid "行记录Record" +msgid "行记录 Record" msgstr "Record" #: ../../source/base-tables.rst:190 -msgid "Record表示表的一行记录,我们在 Table 对象上调用 new_record 就可以创建一个新的 Record。" +msgid "" +"Record表示表的一行记录,我们在 Table 对象上调用 new_record 就可以创建一个" +"新的 Record。" msgstr "" "A record is a row record in a table. You can use new_record of a table " "object to create a new record." 
@@ -329,7 +342,8 @@ msgstr "" #: ../../source/base-tables.rst:192 msgid "" ">>> t = o.get_table('mytable')\n" -">>> r = t.new_record(['val0', 'val1']) # 值的个数必须等于表schema的字段数\n" +">>> r = t.new_record(['val0', 'val1']) # 值的个数必须等于表schema的字" +"段数\n" ">>> r2 = t.new_record() # 也可以不传入值\n" ">>> r2[0] = 'val0' # 可以通过偏移设置值\n" ">>> r2['field1'] = 'val1' # 也可以通过字段名设置值\n" @@ -364,7 +378,9 @@ msgid "获取表数据" msgstr "Obtain table data" #: ../../source/base-tables.rst:214 -msgid "有若干种方法能够获取表数据。首先,如果只是查看每个表的开始的小于1万条数据,则可以使用 ``head`` 方法。" +msgid "" +"有若干种方法能够获取表数据。首先,如果只是查看每个表的开始的小于1万条数据" +",则可以使用 ``head`` 方法。" msgstr "" "You can obtain table data in different ways. First, you can use ``head`` " "to retrieve the first 10,000 or fewer data items in each table." @@ -381,8 +397,8 @@ msgstr "" #: ../../source/base-tables.rst:225 msgid "" -"其次,在 table 实例上可以执行 ``open_reader`` 操作来打一个 reader 来读取数据。如果表为分区表,需要引入 " -"``partition`` 参数指定需要读取的分区。" +"其次,在 table 实例上可以执行 ``open_reader`` 操作来打一个 reader 来" +"读取数据。如果表为分区表,需要引入 ``partition`` 参数指定需要读取的分区。" msgstr "" "Then, use ``open_reader`` as the table object to open a reader and read " "the data. If you need to read data from a partitioned table, you need to " @@ -399,8 +415,8 @@ msgstr "" msgid "" ">>> with t.open_reader(partition='pt=test,pt2=test2') as reader:\n" ">>> count = reader.count\n" -">>> for record in reader[5:10]: # " -"可以执行多次,直到将count数量的record读完,这里可以改造成并行操作\n" +">>> for record in reader[5:10]: # 可以执行多次,直到将count数量的" +"record读完,这里可以改造成并行操作\n" ">>> # 处理一条记录" msgstr "" ">>> with t.open_reader(partition='pt=test,pt2=test2') as reader:\n" @@ -420,8 +436,8 @@ msgstr "" msgid "" ">>> reader = t.open_reader(partition='pt=test,pt2=test2')\n" ">>> count = reader.count\n" -">>> for record in reader[5:10]: # 可以执行多次,直到将count数量的record读完,这里可以改造成并行操作" -"\n" +">>> for record in reader[5:10]: # 可以执行多次,直到将count数量的record" +"读完,这里可以改造成并行操作\n" ">>> # 处理一条记录\n" ">>> reader.close()" msgstr "" @@ -449,61 +465,186 @@ msgstr "" ">>> # process one record" #: ../../source/base-tables.rst:254 -msgid "直接读取成 Pandas DataFrame:" -msgstr "Read directly into Pandas DataFrames:" +msgid "" +"从 0.11.2 开始,PyODPS 支持使用 `https://arrow.apache.org/ `_ 格式" +"读写数据,该格式可以以更高\\ 效率与 pandas 等格式互相转换。安装 pyarrow " +"后,在调用 ``open_reader`` 时增加 ``arrow=True`` 参数,即可按 `https://" +"arrow.apache.org/docs/python/data.html#record-batches " +"`_ 格式读取表内容。" +msgstr "" +"Since 0.11.2, PyODPS supports reading and writing table data with " +"`https://arrow.apache.org/ `_ format, which can be converted from " +"and to pandas or other formats with high efficiency. After installing " +"pyarrow, you can read data from tables with " +"`https://arrow.apache.org/docs/python/data.html#record-batches `_ format by adding ``arrow=True`` argument when calling " +"``open_reader`` method." 
-#: ../../source/base-tables.rst:256 +#: ../../source/base-tables.rst:259 msgid "" -">>> with t.open_reader(partition='pt=test,pt2=test2') as reader:\n" +">>> with t.open_reader(partition='pt=test,pt2=test2', arrow=True) as " +"reader:\n" +">>> count = reader.count\n" +">>> for batch in reader: # 可以执行多次,直到将所有 RecordBatch 读完" +"\n" +">>> # 处理一个 RecordBatch,例如转换为 Pandas\n" +">>> print(batch.to_pandas())" +msgstr "" +">>> with t.open_reader(partition='pt=test,pt2=test2', arrow=True) as " +"reader:\n" +">>> count = reader.count\n" +">>> for batch in reader: # This line can be executed many times " +"until all record batches are visited.\n" +">>> # process one RecordBatch, for instance, convert to Pandas\n" +">>> print(batch.to_pandas())" + +#: ../../source/base-tables.rst:267 +msgid "" +"你也可以直接调用 reader 上的 ``to_pandas`` 方法直接从 reader 获取 pandas " +"DataFrame。 读取时,可以指定起始行号(从0开始)和行数。如果不指定,则默认" +"读取所有数据。" +msgstr "" +"You can also call ``to_pandas`` method on readers to read pandas " +"DataFrame. Start row index (starts from 0) and row count can be specified" +" on reading. If row indexes are not specified, all data will be read by " +"default." + +#: ../../source/base-tables.rst:270 +msgid "" +">>> with t.open_reader(partition='pt=test,pt2=test2', arrow=True) as " +"reader:\n" +">>> # 指定起始行号和行数\n" +">>> pd_df = reader.to_pandas(start=10, count=20)\n" +">>> # 如不指定,则读取所有数据\n" ">>> pd_df = reader.to_pandas()" msgstr "" +">>> with t.open_reader(partition='pt=test,pt2=test2', arrow=True) as " +"reader:\n" +">>> # specify start row index and row count\n" +">>> pd_df = reader.to_pandas(start=10, count=20)\n" +">>> # if not specified, all data will be read\n" +">>> pd_df = reader.to_pandas()" -#: ../../source/base-tables.rst:263 -msgid "利用多进程加速读取:" -msgstr "Accelerate data read using multiple processes:" +#: ../../source/base-tables.rst:280 +msgid "你可以利用多进程加速读取 Pandas DataFrame:" +msgstr "You can read data directly into Pandas DataFrames with multiple processes." -#: ../../source/base-tables.rst:265 +#: ../../source/base-tables.rst:282 msgid "" ">>> import multiprocessing\n" ">>> n_process = multiprocessing.cpu_count()\n" -">>> with t.open_reader(partition='pt=test,pt2=test2') as reader:\n" +">>> with t.open_reader(partition='pt=test,pt2=test2', arrow=True) as " +"reader:\n" ">>> pd_df = reader.to_pandas(n_process=n_process)" msgstr "" -#: ../../source/base-tables.rst:274 +#: ../../source/base-tables.rst:289 +msgid "" +"为方便读取数据为 pandas,从 PyODPS 0.12.0 开始,Table 和 Partition 对象" +"支持直接调用 ``to_pandas`` 方法。" +msgstr "" +"To facilitate reading data as pandas, since PyODPS 0.12.0, ``to_pandas`` " +"method is added to table and partition objects." 
+ +#: ../../source/base-tables.rst:292 +msgid "" +">>> # 将表读取为 pandas DataFrame\n" +">>> pd_df = table.to_pandas(start=10, count=20)\n" +">>> # 通过2个进程读取所有数据\n" +">>> pd_df = table.to_pandas(n_process=2)\n" +">>> # 将分区读取为 pandas\n" +">>> pd_df = partitioned_table.to_pandas(partition=\"pt=test\", start=10, " +"count=20)" +msgstr "" +">>> # read table as pandas dataframe\n" +">>> pd_df = table.to_pandas(start=10, count=20)\n" +">>> # read all data with 2 processes\n" +">>> pd_df = table.to_pandas(n_process=2)\n" +">>> # read partition as pandas\n" +">>> pd_df = partitioned_table.to_pandas(partition=\"pt=test\", start=10, " +"count=20)" + +#: ../../source/base-tables.rst:301 +msgid "" +"与此同时,从 PyODPS 0.12.0 开始,你也可以使用 ``iter_pandas`` 方法从一张" +"表或分区按多个批次读取 pandas DataFrame,并通过 ``batch_size`` 参数指定" +"每次读取的 DataFrame 批次大小,该大小默认值为 ``options.tunnel.read_row_" +"batch_size`` 指定,默认为 1024。" +msgstr "" +"At the same time, since PyODPS 0.12.0, you can also use ``iter_pandas`` " +"method to read multiple batches of pandas DataFrames from a table or " +"partition. The size of Dataframes can be specified with ``batch_size`` " +"argument, whose default value is specified with " +"``options.tunnel.read_row_batch_size`` and the default value is 1024." + +#: ../../source/base-tables.rst:305 msgid "" -"``open_reader`` 或者 ``read_table`` 方法仅支持读取单个分区。如果需要读取多个分区的值,例如\\ 读取所有符合 " -"``dt>20230119`` 这样条件的分区,需要使用 ``iterate_partitions`` 方法,详见 :ref:`遍历表分区 " -"` 章节。" +">>> # 以默认 batch_size 读取所有数据\n" +">>> for batch in table.iter_pandas():\n" +">>> print(batch)\n" +">>> # 以 batch_size==100 读取前 1000 行数据\n" +">>> for batch in table.iter_pandas(batch_size=100, start=0, count=1000):\n" +">>> print(batch)" msgstr "" -"``open_reader`` or ``read_table`` only supports reading from one single " -"partition. If you need to read from multiple partitions, for instance, " -"partitions specified by the inequality ``dt>20230119``, you need to use " -"method ``iterate_partitions``. For more details please take a look at " -":ref:`iterating over table partitions ` section." +">>> # iterate all data with default batch_size\n" +">>> for batch in table.iter_pandas():\n" +">>> print(batch)\n" +">>> # iterate first 1000 rows with batch_size==100\n" +">>> for batch in table.iter_pandas(batch_size=100, start=0, count=1000):\n" +">>> print(batch)" -#: ../../source/base-tables.rst:281 +#: ../../source/base-tables.rst:316 +msgid "" +"``open_reader``、``read_table`` 以及 ``to_pandas`` 方法仅支持读取单个分区" +"。如果需要读取多个分区\\ 的值,例如读取所有符合 ``dt>20230119`` 这样条件" +"的分区,需要使用 ``iterate_partitions`` 方法,详见 :ref:`遍历表分区 <" +"iterate_partitions>` 章节。" +msgstr "" +"``open_reader``, ``read_table`` and ``to_pandas`` only supports reading " +"from one single partition. If you need to read from multiple partitions, " +"for instance, partitions specified by the inequality ``dt>20230119``, you" +" need to use method ``iterate_partitions``. For more details please take " +"a look at :ref:`iterating over table partitions ` " +"section." + +#: ../../source/base-tables.rst:320 +msgid "" +"导出数据是否包含分区列的值由输出格式决定。Record 格式数据默认包含分区列的" +"值,而 Arrow 格式默认不包含。\\ 从 PyODPS 0.12.0 开始,你可以通过指定 ``" +"append_partitions=True`` 显示引入分区列的值,通过 ``append_partitions=" +"False`` 将分区列排除在结果之外。" +msgstr "" +"Data read from tables can include partition columns or not, depending on " +"the format. Record format contains partition columns by default, while " +"arrow format does not. 
Since PyODPS 0.12.0, you can explicitly include " +"partition columns by specifying ``append_partitions=True``, and exclude " +"them by specifying ``append_partitions=False``." + +#: ../../source/base-tables.rst:327 msgid "向表写数据" msgstr "Write data to tables" -#: ../../source/base-tables.rst:283 +#: ../../source/base-tables.rst:329 msgid "" -"类似于 ``open_reader``,table对象同样能执行 ``open_writer`` " -"来打开writer,并写数据。如果表为分区表,需要引入 ``partition`` 参数指定需要写入的分区。" +"类似于 ``open_reader``,table对象同样能执行 ``open_writer`` 来打开writer" +",并写数据。如果表为分区表,需要引入 ``partition`` 参数指定需要写入的分区" +"。" msgstr "" "Similar to ``open_reader``, you can use ``open_writer`` as the table " "object to open a writer and write data to the table. If the table to " "write is partitioned, you need to add a ``partition`` argument to specify" " the partition to write into." -#: ../../source/base-tables.rst:286 -msgid "使用 with 表达式的写法,with 表达式会保证离开时关闭 writer 并提交所有数据:" +#: ../../source/base-tables.rst:332 +msgid "" +"使用 with 表达式的写法,with 表达式会保证离开时关闭 writer 并提交所有数据" +":" msgstr "" "Open the reader using a WITH clause, as shown in the following code. It " "is ensured by with expression that the writer is closed once the with " "block is exited and all written data are committed." -#: ../../source/base-tables.rst:288 +#: ../../source/base-tables.rst:334 msgid "" ">>> with t.open_writer(partition='pt=test') as writer:\n" ">>> records = [[111, 'aaa', True], # 这里可以是list\n" @@ -512,7 +653,8 @@ msgid "" ">>> [444, '中文', False]]\n" ">>> writer.write(records) # 这里records可以是可迭代对象\n" ">>>\n" -">>> records = [t.new_record([111, 'aaa', True]), # 也可以是Record对象\n" +">>> records = [t.new_record([111, 'aaa', True]), # 也可以是Record" +"对象\n" ">>> t.new_record([222, 'bbb', False]),\n" ">>> t.new_record([333, 'ccc', True]),\n" ">>> t.new_record([444, '中文', False])]\n" @@ -535,13 +677,13 @@ msgstr "" ">>> writer.write(records)\n" ">>>" -#: ../../source/base-tables.rst:305 +#: ../../source/base-tables.rst:351 msgid "如果分区不存在,可以使用 ``create_partition`` 参数指定创建分区,如" msgstr "" "If the specified partition does not exist, use the ``create_partition`` " "parameter to create a partition, as shown in the following code:" -#: ../../source/base-tables.rst:307 +#: ../../source/base-tables.rst:353 msgid "" ">>> with t.open_writer(partition='pt=test', create_partition=True) as " "writer:\n" @@ -560,13 +702,13 @@ msgstr "" ">>> [444, '中文', False]]\n" ">>> writer.write(records) # records can also be iterable objects" -#: ../../source/base-tables.rst:316 +#: ../../source/base-tables.rst:362 msgid "更简单的写数据方法是使用 ODPS 对象的 write_table 方法,例如" msgstr "" "An easier way is to use write_table as the ODPS object to write data, as " "shown in the following code:" -#: ../../source/base-tables.rst:318 +#: ../../source/base-tables.rst:364 msgid "" ">>> records = [[111, 'aaa', True], # 这里可以是list\n" ">>> [222, 'bbb', False],\n" @@ -583,40 +725,43 @@ msgstr "" ">>> o.write_table('test_table', records, partition='pt=test', " "create_partition=True)" -#: ../../source/base-tables.rst:328 +#: ../../source/base-tables.rst:374 msgid "" -"**注意**\\ :每次调用 write_table,MaxCompute 都会在服务端生成一个文件。这一操作需要较大的时间开销,\\ " -"同时过多的文件会降低后续的查询效率。因此,我们建议在使用 write_table 方法时,一次性写入多组数据,\\ 或者传入一个 " +"**注意**\\ :每次调用 write_table,MaxCompute 都会在服务端生成一个文件。" +"这一操作需要较大的时间开销,\\ 同时过多的文件会降低后续的查询效率。因此," +"我们建议在使用 write_table 方法时,一次性写入多组数据,\\ 或者传入一个 " "generator 对象。" msgstr "" "**Note**\\ :Every time when ``write_table`` is invoked,MaxCompute " "generates a new file on the server side, which is an expensive operation " "that reduces the throughput 
drastically. What's more, too many files may " "increase query time on that table. Hence we propose writing multiple " -"records or passing a Python generator object when calling " -"``write_table``." +"records or passing a Python generator object when calling ``write_table``" +"." -#: ../../source/base-tables.rst:332 +#: ../../source/base-tables.rst:378 msgid "" -"write_table 写表时会追加到原有数据。如果需要覆盖数据,可以为 write_table 增加一个参数 " -"``overwrite=True`` (仅在 0.11.1 以后支持),或者调用 table.truncate() / 删除分区后再建立分区。" +"write_table 写表时会追加到原有数据。如果需要覆盖数据,可以为 write_table " +"增加一个参数 ``overwrite=True`` (仅在 0.11.1 以后支持),或者调用 table." +"truncate() / 删除分区后再建立分区。" msgstr "" "When calling ```write_table```, new data will be appended to existing " "data. If you need to overwrite existing data, you can add an argument " "``overwrite=True`` to ``write_table`` call when you are using PyODPS " "later than 0.11.1, or call ``truncate`` on tables or partitions." -#: ../../source/base-tables.rst:335 +#: ../../source/base-tables.rst:381 msgid "" -"你可以使用多线程写入数据。从 PyODPS 0.11.6 开始,直接将 open_writer 创建的 Writer 对象分发到\\ " -"各个线程中即可完成多线程写入,写入时请注意不要关闭 writer,待所有数据写入完成后再关闭 writer。" +"你可以使用多线程写入数据。从 PyODPS 0.11.6 开始,直接将 open_writer 创建" +"的 Writer 对象分发到\\ 各个线程中即可完成多线程写入,写入时请注意不要关闭" +" writer,待所有数据写入完成后再关闭 writer。" msgstr "" "You can write data with multiple threads. Since PyODPS 0.11.6, simply " "spawning ``writer`` objects created with ``open_writer`` method into " "different threads and then data can be written in those threads. Note " "that you shall not close writers until all data are written." -#: ../../source/base-tables.rst:338 +#: ../../source/base-tables.rst:384 msgid "" "import random\n" "# Python 2.7 请从三方库 futures 中 import ThreadPoolExecutor\n" @@ -669,12 +814,13 @@ msgstr "" " # wait for threaded calls to finish\n" " [f.result() for f in futures]" -#: ../../source/base-tables.rst:363 +#: ../../source/base-tables.rst:409 msgid "" -"你也可以使用多进程写入数据,以避免 Python GIL 带来的性能损失。从 PyODPS 0.11.6 开始,只需要将 open_writer " -"创建的 Writer 对象通过 multiprocessing 标准库传递到需要写入的子进程中即可写入。\\ " -"需要注意的是,与多线程的情形不同,你应当在每个子进程完成写入后关闭 writer,并在所有写入子进程退出后\\ 再关闭主进程 writer(或离开" -" with 语句块),以保证所有数据被提交。" +"你也可以使用多进程写入数据,以避免 Python GIL 带来的性能损失。从 PyODPS " +"0.11.6 开始,只需要将 open_writer 创建的 Writer 对象通过 multiprocessing " +"标准库传递到需要写入的子进程中即可写入。\\ 需要注意的是,与多线程的情形" +"不同,你应当在每个子进程完成写入后关闭 writer,并在所有写入子进程退出后\\" +" 再关闭主进程 writer(或离开 with 语句块),以保证所有数据被提交。" msgstr "" "You can also write data with ``multiprocessing`` module in Python to " "avoid performance loss from GIL. Since PyODPS 0.11.6, you can simply pass" @@ -684,7 +830,7 @@ msgstr "" "writing is finished and close writer in the main process once writing in " "all subprocesses is done to make sure all written data are committed." 
-#: ../../source/base-tables.rst:368 +#: ../../source/base-tables.rst:414 msgid "" "import random\n" "from multiprocessing import Pool\n" @@ -744,15 +890,142 @@ msgstr "" " # wait for subprocesses to finish\n" " [f.get() for f in futures]" -#: ../../source/base-tables.rst:398 ../../source/base-tables.rst:869 +#: ../../source/base-tables.rst:443 +msgid "" +"从 0.11.2 开始,PyODPS 支持使用 `https://arrow.apache.org/ `_ 格式" +"读写数据,该格式可以以更高\\ 效率与 pandas 等格式互相转换。安装 pyarrow " +"后,在调用 ``open_writer`` 时增加 ``arrow=True`` 参数,即可按 `https://" +"arrow.apache.org/docs/python/data.html#record-batches " +"`_ 格式写入表内容。PyODPS 也支持直接写入 pandas DataFrame,支持自动转换为" +" Arrow RecordBatch。" +msgstr "" +"Since 0.11.2, PyODPS supports reading and writing table data with " +"`https://arrow.apache.org/ `_ format, which can be converted from " +"and to pandas or other formats with high efficiency. After installing " +"pyarrow, you can write data into tables with " +"`https://arrow.apache.org/docs/python/data.html#record-batches `_ format by adding ``arrow=True`` argument when calling " +"``open_writer`` method. PyODPS also supports writing tables with pandas " +"DataFrames, which will be converted into Arrow RecordBatch directly." + +#: ../../source/base-tables.rst:448 +msgid "" +">>> import pandas as pd\n" +">>> import pyarrow as pa\n" +">>>\n" +">>> with t.open_writer(partition='pt=test', create_partition=True, " +"arrow=True) as writer:\n" +">>> records = [[111, 'aaa', True],\n" +">>> [222, 'bbb', False],\n" +">>> [333, 'ccc', True],\n" +">>> [444, '中文', False]]\n" +">>> df = pd.DataFrame(records, columns=[\"int_val\", \"str_val\", " +"\"bool_val\"])\n" +">>> # 写入 RecordBatch\n" +">>> batch = pa.RecordBatch.from_pandas(df)\n" +">>> writer.write(batch)\n" +">>> # 也可以直接写入 Pandas DataFrame\n" +">>> writer.write(df)" +msgstr "" +">>> import pandas as pd\n" +">>> import pyarrow as pa\n" +">>>\n" +">>> with t.open_writer(partition='pt=test', create_partition=True) as " +"writer:\n" +">>> records = [[111, 'aaa', True],\n" +">>> [222, 'bbb', False],\n" +">>> [333, 'ccc', True],\n" +">>> [444, '中文', False]]\n" +">>> df = pd.DataFrame(records, columns=[\"int_val\", \"str_val\", " +"\"bool_val\"])\n" +">>> # write a RecordBatch\n" +">>> batch = pa.RecordBatch.from_pandas(df)\n" +">>> writer.write(batch)\n" +">>> # Pandas DataFrame can also be used directly\n" +">>> writer.write(df)" + +#: ../../source/base-tables.rst:465 +msgid "" +"为方便写入 pandas DataFrame,从 0.12.0 开始,PyODPS 支持直接通过 ``write_" +"table`` 方法写入 pandas DataFrame。\\ 如果写入数据前对应表不存在,可以" +"增加 ``create_table=True`` 参数以自动创建表。" +msgstr "" +"To facilitate writing pandas DataFrame, since PyODPS 0.12.0, pandas " +"DataFrames can be written with ``write_table`` method. If target table " +"does not exist, you can add ``create_table=True`` to let the method " +"create it for you." 
+ +#: ../../source/base-tables.rst:468 +msgid "" +">>> import pandas as pd\n" +">>> df = pd.DataFrame([\n" +">>> [111, 'aaa', True],\n" +">>> [222, 'bbb', False],\n" +">>> [333, 'ccc', True],\n" +">>> [444, '中文', False]\n" +">>> ], columns=['num_col', 'str_col', 'bool_col'])\n" +">>> # 如果表 test_table 不存在,将会自动创建\n" +">>> o.write_table('test_table', df, partition='pt=test', " +"create_table=True, create_partition=True)" +msgstr "" +">>> import pandas as pd\n" +">>> df = pd.DataFrame([\n" +">>> [111, 'aaa', True],\n" +">>> [222, 'bbb', False],\n" +">>> [333, 'ccc', True],\n" +">>> [444, '中文', False]\n" +">>> ], columns=['num_col', 'str_col', 'bool_col'])\n" +">>> # if table test_table does not exist, it will be created " +"automatically\n" +">>> o.write_table('test_table', df, partition='pt=test', " +"create_table=True, create_partition=True)" + +#: ../../source/base-tables.rst:480 +msgid "" +"从 PyODPS 0.12.0 开始,``write_table`` 方法也支持动态分区,可通过 ``" +"partitions`` 参数传入需要作为分区的列名,\\ 并指定 ``create_partition=" +"True``,相应的分区将会自动创建。" +msgstr "" +"Since PyODPS 0.12.0, ``write_table`` supports dynamic partitioning. You " +"can use ``partitions`` argument to specify columns as partitions, and " +"when ``create_partition=True`` is specified, these partitions will be " +"created by the method." + +#: ../../source/base-tables.rst:483 +msgid "" +">>> import pandas as pd\n" +">>> df = pd.DataFrame([\n" +">>> [111, 'aaa', True, 'p1'],\n" +">>> [222, 'bbb', False, 'p1'],\n" +">>> [333, 'ccc', True, 'p2'],\n" +">>> [444, '中文', False, 'p2']\n" +">>> ], columns=['num_col', 'str_col', 'bool_col', 'pt'])\n" +">>> # 如果分区 pt=p1 或 pt=p2 不存在,将会自动创建。\n" +">>> o.write_table('test_part_table', df, partitions=['pt'], " +"create_partition=True)" +msgstr "" +">>> import pandas as pd\n" +">>> df = pd.DataFrame([\n" +">>> [111, 'aaa', True, 'p1'],\n" +">>> [222, 'bbb', False, 'p1'],\n" +">>> [333, 'ccc', True, 'p2'],\n" +">>> [444, '中文', False, 'p2']\n" +">>> ], columns=['num_col', 'str_col', 'bool_col', 'pt'])\n" +">>> # if partition pt=p1 or pt=p2 does not exist, they will be created " +"automatically\n" +">>> o.write_table('test_part_table', df, partitions=['pt'], " +"create_partition=True)" + +#: ../../source/base-tables.rst:496 ../../source/base-tables.rst:929 msgid "压缩选项" msgstr "Compression options" -#: ../../source/base-tables.rst:399 ../../source/base-tables.rst:870 +#: ../../source/base-tables.rst:497 ../../source/base-tables.rst:930 msgid "" -"为加快数据上传 / 下载速度,你可以在上传 / 下载数据时设置压缩选项。具体地,可以创建一个 ``CompressOption`` " -"实例,在其中指定需要的压缩算法及压缩等级。目前可用的压缩算法包括 zlib 和 ZSTD,其中 ZSTD 需要额外安装 ``zstandard``" -" 包。" +"为加快数据上传 / 下载速度,你可以在上传 / 下载数据时设置压缩选项。具体地" +",可以创建一个 ``CompressOption`` 实例,在其中指定需要的压缩算法及压缩" +"等级。目前可用的压缩算法包括 zlib 和 ZSTD,其中 ZSTD 需要额外安装 ``" +"zstandard`` 包。" msgstr "" "You can specify compression options to accelerate data upload or " "download. To achieve this, you may create a ``CompressOption`` instance " @@ -760,7 +1033,7 @@ msgstr "" "ZSTD is supported, and you need to install ``zstandard`` package to " "enable ZSTD support." -#: ../../source/base-tables.rst:403 ../../source/base-tables.rst:874 +#: ../../source/base-tables.rst:501 ../../source/base-tables.rst:934 msgid "" "from odps.tunnel import CompressOption\n" "\n" @@ -779,13 +1052,13 @@ msgstr "" "zlib\n" ")" -#: ../../source/base-tables.rst:413 +#: ../../source/base-tables.rst:511 msgid "此后可在 ``open_reader`` / ``open_writer`` 中设置压缩选项,例如:" msgstr "" "Then you may specify compression option when calling ``open_reader`` or " "``open_writer``. 
For instance," -#: ../../source/base-tables.rst:415 +#: ../../source/base-tables.rst:513 msgid "" "with table.open_writer(compress_option=compress_option) as writer:\n" " # 写入数据,此处从略" @@ -793,15 +1066,15 @@ msgstr "" "with table.open_writer(compress_option=compress_option) as writer:\n" " # replace this comment with actual data writing code" -#: ../../source/base-tables.rst:420 +#: ../../source/base-tables.rst:518 msgid "" -"如果仅需指定算法名,也可以直接在 ``open_reader`` / ``open_writer`` 中指定 ``compress_algo`` " -"参数,例如" +"如果仅需指定算法名,也可以直接在 ``open_reader`` / ``open_writer`` 中指定" +" ``compress_algo`` 参数,例如" msgstr "" "If you only need to specify name of the compression algorithm, you can " "specify it with ``compress_algo`` argument directly." -#: ../../source/base-tables.rst:422 +#: ../../source/base-tables.rst:520 msgid "" "with table.open_writer(compress_algo=\"zlib\") as writer:\n" " # 写入数据,此处从略" @@ -809,93 +1082,11 @@ msgstr "" "with table.open_writer(compress_algo=\"zlib\") as writer:\n" " # replace this comment with actual data writing code" -#: ../../source/base-tables.rst:430 -msgid "使用 Arrow 格式读写数据" -msgstr "Use Arrow format to read and write data" - -#: ../../source/base-tables.rst:431 -msgid "" -"`Apache Arrow `_ " -"是一种跨语言的通用数据读写格式,支持在各种不同平台间进行数据交换。\\ 自2021年起, MaxCompute 支持使用 Arrow " -"格式读取表数据,PyODPS 则从 0.11.2 版本开始支持该功能。具体地,如果在 Python 环境中安装 pyarrow 后,在调用 " -"``open_reader`` 或者 ``open_writer`` 时增加 ``arrow=True`` 参数,即可读写 `Arrow " -"RecordBatch `_ 。" -msgstr "" -"`Apache Arrow `_ is a language-neutral format " -"supporting data exchange between different platforms. MaxCompute supports" -" reading and writing table data with Arrow format since 2021, and PyODPS " -"starts experimental support in 0.11.2. After installing pyarrow in your " -"Python environment, you can enable reading and writing with Arrow format " -"by adding ``arrow=True`` argument in ``open_reader`` or ``open_writer`` " -"calls to handle RecordBatch " -"`_ instead" -" of single records." 
- -#: ../../source/base-tables.rst:436 -msgid "按 RecordBatch 读取表内容:" -msgstr "Read table content by record batches" - -#: ../../source/base-tables.rst:438 -msgid "" -">>> reader = t.open_reader(partition='pt=test', arrow=True)\n" -">>> count = reader.count\n" -">>> for batch in reader: # 可以执行多次,直到将所有 RecordBatch 读完\n" -">>> # 处理一个 RecordBatch,例如转换为 Pandas\n" -">>> print(batch.to_pandas())" -msgstr "" -">>> reader = t.open_reader(partition='pt=test', arrow=True)\n" -">>> count = reader.count\n" -">>> for batch in reader: # This line can be executed many times until " -"all record batches are visited.\n" -">>> # process one RecordBatch, for instance, convert to Pandas\n" -">>> print(batch.to_pandas())" - -#: ../../source/base-tables.rst:446 -msgid "写入 RecordBatch:" -msgstr "Write record batches" - -#: ../../source/base-tables.rst:448 -msgid "" -">>> import pandas as pd\n" -">>> import pyarrow as pa\n" -">>>\n" -">>> with t.open_writer(partition='pt=test', create_partition=True, " -"arrow=True) as writer:\n" -">>> records = [[111, 'aaa', True],\n" -">>> [222, 'bbb', False],\n" -">>> [333, 'ccc', True],\n" -">>> [444, '中文', False]]\n" -">>> df = pd.DataFrame(records, columns=[\"int_val\", \"str_val\", " -"\"bool_val\"])\n" -">>> # 写入 RecordBatch\n" -">>> batch = pa.RecordBatch.from_pandas(df)\n" -">>> writer.write(batch)\n" -">>> # 也可以直接写入 Pandas DataFrame\n" -">>> writer.write(df)" -msgstr "" -">>> import pandas as pd\n" -">>> import pyarrow as pa\n" -">>>\n" -">>> with t.open_writer(partition='pt=test', create_partition=True) as " -"writer:\n" -">>> records = [[111, 'aaa', True],\n" -">>> [222, 'bbb', False],\n" -">>> [333, 'ccc', True],\n" -">>> [444, '中文', False]]\n" -">>> df = pd.DataFrame(records, columns=[\"int_val\", \"str_val\", " -"\"bool_val\"])\n" -">>> # write a RecordBatch\n" -">>> batch = pa.RecordBatch.from_pandas(df)\n" -">>> writer.write(batch)\n" -">>> # Pandas DataFrame can also be used directly\n" -">>> writer.write(df)" - -#: ../../source/base-tables.rst:466 +#: ../../source/base-tables.rst:526 msgid "删除表" msgstr "Delete tables" -#: ../../source/base-tables.rst:468 +#: ../../source/base-tables.rst:528 msgid "" ">>> o.delete_table('my_table_name', if_exists=True) # 只有表存在时删除\n" ">>> t.drop() # Table对象存在的时候可以直接执行drop函数" @@ -904,55 +1095,55 @@ msgstr "" "the table exists\n" ">>> t.drop() # call drop method of the Table object to delete directly" -#: ../../source/base-tables.rst:475 +#: ../../source/base-tables.rst:535 msgid "创建DataFrame" msgstr "Create a DataFrame" -#: ../../source/base-tables.rst:477 +#: ../../source/base-tables.rst:537 msgid "" -"PyODPS提供了 :ref:`DataFrame框架 ` ,支持更方便地方式来查询和操作ODPS数据。 使用 ``to_df`` " -"方法,即可转化为 DataFrame 对象。" +"PyODPS提供了 :ref:`DataFrame框架 ` ,支持更方便地方式来查询和操作ODPS" +"数据。 使用 ``to_df`` 方法,即可转化为 DataFrame 对象。" msgstr "" "PyODPS provides a :ref:`DataFrame framework ` to easily search and " "operate MaxCompute data. You can use ``to_df`` to convert a table to a " "DataFrame object." 
-#: ../../source/base-tables.rst:480 +#: ../../source/base-tables.rst:540 msgid "" ">>> table = o.get_table('my_table_name')\n" ">>> df = table.to_df()" msgstr "" -#: ../../source/base-tables.rst:486 +#: ../../source/base-tables.rst:546 msgid "表分区" msgstr "Table partitions" -#: ../../source/base-tables.rst:491 +#: ../../source/base-tables.rst:551 msgid "判断表是否为分区表:" msgstr "Check if a table is partitioned:" -#: ../../source/base-tables.rst:493 +#: ../../source/base-tables.rst:553 #, python-format msgid "" ">>> if table.table_schema.partitions:\n" ">>> print('Table %s is partitioned.' % table.name)" msgstr "" -#: ../../source/base-tables.rst:498 +#: ../../source/base-tables.rst:558 msgid "判断分区是否存在(该方法需要填写所有分区字段值):" msgstr "" "Check whether the specified partition exists, all field values should be " "provided:" -#: ../../source/base-tables.rst:500 +#: ../../source/base-tables.rst:560 msgid ">>> table.exist_partition('pt=test,sub=2015')" msgstr "" -#: ../../source/base-tables.rst:504 +#: ../../source/base-tables.rst:564 msgid "判断给定前缀的分区是否存在:" msgstr "Check whether partitions satisfying provided prefix exist:" -#: ../../source/base-tables.rst:506 +#: ../../source/base-tables.rst:566 msgid "" ">>> # 表 table 的分区字段依次为 pt, sub\n" ">>> table.exist_partitions('pt=test')" @@ -960,11 +1151,11 @@ msgstr "" ">>> # the order of partitions fields of table is pt, sub\n" ">>> table.exist_partitions('pt=test')" -#: ../../source/base-tables.rst:511 +#: ../../source/base-tables.rst:571 msgid "获取一个分区的相关信息:" msgstr "Obtain information about one specified partition:" -#: ../../source/base-tables.rst:513 +#: ../../source/base-tables.rst:573 msgid "" ">>> partition = table.get_partition('pt=test')\n" ">>> print(partition.creation_time)\n" @@ -973,10 +1164,11 @@ msgid "" "0" msgstr "" -#: ../../source/base-tables.rst:523 +#: ../../source/base-tables.rst:583 msgid "" -"这里的\"分区\"指的不是分区字段而是所有分区字段均确定的分区定义对应的子表。如果某个分区字段对应多个值, 则相应地有多个子表,即多个分区。而 " -"``get_partition`` 只能获取一个分区的信息。因而," +"这里的\"分区\"指的不是分区字段而是所有分区字段均确定的分区定义对应的子表" +"。如果某个分区字段对应多个值, 则相应地有多个子表,即多个分区。而 ``get_" +"partition`` 只能获取一个分区的信息。因而," msgstr "" "The word `partition` here refers to a partition specification that " "specifies values of all partition columns which uniquely specifies a sub-" @@ -985,20 +1177,22 @@ msgstr "" "partitions. Meanwhile the method ``get_partition`` can only obtain " "information of only one sub-table. Thus," -#: ../../source/base-tables.rst:526 +#: ../../source/base-tables.rst:586 msgid "" -"如果某些分区未指定,那么这个分区定义可能对应多个子表,``get_partition`` 时则不被 PyODPS 支持。\\ 此时,需要使用 " -"``iterate_partitions`` 分别处理每个分区。" +"如果某些分区未指定,那么这个分区定义可能对应多个子表,``get_partition`` " +"时则不被 PyODPS 支持。\\ 此时,需要使用 ``iterate_partitions`` 分别处理" +"每个分区。" msgstr "" "When some values of partition columns are absent, the specification could" " represent multiple tables, and then calling ``get_partitions`` with this" " specification is not supported in PyODPS. You need to use " "``iter_partitions`` to handle every partition respectively." 
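To make the point above concrete, a small sketch that iterates over the matching sub-tables and reads each one; the table name is a placeholder and a two-level partitioned table with fields ``pt1`` and ``pt2`` is assumed:

t = o.get_table('my_partitioned_table')  # placeholder; partition fields assumed to be pt1, pt2

# only pt1 is fixed here, so several sub-tables may match; handle each one separately
for partition in t.iterate_partitions(spec='pt1=a'):
    print(partition.name)
    with partition.open_reader() as reader:
        for record in reader:
            pass  # process one record of this sub-table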
-#: ../../source/base-tables.rst:528 +#: ../../source/base-tables.rst:588 msgid "" -"如果某个分区字段被定义多次,或者使用类似 ``pt>20210302`` 这样的非确定逻辑表达式,则无法使用 ``get_partition`` " -"获取分区。在此情况下,可以尝试使用 ``iterate_partitions`` 枚举每个分区。" +"如果某个分区字段被定义多次,或者使用类似 ``pt>20210302`` 这样的非确定逻辑" +"表达式,则无法使用 ``get_partition`` 获取分区。在此情况下,可以尝试使用 `" +"`iterate_partitions`` 枚举每个分区。" msgstr "" "When some partition column is specified multiple times, or non-" "deterministic logic expressions like ``pt>20210302`` is used, " @@ -1006,78 +1200,80 @@ msgstr "" " case, ``iterate_partitions`` might be used to iterate over all " "partitions." -#: ../../source/base-tables.rst:532 +#: ../../source/base-tables.rst:592 msgid "创建分区" msgstr "Create partitions" -#: ../../source/base-tables.rst:534 +#: ../../source/base-tables.rst:594 msgid "下面的操作将创建一个分区,如果分区存在将报错:" msgstr "" "Code below will create a partition or raise an error if the partition " "already exists." -#: ../../source/base-tables.rst:536 +#: ../../source/base-tables.rst:596 msgid ">>> t.create_partition('pt=test')" msgstr "" -#: ../../source/base-tables.rst:540 +#: ../../source/base-tables.rst:600 msgid "下面的操作将创建一个分区,如果分区存在则跳过:" msgstr "" "Code below will create a partition or do nothing if the partition already" " exists." -#: ../../source/base-tables.rst:542 +#: ../../source/base-tables.rst:602 msgid ">>> t.create_partition('pt=test', if_not_exists=True)" msgstr "" -#: ../../source/base-tables.rst:549 +#: ../../source/base-tables.rst:609 msgid "遍历表分区" msgstr "Iterate through partitions" -#: ../../source/base-tables.rst:550 +#: ../../source/base-tables.rst:610 msgid "下面的操作将遍历表全部分区:" msgstr "Code below iterates through all the partitions in a table." -#: ../../source/base-tables.rst:552 +#: ../../source/base-tables.rst:612 msgid "" ">>> for partition in table.partitions:\n" ">>> print(partition.name)" msgstr "" -#: ../../source/base-tables.rst:557 +#: ../../source/base-tables.rst:617 msgid "如果要遍历部分分区值确定的分区,可以使用 ``iterate_partitions`` 方法。" msgstr "" "If you need to iterate through partitions with certain values of " "partition fields fixed, you can use ``iterate_partitions`` method." -#: ../../source/base-tables.rst:559 +#: ../../source/base-tables.rst:619 msgid "" ">>> for partition in table.iterate_partitions(spec='pt=test'):\n" ">>> print(partition.name)" msgstr "" -#: ../../source/base-tables.rst:564 +#: ../../source/base-tables.rst:624 msgid "" -"自 PyODPS 0.11.3 开始,支持为 ``iterate_partitions`` 指定简单的逻辑表达式及通过逗号连接,\\ " -"每个子表达式均须满足的复合逻辑表达式。或运算符暂不支持。" +"自 PyODPS 0.11.3 开始,支持为 ``iterate_partitions`` 指定简单的逻辑表达式" +"及通过逗号连接,\\ 每个子表达式均须满足的复合逻辑表达式。或运算符暂不支持" +"。" msgstr "" "Since PyODPS 0.11.3, PyODPS supports using simple logic expressions or " "logic expressions connected with commas which means combined conditions " "when iterating through partitions. OR operator is not supported " "currently." 
-#: ../../source/base-tables.rst:567 +#: ../../source/base-tables.rst:627 msgid "" ">>> for partition in table.iterate_partitions(spec='dt>20230119'):\n" ">>> print(partition.name)" msgstr "" -#: ../../source/base-tables.rst:574 +#: ../../source/base-tables.rst:634 msgid "" -"在 0.11.3 之前的版本中,``iterate_partitions`` 仅支持枚举前若干个分区等于相应值的情形。例如, " -"当表的分区字段按顺序分别为 pt1、pt2 和 pt3,那么 ``iterate_partitions`` 中的 ``spec`` 参数只能指定" -" ``pt1=xxx`` 或者 ``pt1=xxx,pt2=yyy`` 这样的形式。自 0.11.3 开始, " -"``iterate_partitions`` 支持更多枚举方式,但仍建议尽可能限定上一级分区以提高枚举的效率。" +"在 0.11.3 之前的版本中,``iterate_partitions`` 仅支持枚举前若干个分区等于" +"相应值的情形。例如, 当表的分区字段按顺序分别为 pt1、pt2 和 pt3,那么 ``" +"iterate_partitions`` 中的 ``spec`` 参数只能指定 ``pt1=xxx`` 或者 ``pt1=" +"xxx,pt2=yyy`` 这样的形式。自 0.11.3 开始, ``iterate_partitions`` 支持更" +"多枚举方式,但仍建议尽可能限定上一级分区以提高枚举的效率。" msgstr "" "Before 0.11.3, ``iterate_partitions`` only supports specifying partition " "values for the first partition fields. For instance, when a table has 3 " @@ -1087,15 +1283,15 @@ msgstr "" "flexible forms of ``spec`` arguments. However, it is still recommended to" " fix values of first partition fields to improve speed of iteration." -#: ../../source/base-tables.rst:580 +#: ../../source/base-tables.rst:640 msgid "删除分区" msgstr "Delete partitions" -#: ../../source/base-tables.rst:582 +#: ../../source/base-tables.rst:642 msgid "下面的操作将删除一个分区:" msgstr "Code below will delete a partition." -#: ../../source/base-tables.rst:584 +#: ../../source/base-tables.rst:644 msgid "" ">>> t.delete_partition('pt=test', if_exists=True) # 存在的时候才删除\n" ">>> partition.drop() # Partition对象存在的时候直接drop" @@ -1105,23 +1301,25 @@ msgstr "" ">>> partition.drop() # delete directly via the drop method of the " "partition object" -#: ../../source/base-tables.rst:590 +#: ../../source/base-tables.rst:650 msgid "获取值最大分区" msgstr "Obtain the partition with maximal value:" -#: ../../source/base-tables.rst:591 -msgid "很多时候你可能希望获取值最大的分区。例如,当以日期为分区值时,你可能希望获得日期最近的有数据的分区。PyODPS 自 0.11.3 开始支持此功能。" +#: ../../source/base-tables.rst:651 +msgid "" +"很多时候你可能希望获取值最大的分区。例如,当以日期为分区值时,你可能希望" +"获得日期最近的有数据的分区。PyODPS 自 0.11.3 开始支持此功能。" msgstr "" "Sometimes you want to get the partition with maximal value, for instance," " when dates are used as partition values, you may want to get the " "partition with data and latest date. PyODPS starts supporting this " "function since 0.11.3." -#: ../../source/base-tables.rst:594 +#: ../../source/base-tables.rst:654 msgid "创建分区表并写入一些数据:" msgstr "Create a partitioned table and write some data." 
-#: ../../source/base-tables.rst:596 +#: ../../source/base-tables.rst:656 #, python-format msgid "" "t = o.create_table(\"test_multi_pt_table\", (\"col string\", \"pt1 " @@ -1132,13 +1330,13 @@ msgid "" "partition=\"pt1=%s,pt2=%s\" % (pt1, pt2))" msgstr "" -#: ../../source/base-tables.rst:602 +#: ../../source/base-tables.rst:662 msgid "如果想要获得值最大的分区,可以使用下面的代码:" msgstr "" "If you want to get the partition with maximal value, you can use code " "below:" -#: ../../source/base-tables.rst:604 +#: ../../source/base-tables.rst:664 msgid "" ">>> part = t.get_max_partition()\n" ">>> part\n" @@ -1152,39 +1350,40 @@ msgstr "" ">>> part.partition_spec[\"pt1\"] # get value of certain partition field\n" "b" -#: ../../source/base-tables.rst:612 +#: ../../source/base-tables.rst:672 msgid "如果只希望获得最新的分区而忽略分区内是否有数据,可以用" msgstr "" "If you want to get latest partition while ignore whether the partition " "has data, you may use" -#: ../../source/base-tables.rst:614 +#: ../../source/base-tables.rst:674 msgid "" ">>> t.get_max_partition(skip_empty=False)\n" "" msgstr "" -#: ../../source/base-tables.rst:619 +#: ../../source/base-tables.rst:679 msgid "对于多级分区表,可以通过限定上级分区值来获得值最大的子分区,例如" msgstr "" "For tables with multiple partitions, you may specify the parent partition" " specification to get child partition with maximal value, for instance," -#: ../../source/base-tables.rst:621 +#: ../../source/base-tables.rst:681 msgid "" ">>> t.get_max_partition(\"pt1=a\")\n" "" msgstr "" -#: ../../source/base-tables.rst:629 +#: ../../source/base-tables.rst:689 msgid "数据上传下载通道" msgstr "Data upload and download tunnels" -#: ../../source/base-tables.rst:632 +#: ../../source/base-tables.rst:692 msgid "" -"不推荐直接使用 Tunnel 接口,该接口较为低级,简单的表写入推荐直接使用 Tunnel 接口上实现的表 :ref:`写 " -"` 和 :ref:`读 ` 接口,可靠性和易用性更高。 只有在分布式写表等复杂场景下有直接使用 " -"Tunnel 接口的需要。" +"不推荐直接使用 Tunnel 接口,该接口较为低级,简单的表写入推荐直接使用 " +"Tunnel 接口上实现的表 :ref:`写 ` 和 :ref:`读 ` " +"接口,可靠性和易用性更高。 只有在分布式写表等复杂场景下有直接使用 Tunnel " +"接口的需要。" msgstr "" "If you just need to upload a small amount of data, we do not recommend " "using table tunnel directly for simple table reading and writing, as " @@ -1193,27 +1392,31 @@ msgstr "" "use tunnel interfaces directly when writing tables distributedly or under" " complicated scenarios." -#: ../../source/base-tables.rst:636 -msgid "ODPS Tunnel 是 MaxCompute 的数据通道,用户可以通过 Tunnel 向 MaxCompute 中上传或者下载数据。" +#: ../../source/base-tables.rst:696 +msgid "" +"ODPS Tunnel 是 MaxCompute 的数据通道,用户可以通过 Tunnel 向 MaxCompute " +"中上传或者下载数据。" msgstr "" "MaxCompute Tunnel is the data channel of MaxCompute. You can use this to " "upload data to or download data from MaxCompute." 
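For the download direction mentioned above, the flow mirrors the upload examples that follow: create a download session, open a record reader and iterate. A rough sketch with placeholder table and partition names:

from odps.tunnel import TableTunnel

tunnel = TableTunnel(o)
# 'my_table' and 'pt=test' are placeholders
download_session = tunnel.create_download_session('my_table', partition_spec='pt=test')

# read every record of the partition; the start index and count arguments
# also allow splitting one download across several processes
with download_session.open_record_reader(0, download_session.count) as reader:
    for record in reader:
        pass  # process one downloaded record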
-#: ../../source/base-tables.rst:639 +#: ../../source/base-tables.rst:699 msgid "上传" msgstr "Upload" -#: ../../source/base-tables.rst:641 +#: ../../source/base-tables.rst:701 msgid "分块上传接口" msgstr "Block upload interface" -#: ../../source/base-tables.rst:642 +#: ../../source/base-tables.rst:702 msgid "" -"直接使用 Tunnel 分块接口上传时,需要首先通过 ``create_upload_session`` 方法使用表名和分区创建 Upload " -"Session,此后从 Upload Session 创建 Writer。每个 Upload Session 可多次调用 " -"``open_record_writer`` 方法创建多个 Writer,每个 Writer 拥有一个 ``block_id`` " -"对应一个数据块。完成写入后,需要调用 Upload Session 上的 ``commit`` 方法并指定需要提交的数据块列表。\\ 如果有某个 " -"``block_id`` 有数据写入但未包括在 ``commit`` 的参数中,则该数据块不会出现在最终的表中。" +"直接使用 Tunnel 分块接口上传时,需要首先通过 ``create_upload_session`` " +"方法使用表名和分区创建 Upload Session,此后从 Upload Session 创建 Writer" +"。每个 Upload Session 可多次调用 ``open_record_writer`` 方法创建多个 " +"Writer,每个 Writer 拥有一个 ``block_id`` 对应一个数据块。完成写入后," +"需要调用 Upload Session 上的 ``commit`` 方法并指定需要提交的数据块列表。" +"\\ 如果有某个 ``block_id`` 有数据写入但未包括在 ``commit`` 的参数中,则该" +"数据块不会出现在最终的表中。" msgstr "" "When using block upload interface of MaxCompute tunnel, you need to " "create an upload session with ``create_upload_session`` method and table " @@ -1225,7 +1428,7 @@ msgstr "" "the list submitted with ``commit`` method, the corresponding data block " "will not appear in the final table." -#: ../../source/base-tables.rst:648 +#: ../../source/base-tables.rst:708 msgid "" "from odps.tunnel import TableTunnel\n" "\n" @@ -1276,11 +1479,13 @@ msgstr "" "# commit, and an error will be raised.\n" "upload_session.commit([0])" -#: ../../source/base-tables.rst:672 +#: ../../source/base-tables.rst:732 msgid "" -"如果你需要在多个进程乃至节点中使用相同的 Upload Session,可以先创建 Upload Session,并获取其 ``id`` " -"属性。此后在其他进程中调用 ``create_upload_session`` 方法时,将该值作为 ``upload_id`` 参数。\\ " -"完成每个进程的上传后,需要收集各进程提交数据所用的 ``block_id``,并在某个进程中完成 ``commit``。" +"如果你需要在多个进程乃至节点中使用相同的 Upload Session,可以先创建 " +"Upload Session,并获取其 ``id`` 属性。此后在其他进程中调用 ``create_" +"upload_session`` 方法时,将该值作为 ``upload_id`` 参数。\\ 完成每个进程的" +"上传后,需要收集各进程提交数据所用的 ``block_id``,并在某个进程中完成 ``" +"commit``。" msgstr "" "If you need to reuse your upload session in multiple processes or nodes, " "you may create your upload session first, obtain its ``id`` property and " @@ -1289,7 +1494,7 @@ msgstr "" "need to collect ``block_id`` used in these processes and call ``commit`` " "in some process." 
-#: ../../source/base-tables.rst:676 +#: ../../source/base-tables.rst:736 msgid "" "from odps.tunnel import TableTunnel\n" "\n" @@ -1374,15 +1579,17 @@ msgstr "" "# submit all collected block_id\n" "upload_session_main.commit(collected_block_ids)" -#: ../../source/base-tables.rst:716 +#: ../../source/base-tables.rst:776 msgid "" -"需要注意的是,指定 block id 后,所创建的 Writer 为长连接,如果长时间不写入会导致连接关闭,并导致写入失败,\\ 该时间通常为 5" -" 分钟。如果你写入数据的间隔较大,建议生成一批数据后再通过 ``open_record_writer`` 接口创建 Writer " -"并按需写入数据。如果你只希望在单个 Writer 上通过 Tunnel 写入数据,可以考虑在调用 ``open_record_writer`` " -"时不指定 block id,此时创建的 Writer 在写入数据时将首先将数据缓存在本地,当 Writer 关闭或者缓存数据大于\\ " -"一定大小(默认为 20MB,可通过 ``options.tunnel.block_buffer_size`` " -"指定)时才会写入数据。写入数据后,\\ 需要先通过 Writer 上的 ``get_blocks_written`` 方法获得已经写入的 " -"block 列表,再进行提交。" +"需要注意的是,指定 block id 后,所创建的 Writer 为长连接,如果长时间不" +"写入会导致连接关闭,并导致写入失败,\\ 该时间通常为 5 分钟。如果你写入" +"数据的间隔较大,建议生成一批数据后再通过 ``open_record_writer`` 接口创建 " +"Writer 并按需写入数据。如果你只希望在单个 Writer 上通过 Tunnel 写入数据," +"可以考虑在调用 ``open_record_writer`` 时不指定 block id,此时创建的 " +"Writer 在写入数据时将首先将数据缓存在本地,当 Writer 关闭或者缓存数据大于" +"\\ 一定大小(默认为 20MB,可通过 ``options.tunnel.block_buffer_size`` " +"指定)时才会写入数据。写入数据后,\\ 需要先通过 Writer 上的 ``get_blocks_" +"written`` 方法获得已经写入的 block 列表,再进行提交。" msgstr "" "Note that writers created with ``open_record_writer`` establish long " "connections which will be closed if no data are written in a long period " @@ -1397,7 +1604,7 @@ msgstr "" "with ``options.tunnel.block_buffer_size``. After writing all data you " "need to obtain all written blocks with ``get_blocks_written`` method." -#: ../../source/base-tables.rst:723 +#: ../../source/base-tables.rst:783 msgid "" "from odps.tunnel import TableTunnel\n" "\n" @@ -1446,26 +1653,26 @@ msgstr "" "# obtain block ids from the writer and then commit\n" "upload_session.commit(writer.get_blocks_written())" -#: ../../source/base-tables.rst:749 +#: ../../source/base-tables.rst:809 msgid "" -"使用带缓存的 Writer 时,需要注意不能在同一 Upload Session 上开启多个带缓存 Writer 进行写入,\\ " -"否则可能导致冲突而使数据丢失。" +"使用带缓存的 Writer 时,需要注意不能在同一 Upload Session 上开启多个带" +"缓存 Writer 进行写入,\\ 否则可能导致冲突而使数据丢失。" msgstr "" "When using buffered writers, you need to avoid opening multiple writers " "on a single upload session, or there might be collisions and data might " "be lost." -#: ../../source/base-tables.rst:752 +#: ../../source/base-tables.rst:812 msgid "" -"如果你需要使用 Arrow 格式而不是 Record 格式进行上传,可以将 ``open_record_writer`` 替换为 " -"``open_arrow_writer``,并写入 Arrow RecordBatch / Arrow Table 或者 pandas " -"DataFrame。" +"如果你需要使用 Arrow 格式而不是 Record 格式进行上传,可以将 ``open_record" +"_writer`` 替换为 ``open_arrow_writer``,并写入 Arrow RecordBatch / Arrow " +"Table 或者 pandas DataFrame。" msgstr "" "If you need to upload with arrow format instead of record format, you may" " replace ``open_record_writer`` with ``open_arrow_writer`` and write " "arrow RecordBatches, Tables or pandas DataFrames." -#: ../../source/base-tables.rst:755 +#: ../../source/base-tables.rst:815 msgid "" "import pandas as pd\n" "import pyarrow as pa\n" @@ -1508,22 +1715,23 @@ msgstr "" "# commit, and an error will be raised.\n" "upload_session.commit([0])" -#: ../../source/base-tables.rst:775 +#: ../../source/base-tables.rst:835 msgid "本章节中所述所有 Writer 均非线程安全。你需要为每个线程单独创建 Writer。" msgstr "" "All writers described in this chapter are not thread safe. You need to " "create separate writers for every thread." 
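A sketch of the one-writer-per-thread pattern described above, where each thread writes its own block through the block upload interface; the table name, schema and values are placeholders:

from concurrent.futures import ThreadPoolExecutor

from odps.tunnel import TableTunnel

table = o.get_table('my_table')  # placeholder table
tunnel = TableTunnel(o)
upload_session = tunnel.create_upload_session(table.name, partition_spec='pt=test')

def write_block(block_id):
    # every thread opens its own writer bound to its own block id
    with upload_session.open_record_writer(block_id) as writer:
        for i in range(5):
            # values must match the (placeholder) table schema
            writer.write(table.new_record([i, 'value']))

block_ids = [0, 1, 2, 3]
with ThreadPoolExecutor(4) as pool:
    futures = [pool.submit(write_block, bid) for bid in block_ids]
    [f.result() for f in futures]  # surface any exception raised in the threads

# commit exactly the blocks that were written
upload_session.commit(block_ids)

Using explicit block ids keeps each thread's data in its own block, which avoids the collision problem noted above for buffered writers opened on the same upload session.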
-#: ../../source/base-tables.rst:778 +#: ../../source/base-tables.rst:838 msgid "流式上传接口" msgstr "Stream upload interface" -#: ../../source/base-tables.rst:779 +#: ../../source/base-tables.rst:839 msgid "" -"MaxCompute 提供了\\ `流式上传接口 `_\\ 用于简化分布式服务开发成本。可以使用 " -"``create_stream_upload_session`` 方法创建专门的 Upload Session。\\ 此时,不需要为该 " -"Session 的 ``open_record_writer`` 提供 block id。" +"MaxCompute 提供了\\ `流式上传接口 `_\\ 用于简化分布式服务" +"开发成本。可以使用 ``create_stream_upload_session`` 方法创建专门的 Upload" +" Session。\\ 此时,不需要为该 Session 的 ``open_record_writer`` 提供 " +"block id。" msgstr "" "MaxCompute provides `stream upload interface " "` 。" +"需要注意的是,该方法返回的行数大小受到 ODPS 排序返回结果大小的限制,默认" +"为 10000 行,可通过 ``options.df.odps.sort.limit`` 配置,详见 :ref:`配置" +"选项 ` 。" msgstr "" "Note that the number of lines returned by ``value_counts`` is limited due" " to limitations on ORDER BY clause of MaxCompute SQL. The default " @@ -344,14 +353,18 @@ msgid "编写自定义聚合" msgstr "Write custom aggregations" #: ../../source/df-agg.rst:153 -msgid "对字段调用agg或者aggregate方法来调用自定义聚合。自定义聚合需要提供一个类,这个类需要提供以下方法:" +msgid "" +"对字段调用agg或者aggregate方法来调用自定义聚合。自定义聚合需要提供一个类" +",这个类需要提供以下方法:" msgstr "" "Use the agg or aggregate method on fields with a custom aggregation. The " "custom aggregation requires a class, which provides the following " "methods:" #: ../../source/df-agg.rst:155 -msgid "buffer():返回一个mutable的object(比如 list、dict),buffer大小不应随数据而递增。" +msgid "" +"buffer():返回一个mutable的object(比如 list、dict),buffer大小不应随" +"数据而递增。" msgstr "" "buffer(): returns a mutable object such as list and dict. The buffer size" " should not increase during data processing." @@ -473,7 +486,8 @@ msgstr "Specify the function name to use an existing UDAF in MaxCompute." #: ../../source/df-agg.rst:243 msgid "" -">>> iris.groupby('name').agg(iris.sepalwidth.agg('your_func')) # 对单列聚合\n" +">>> iris.groupby('name').agg(iris.sepalwidth.agg('your_func')) # 对单列" +"聚合\n" ">>> to_agg = agg([iris.sepalwidth, iris.sepallength], 'your_func', " "rtype='float')\n" ">>> iris.groupby('name').agg(to_agg.rename('val')) # 对多列聚合" @@ -486,7 +500,9 @@ msgstr "" "columns" #: ../../source/df-agg.rst:245 -msgid "目前,受限于 Python UDF,自定义聚合无法支持将 list / dict 类型作为初始输入或最终输出结果。" +msgid "" +"目前,受限于 Python UDF,自定义聚合无法支持将 list / dict 类型作为初始" +"输入或最终输出结果。" msgstr "" "Limited by Python user-defined functions (UDFs), custom aggregations " "cannot specify the input or output result type as the list or dict type." @@ -497,8 +513,8 @@ msgstr "HyperLogLog counting" #: ../../source/df-agg.rst:250 msgid "" -"DataFrame 提供了对列进行 HyperLogLog 计数的接口 ``hll_count``,这个接口是个近似的估计接口, " -"当数据量很大时,能较快的对数据的唯一个数进行估计。" +"DataFrame 提供了对列进行 HyperLogLog 计数的接口 ``hll_count``,这个接口是" +"个近似的估计接口, 当数据量很大时,能较快的对数据的唯一个数进行估计。" msgstr "" "DataFrame provides the ``hll_count`` API to use HyperLogLog counting for " "columns. 
This API estimates the unique quantity among large volumes of " diff --git a/docs/source/locale/en/LC_MESSAGES/df-basic.po b/docs/source/locale/en/LC_MESSAGES/df-basic.po index bbfe8c67..bc41ffa2 100644 --- a/docs/source/locale/en/LC_MESSAGES/df-basic.po +++ b/docs/source/locale/en/LC_MESSAGES/df-basic.po @@ -23,10 +23,12 @@ msgstr "Concepts" #: ../../source/df-basic.rst:7 msgid "" -"在使用 DataFrame 时,你需要了解三个对象上的操作:\\ ``Collection``\\ (``DataFrame``) ,\\ " -"``Sequence``\\ ,\\ ``Scalar``\\ 。 " -"这三个对象分别表示表结构(或者二维结构)、列(一维结构)、标量。需要注意的是,这些对象仅在使用 Pandas 数据创建后会包含实际数据, 而在 " -"ODPS 表上创建的对象中并不包含实际的数据,而仅仅包含对这些数据的操作,实质的存储和计算会在 ODPS 中进行。" +"在使用 DataFrame 时,你需要了解三个对象上的操作:\\ ``Collection``\\ (``" +"DataFrame``) ,\\ ``Sequence``\\ ,\\ ``Scalar``\\ 。 这三个对象分别表示" +"表结构(或者二维结构)、列(一维结构)、标量。需要注意的是,这些对象仅在" +"使用 Pandas 数据创建后会包含实际数据, 而在 ODPS 表上创建的对象中并不包含" +"实际的数据,而仅仅包含对这些数据的操作,实质的存储和计算会在 ODPS 中进行" +"。" msgstr "" "When using DataFrame, you need to know the operations on the following " "objects: \\ ``Collection``\\ (``DataFrame``), \\ ``Sequence``\\, and \\ " @@ -42,10 +44,12 @@ msgstr "Create a DataFrame object" #: ../../source/df-basic.rst:14 msgid "" -"通常情况下,你唯一需要直接创建的 Collection 对象是 :class:`DataFrame`,这一对象用于引用数据源,可能是一个 ODPS" -" 表, ODPS 分区,Pandas DataFrame或sqlalchemy.Table(数据库表)。 " -"使用这几种数据源时,相关的操作相同,这意味着你可以不更改数据处理的代码,仅仅修改输入/输出的指向, " -"便可以简单地将小数据量上本地测试运行的代码迁移到 ODPS 上,而迁移的正确性由 PyODPS 来保证。" +"通常情况下,你唯一需要直接创建的 Collection 对象是 :class:`DataFrame`," +"这一对象用于引用数据源,可能是一个 ODPS 表, ODPS 分区,Pandas DataFrame" +"或sqlalchemy.Table(数据库表)。 使用这几种数据源时,相关的操作相同,这" +"意味着你可以不更改数据处理的代码,仅仅修改输入/输出的指向, 便可以简单地" +"将小数据量上本地测试运行的代码迁移到 ODPS 上,而迁移的正确性由 PyODPS 来" +"保证。" msgstr "" "The only Collection object you need to directly create is " ":class:`DataFrame`. This object is used to reference the data source, " @@ -57,8 +61,8 @@ msgstr "" #: ../../source/df-basic.rst:19 msgid "" -"创建 DataFrame 非常简单,只需将 Table 对象、 pandas DataFrame 对象或者 sqlalchemy Table " -"对象传入即可。" +"创建 DataFrame 非常简单,只需将 Table 对象、 pandas DataFrame 对象或者 " +"sqlalchemy Table 对象传入即可。" msgstr "" "Creating a DataFrame object is as simple as passing in a Table object, a " "pandas DataFrame object, or a sqlalchemy Table object." 
@@ -75,9 +79,8 @@ msgid "" ">>> pt_df = " "DataFrame(o.get_table('partitioned_table').get_partition('pt=20171111'))" "\n" -">>> pt_df2 = " -"o.get_table('partitioned_table').get_partition('pt=20171111').to_df() # " -"使用分区的to_df方法\n" +">>> pt_df2 = o.get_table('partitioned_table').get_partition('pt=20171111'" +").to_df() # 使用分区的to_df方法\n" ">>>\n" ">>> # 从 Pandas DataFrame 创建\n" ">>> import pandas as pd\n" @@ -124,11 +127,12 @@ msgstr "" #: ../../source/df-basic.rst:44 msgid "" -"在用 pandas DataFrame 初始化时,对于 numpy object 类型或者 string 类型,PyODPS DataFrame " -"会尝试推断类型, 如果一整列都为空,则会报错。这时,用户可以指定 `unknown_as_string` " -"为True,会将这些列指定为string类型。 用户也可以指定 as_type 参数。若类型为基本类型,会在创建 PyODPS DataFrame" -" 时进行强制类型转换。 如果 Pandas DataFrame 中包含 list 或者 dict 列,该列的类型不会被推断,必须手动使用 " -"as_type 指定。 as_type 参数类型必须是dict。" +"在用 pandas DataFrame 初始化时,对于 numpy object 类型或者 string 类型," +"PyODPS DataFrame 会尝试推断类型, 如果一整列都为空,则会报错。这时,用户" +"可以指定 `unknown_as_string` 为True,会将这些列指定为string类型。 用户也" +"可以指定 as_type 参数。若类型为基本类型,会在创建 PyODPS DataFrame 时进行" +"强制类型转换。 如果 Pandas DataFrame 中包含 list 或者 dict 列,该列的类型" +"不会被推断,必须手动使用 as_type 指定。 as_type 参数类型必须是dict。" msgstr "" "When initializing a DataFrame object by using a pandas DataFrame object, " "PyODPS attempts to determine the type of the column if it is a numpy " @@ -152,7 +156,8 @@ msgid "" " petallength float64\n" " petalwidth float64\n" " name string\n" -" null_col1 string # 无法识别,通过unknown_as_string设置成string类型\n" +" null_col1 string # 无法识别,通过unknown_as_string设置成" +"string类型\n" " null_col2 float64 # 强制转换成float类型\n" "}\n" ">>> df4 = DataFrame(df3, as_type={'list_col': 'list'})\n" @@ -189,8 +194,8 @@ msgstr "Sequence" #: ../../source/df-basic.rst:73 msgid "" -":class:`SequenceExpr` 代表了二维数据集中的一列。你不应当手动创建 SequenceExpr,而应当从一个 " -"Collection 中获取。" +":class:`SequenceExpr` 代表了二维数据集中的一列。你不应当手动创建 " +"SequenceExpr,而应当从一个 Collection 中获取。" msgstr "" ":class:`SequenceExpr` represents a column in a two-dimensional data set. " "You are not allowed to manually create SequenceExpr objects. Instead, you" @@ -217,8 +222,8 @@ msgstr "" #: ../../source/df-basic.rst:92 msgid "" -"如果列名存储在一个字符串变量中,除了使用 getattr(df, 'column_name') 达到相同的效果外,也可以使用 " -"df[column_name] 的形式,例如" +"如果列名存储在一个字符串变量中,除了使用 getattr(df, 'column_name') 达到" +"相同的效果外,也可以使用 df[column_name] 的形式,例如" msgstr "" "If the column name is stored in a string variable, apart from using " "getattr(df, 'column_name'), df[column_name] can also be used to achieve " @@ -241,8 +246,9 @@ msgstr "Column type" #: ../../source/df-basic.rst:108 msgid "" -"DataFrame包括自己的类型系统,在使用Table初始化的时候,ODPS的类型会被进行转换。这样做的好处是,能支持更多的计算后端。 " -"目前,DataFrame的执行后端支持ODPS SQL、pandas以及数据库(MySQL和Postgres)。" +"DataFrame包括自己的类型系统,在使用Table初始化的时候,ODPS的类型会被进行" +"转换。这样做的好处是,能支持更多的计算后端。 目前,DataFrame的执行后端" +"支持ODPS SQL、pandas以及数据库(MySQL和Postgres)。" msgstr "" "DataFrame has its own type system. When performing the Table " "initialization, the MaxCompute types are cast. 
This design provides " @@ -256,8 +262,8 @@ msgstr "PyODPS DataFrame includes the following types:" #: ../../source/df-basic.rst:113 msgid "" -"``int8``\\ ,\\ ``int16``\\ ,\\ ``int32``\\ ,\\ ``int64``\\ ,\\ " -"``float32``\\ ,\\ ``float64``\\ ,\\ ``boolean``\\ ,\\ ``string``\\ ,\\ " +"``int8``\\ ,\\ ``int16``\\ ,\\ ``int32``\\ ,\\ ``int64``\\ ,\\ ``" +"float32``\\ ,\\ ``float64``\\ ,\\ ``boolean``\\ ,\\ ``string``\\ ,\\ " "``decimal``\\ ,\\ ``datetime``\\ ,\\ ``list``\\ ,\\ ``dict``" msgstr "" "``int8``\\, \\ ``int16``\\, \\ ``int32``\\, \\ ``int64``\\, \\ " @@ -326,8 +332,8 @@ msgstr "" #: ../../source/df-basic.rst:131 msgid "" -"list 和 dict 必须填写其包含值的类型,否则会报错。目前 DataFrame 暂不支持 MaxCompute 2.0 中新增的 " -"Timestamp 及 Struct 类型,未来的版本会支持。" +"list 和 dict 必须填写其包含值的类型,否则会报错。目前 DataFrame 暂不支持 " +"MaxCompute 2.0 中新增的 Timestamp 及 Struct 类型,未来的版本会支持。" msgstr "" "For list and dict types, the types of their containing values must be " "specified, or an error occurs. Currently, DataFrame does not support the " @@ -345,7 +351,9 @@ msgid "" msgstr "" #: ../../source/df-basic.rst:141 -msgid "如果要修改一列的类型,可以使用 astype 方法。该方法输入一个类型,并返回类型转换后的 Sequence。例如," +msgid "" +"如果要修改一列的类型,可以使用 astype 方法。该方法输入一个类型,并返回" +"类型转换后的 Sequence。例如," msgstr "" "You can use the astype method to change the type of an entire column. " "This method requires a type as input and returns the converted Sequence " @@ -367,7 +375,9 @@ msgid "列名" msgstr "Column name" #: ../../source/df-basic.rst:157 -msgid "在 DataFrame 的计算过程中,一个 Sequence 必须要有列名。在很多情况下,DataFrame 会起一个名字。比如:" +msgid "" +"在 DataFrame 的计算过程中,一个 Sequence 必须要有列名。在很多情况下," +"DataFrame 会起一个名字。比如:" msgstr "" "In the calculation of DataFrame, a Sequence object must have a column " "name. At most times, DataFrame creates names automatically. For example: " @@ -383,8 +393,9 @@ msgstr "" #: ../../source/df-basic.rst:167 msgid "" -"可以看到,\\ ``sepalwidth``\\ 取最大值后被命名为\\ ``sepalwidth_max``\\ 。还有一些操作,比如一个 " -"Sequence 做加法,加上一个 Scalar,这时,会被命名为这个 Sequence 的名字。其它情况下,需要用户去自己命名。" +"可以看到,\\ ``sepalwidth``\\ 取最大值后被命名为\\ ``sepalwidth_max``\\ " +"。还有一些操作,比如一个 Sequence 做加法,加上一个 Scalar,这时,会被命名" +"为这个 Sequence 的名字。其它情况下,需要用户去自己命名。" msgstr "" "As you can see, \\ ``sepalwidth``\\ was named \\ ``sepalwidth_max``\\ " "after obtaining the maximum value. Other operations, such as adding a " @@ -407,7 +418,9 @@ msgid "" msgstr "" #: ../../source/df-basic.rst:182 -msgid "需要注意的是,rename 操作并不是就地重命名。如果要在 Collection 上应用新的列名,需要重新做列选择,例如" +msgid "" +"需要注意的是,rename 操作并不是就地重命名。如果要在 Collection 上应用新的" +"列名,需要重新做列选择,例如" msgstr "" "Note that rename method does not actually rename the column in place. If " "you want to apply new names in your collection, you need to select " @@ -431,8 +444,9 @@ msgstr "Simple column transformations" #: ../../source/df-basic.rst:198 msgid "" -"你可以对一个 Sequence 进行运算,返回一个新的 Sequence,正如对简单的 Python 变量进行运算一样。对数值列, " -"Sequence 支持四则运算,而对字符串则支持字符串相加等操作。例如," +"你可以对一个 Sequence 进行运算,返回一个新的 Sequence,正如对简单的 " +"Python 变量进行运算一样。对数值列, Sequence 支持四则运算,而对字符串则" +"支持字符串相加等操作。例如," msgstr "" "You can perform operations on a Sequence object to get a new Sequence, " "just as you do with a simple Python variable. For numeric columns, " @@ -466,7 +480,9 @@ msgid "" msgstr "" #: ../../source/df-basic.rst:223 -msgid "注意到两列参与运算,因而 PyODPS 无法确定最终显示的列名,需要手动指定。详细的列变换说明,请参见 :ref:`dfelement`。" +msgid "" +"注意到两列参与运算,因而 PyODPS 无法确定最终显示的列名,需要手动指定。" +"详细的列变换说明,请参见 :ref:`dfelement`。" msgstr "" "Note that when two columns are involved in operations, you need to " "manually specify the name of the result column. 
For details about column " @@ -478,9 +494,9 @@ msgstr "Collection" #: ../../source/df-basic.rst:227 msgid "" -"DataFrame 中所有二维数据集上的操作都属于 :class:`CollectionExpr`,可视为一张 ODPS " -"表或一张电子表单,DataFrame 对象也是 CollectionExpr 的特例。CollectionExpr " -"中包含针对二维数据集的列操作、筛选、变换等大量操作。" +"DataFrame 中所有二维数据集上的操作都属于 :class:`CollectionExpr`,可视为" +"一张 ODPS 表或一张电子表单,DataFrame 对象也是 CollectionExpr 的特例。" +"CollectionExpr 中包含针对二维数据集的列操作、筛选、变换等大量操作。" msgstr "" "CollectionExpr supports all operations on DataFrame two-dimensional " "datasets. It can be seen as a MaxCompute table or a spreadsheet. " @@ -494,8 +510,8 @@ msgstr "Retrieve types" #: ../../source/df-basic.rst:233 msgid "" -"``dtypes``\\ 可以用来获取 CollectionExpr 中所有列的类型。``dtypes`` 返回的是 :ref:`Schema类型" -" ` 。" +"``dtypes``\\ 可以用来获取 CollectionExpr 中所有列的类型。``dtypes`` 返回" +"的是 :ref:`Schema类型 ` 。" msgstr "" "You can use the ``dtypes`` method to retrieve the types of all columns in" " a CollectionExpr object. ``dtypes`` returns a :ref:`Schema type." @@ -517,7 +533,9 @@ msgid "列选择和增删" msgstr "Select, add, and delete columns" #: ../../source/df-basic.rst:250 -msgid "如果要从一个 CollectionExpr 中选取部分列,产生新的数据集,可以使用 expr[columns] 语法。例如," +msgid "" +"如果要从一个 CollectionExpr 中选取部分列,产生新的数据集,可以使用 expr[" +"columns] 语法。例如," msgstr "" "You can use the expr[columns] syntax to select a certain amount of " "columns from a CollectionExpr object and form a new dataset. For example:" @@ -535,8 +553,9 @@ msgstr "" #: ../../source/df-basic.rst:264 msgid "" -"**注意**\\ :如果需要选择的列只有一列,需要在 columns 后加上逗号或者显示标记为列表,例如 df[df.sepal_length, " -"] 或 df[[df.sepal_length]],否则返回的将是一个 Sequence 对象,而不是 Collection。" +"**注意**\\ :如果需要选择的列只有一列,需要在 columns 后加上逗号或者显示" +"标记为列表,例如 df[df.sepal_length, ] 或 df[[df.sepal_length]],否则返回" +"的将是一个 Sequence 对象,而不是 Collection。" msgstr "" "**Note**\\: If only one column is needed, you need to add a comma (,) " "after the column name or explicitly mark the column as a list, for " @@ -581,15 +600,17 @@ msgstr "" #: ../../source/df-basic.rst:293 msgid "" -"如果我们需要在已有 collection 中添加某一列变换的结果,也可以使用 expr[expr, new_sequence] 语法, " -"新增的列会作为新 collection 的一部分。" +"如果我们需要在已有 collection 中添加某一列变换的结果,也可以使用 expr[" +"expr, new_sequence] 语法, 新增的列会作为新 collection 的一部分。" msgstr "" "You can also use the expr[expr, new_sequence] syntax to add a new " "sequence transformation to the original collection. The new sequence is " "part of the new collection." #: ../../source/df-basic.rst:296 -msgid "下面的例子将 iris 中的 sepalwidth 列加一后重命名为 sepalwidthplus1 并追加到数据集末尾,形成新的数据集:" +msgid "" +"下面的例子将 iris 中的 sepalwidth 列加一后重命名为 sepalwidthplus1 并追加" +"到数据集末尾,形成新的数据集:" msgstr "" "The following example illustrates that, we create a new sequence by " "adding one to each element in the original sepalwidth column of iris, " @@ -616,8 +637,8 @@ msgstr "" #: ../../source/df-basic.rst:315 msgid "" -"使用 `df[df, new_sequence]` 需要注意的是,变换后的列名与原列名可能相同,如果需要与原 collection 合并, " -"请将该列重命名。" +"使用 `df[df, new_sequence]` 需要注意的是,变换后的列名与原列名可能相同," +"如果需要与原 collection 合并, 请将该列重命名。" msgstr "" "When using `df[df, new_sequence]`, note that the transformed column may " "have the same name as the original column. Rename the new column if you " @@ -649,7 +670,9 @@ msgid "" msgstr "" #: ../../source/df-basic.rst:338 -msgid "我们也可以先将原列通过 exclude 方法进行排除,再将变换后的新列并入,而不必担心重名。" +msgid "" +"我们也可以先将原列通过 exclude 方法进行排除,再将变换后的新列并入,而不必" +"担心重名。" msgstr "" "You can also use the exclude method to exclude the original column before" " appending the new one to the dataset." 
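The column addition and exclusion patterns above can be combined as in this short sketch, reusing the ``iris`` collection from the surrounding examples::

    # append a transformed column under a new, non-conflicting name
    df2 = iris[iris, (iris.sepalwidth + 1).rename('sepalwidthplus1')]

    # exclude the original column first, then add the transformed column
    # back under the original name without a clash
    df3 = iris[iris.exclude('sepalwidth'),
               (iris.sepalwidth * 2).rename('sepalwidth')]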
@@ -685,8 +708,9 @@ msgstr "" #: ../../source/df-basic.rst:363 msgid "" -"增删列以创建新 collection 的另一种方法是调用 select 方法,将需要选择的列作为参数输入。如果需要重命名,使用 keyword " -"参数输入,并将新的列名作为参数名即可。" +"增删列以创建新 collection 的另一种方法是调用 select 方法,将需要选择的列" +"作为参数输入。如果需要重命名,使用 keyword 参数输入,并将新的列名作为参数" +"名即可。" msgstr "" "The select method provides another way to create a new collection, " "keyword argument will rename the provided sequence to the given keyword." @@ -704,8 +728,9 @@ msgstr "" #: ../../source/df-basic.rst:376 msgid "" -"此外,我们也可以传入一个 lambda 表达式,它接收一个参数,接收上一步的结果。在执行时,PyODPS 会检查这些 lambda " -"表达式,传入上一步生成的 collection 并将其替换为正确的列。" +"此外,我们也可以传入一个 lambda 表达式,它接收一个参数,接收上一步的结果" +"。在执行时,PyODPS 会检查这些 lambda 表达式,传入上一步生成的 collection " +"并将其替换为正确的列。" msgstr "" "You can also pass in a lambda expression, which takes the result from the" " previous operation as a parameter. When executed, PyODPS checks the " @@ -746,7 +771,9 @@ msgid "引入常数和随机数" msgstr "Constants and random numbers" #: ../../source/df-basic.rst:404 -msgid "DataFrame 支持在 collection 中追加一列常数。追加常数需要使用 :class:`Scalar`,引入时需要手动指定列名,如" +msgid "" +"DataFrame 支持在 collection 中追加一列常数。追加常数需要使用 :class:`" +"Scalar`,引入时需要手动指定列名,如" msgstr "" "DataFrame allows you to append a column of constants to a Collection " "object. Scalar is required and you need to manually specify the column " @@ -806,7 +833,9 @@ msgid "" msgstr "" #: ../../source/df-basic.rst:446 -msgid "需要注意的是,这种写法无法自动识别空值的类型,所以在增加空值列时,仍然要使用" +msgid "" +"需要注意的是,这种写法无法自动识别空值的类型,所以在增加空值列时,仍然要" +"使用" msgstr "" "Note that this method cannot automatically recognize the type of null " "values. Add null columns as follows: " @@ -830,8 +859,9 @@ msgstr "" #: ../../source/df-basic.rst:460 msgid "" -"DataFrame 也支持在 collection 中增加一列随机数列,该列类型为 float,范围为 0 - 1,每行数值均不同。 " -"追加随机数列需要使用 :class:`RandomScalar`,参数为随机数种子,可省略。" +"DataFrame 也支持在 collection 中增加一列随机数列,该列类型为 float,范围" +"为 0 - 1,每行数值均不同。 追加随机数列需要使用 :class:`RandomScalar`," +"参数为随机数种子,可省略。" msgstr "" "DataFrame also allows you to append a column of random numbers to a " "Collection object. The column type is float and the value range is 0-1. " @@ -965,7 +995,9 @@ msgid "" msgstr "" #: ../../source/df-basic.rst:561 -msgid "对于Collection,如果它包含一个列是boolean类型,则可以直接使用该列作为过滤条件。" +msgid "" +"对于Collection,如果它包含一个列是boolean类型,则可以直接使用该列作为过滤" +"条件。" msgstr "" "If a Collection object contains a boolean column, you can use it directly" " as a filter condition." @@ -984,7 +1016,9 @@ msgid "" msgstr "" #: ../../source/df-basic.rst:576 -msgid "因此,记住对Collection取单个squence的操作时,只有boolean列是合法的,即对Collection作过滤操作。" +msgid "" +"因此,记住对Collection取单个squence的操作时,只有boolean列是合法的,即对" +"Collection作过滤操作。" msgstr "" "When retrieving a single sequence from a Collection object, only the " "boolean column can be used as a valid filter condition." @@ -1008,9 +1042,9 @@ msgstr "" #: ../../source/df-basic.rst:588 msgid "" -"同时,我们也支持Pandas中的\\ ``query``\\方法,用查询语句来做数据的筛选,在表达式中直接使用列名如\\ " -"``sepallength``\\进行操作, 另外在查询语句中\\ ``&``\\和\\ ``and``\\都表示与操作,\\ " -"``|``\\和\\ ``or``\\都表示或操作。" +"同时,我们也支持Pandas中的\\ ``query``\\方法,用查询语句来做数据的筛选," +"在表达式中直接使用列名如\\ ``sepallength``\\进行操作, 另外在查询语句中\\" +" ``&``\\和\\ ``and``\\都表示与操作,\\ ``|``\\和\\ ``or``\\都表示或操作。" msgstr "" "The \\ ``query``\\ method in pandas is also supported. You can write " "query statements to filter data. 
Column names such as \\ " @@ -1075,9 +1109,9 @@ msgstr "" #: ../../source/df-basic.rst:622 msgid "" -"支持部分运算符:\\ ``+``\\,\\ ``-``\\,\\ ``*``\\,\\ ``/``\\,\\ ``//``\\,\\ " -"``%``\\,\\ ``**``\\, \\ ``==``\\,\\ ``!=``\\,\\ ``<``\\,\\ ``<=``\\,\\ " -"``>``\\,\\ ``>=``\\,\\ ``in``\\,\\ ``not in``\\" +"支持部分运算符:\\ ``+``\\,\\ ``-``\\,\\ ``*``\\,\\ ``/``\\,\\ ``//``" +"\\,\\ ``%``\\,\\ ``**``\\, \\ ``==``\\,\\ ``!=``\\,\\ ``<``\\,\\ ``<" +"=``\\,\\ ``>``\\,\\ ``>=``\\,\\ ``in``\\,\\ ``not in``\\" msgstr "" "The following operators are supported: \\ ``+``\\, \\ ``-``\\, \\ " "``*``\\, \\ ``/``\\, \\ ``//``\\, \\ ``%``\\, \\ ``**``\\, \\ ``==``\\, " @@ -1089,7 +1123,9 @@ msgid "bool" msgstr "" #: ../../source/df-basic.rst:624 -msgid "与或非操作,其中 \\ ``&``\\ 和 \\ ``and``\\ 表示与,\\ ``|``\\ 和 \\ ``or``\\ 表示或" +msgid "" +"与或非操作,其中 \\ ``&``\\ 和 \\ ``and``\\ 表示与,\\ ``|``\\ 和 \\ ``or" +"``\\ 表示或" msgstr "" "Ampersands (\\ ``&``\\ ) and \\ ``and``\\ represent the AND operator. " "Vertical bars (\\ ``|``\\) and \\ ``or``\\ represent the OR operator." @@ -1116,9 +1152,10 @@ msgstr "Lateral view" #: ../../source/df-basic.rst:633 msgid "" -"对于 list 及 map 类型的列,explode 方法会将该列转换为多行输出。使用 apply 方法也可以输出多行。 " -"为了进行聚合等操作,常常需要将这些输出和原表中的列合并。此时可以使用 DataFrame 提供的并列多行输出功能, " -"写法为将多行输出函数生成的集合与原集合中的列名一起映射。" +"对于 list 及 map 类型的列,explode 方法会将该列转换为多行输出。使用 apply" +" 方法也可以输出多行。 为了进行聚合等操作,常常需要将这些输出和原表中的列" +"合并。此时可以使用 DataFrame 提供的并列多行输出功能, 写法为将多行输出" +"函数生成的集合与原集合中的列名一起映射。" msgstr "" "For list and map columns, the ``explode`` method can convert the columns " "into multiple rows for output. Functions passed into the ``apply`` method" @@ -1156,8 +1193,9 @@ msgstr "" #: ../../source/df-basic.rst:661 msgid "" -"如果多行输出方法对某个输入不产生任何输出,默认输入行将不在最终结果中出现。如果需要在结果中出现该行,可以设置 " -"``keep_nulls=True``。此时,与该行并列的值将输出为空值:" +"如果多行输出方法对某个输入不产生任何输出,默认输入行将不在最终结果中出现" +"。如果需要在结果中出现该行,可以设置 ``keep_nulls=True``。此时,与该行" +"并列的值将输出为空值:" msgstr "" "When the method does not produce any output for some rows, these rows " "will not appear in the output, which is often not expected. To preserve " @@ -1184,8 +1222,8 @@ msgstr "" #: ../../source/df-basic.rst:680 msgid "" -"关于 explode 使用并列输出的具体文档可参考 :ref:`dfcollections`,对于 apply 方法使用并列输出的例子可参考 " -":ref:`dfudtfapp`。" +"关于 explode 使用并列输出的具体文档可参考 :ref:`dfcollections`,对于 " +"apply 方法使用并列输出的例子可参考 :ref:`dfudtfapp`。" msgstr "" "For details about the explode method, see :ref:`dfcollections`. For " "examples of the apply method, see :ref:`dfudtfapp`." @@ -1204,7 +1242,9 @@ msgid "" msgstr "" #: ../../source/df-basic.rst:694 -msgid "值得注意的是,目前切片对于ODPS SQL后端不支持start和step。我们也可以使用limit方法" +msgid "" +"值得注意的是,目前切片对于ODPS SQL后端不支持start和step。我们也可以使用" +"limit方法" msgstr "" "Note that for MaxCompute SQL backend, start and step are not supported in" " slice operations. You can also use the limit method." @@ -1235,7 +1275,8 @@ msgstr "Deferred execution" #: ../../source/df-basic.rst:716 msgid "" "DataFrame上的所有操作并不会立即执行,只有当用户显式调用\\ ``execute``\\ " -"方法,或者一些立即执行的方法时(内部调用的就是\\ ``execute``\\ ),才会真正去执行。" +"方法,或者一些立即执行的方法时(内部调用的就是\\ ``execute``\\ ),才会" +"真正去执行。" msgstr "" "The operations in DataFrame are not automatically executed. 
They are only" " executed when you explicitly call the \\ `ʻexecute`` \\ action or " @@ -1299,7 +1340,9 @@ msgid "to_pandas" msgstr "" #: ../../source/df-basic.rst:727 -msgid "转化为 Pandas DataFrame 或者 Series,wrap 参数为 True 的时候,返回 PyODPS DataFrame 对象" +msgid "" +"转化为 Pandas DataFrame 或者 Series,wrap 参数为 True 的时候,返回 PyODPS" +" DataFrame 对象" msgstr "" "Converts a collection object to a pandas DataFrame object or a Sequence " "object to a Series object. When the wrap parameter is set to Ture, a " @@ -1322,8 +1365,8 @@ msgstr "Plotting-related methods." #: ../../source/df-basic.rst:733 msgid "" -"**注意**\\ :在交互式环境下,PyODPS DataFrame会在打印或者repr的时候,调用\\ ``execute``\\ " -"方法,这样省去了用户手动去调用execute。" +"**注意**\\ :在交互式环境下,PyODPS DataFrame会在打印或者repr的时候,调用" +"\\ ``execute``\\ 方法,这样省去了用户手动去调用execute。" msgstr "" "**Note**\\: In an interactive environment, PyODPS DataFrame objects " "automatically call the ``execute`` method when printing or repr is " @@ -1379,7 +1422,9 @@ msgid "" msgstr "" #: ../../source/df-basic.rst:779 -msgid "此时打印或者 repr 对象,会显示整棵抽象语法树。如果需要执行,则必须手动调用 ``execute`` 方法:" +msgid "" +"此时打印或者 repr 对象,会显示整棵抽象语法树。如果需要执行,则必须手动" +"调用 ``execute`` 方法:" msgstr "" "Now the entire abstract syntax tree is displayed when printing or repr is" " called. When you want to actually execute this DataFrame, you have to " @@ -1401,7 +1446,9 @@ msgid "读取执行结果" msgstr "Obtaining execution results" #: ../../source/df-basic.rst:794 -msgid "``execute`` 或 ``head`` 函数输出的结果为 ``ResultFrame`` 类型,可从中读取结果。" +msgid "" +"``execute`` 或 ``head`` 函数输出的结果为 ``ResultFrame`` 类型,可从中读取" +"结果。" msgstr "" "Outputs of ``execute`` or ``head`` methods are ``ResultFrame`` instances " "where you can obtain execution results." @@ -1431,8 +1478,8 @@ msgstr "" #: ../../source/df-basic.rst:814 msgid "" -"ResultFrame 也支持在安装有 pandas 的前提下转换为 pandas DataFrame 或使用 pandas 后端的 PyODPS" -" DataFrame:" +"ResultFrame 也支持在安装有 pandas 的前提下转换为 pandas DataFrame 或使用 " +"pandas 后端的 PyODPS DataFrame:" msgstr "" "When pandas is installed, a ResultFrame can be converted into a pandas " "DataFrame or a PyODPS DataFrame with pandas backend." @@ -1440,8 +1487,8 @@ msgstr "" #: ../../source/df-basic.rst:817 msgid "" ">>> pd_df = iris.head(3).to_pandas() # 返回 pandas DataFrame\n" -">>> wrapped_df = iris.head(3).to_pandas(wrap=True) # 返回使用 Pandas 后端的 " -"PyODPS DataFrame" +">>> wrapped_df = iris.head(3).to_pandas(wrap=True) # 返回使用 Pandas 后" +"端的 PyODPS DataFrame" msgstr "" ">>> pd_df = iris.head(3).to_pandas() # returns a pandas DataFrame\n" ">>> wrapped_df = iris.head(3).to_pandas(wrap=True) # returns a PyODPS " @@ -1449,8 +1496,8 @@ msgstr "" #: ../../source/df-basic.rst:823 msgid "" -"关于如何使用 pandas,请参考 `pandas 文档 `_ 。pandas " -"为开源库,ODPS 不对其结果负责。" +"关于如何使用 pandas,请参考 `pandas 文档 `_ 。pandas 为开源库,ODPS 不对其结果负责。" msgstr "" #: ../../source/df-basic.rst:827 @@ -1458,7 +1505,9 @@ msgid "保存执行结果为 ODPS 表" msgstr "Save results to MaxCompute tables" #: ../../source/df-basic.rst:829 -msgid "对 Collection,我们可以调用\\ ``persist``\\ 方法,参数为表名。返回一个新的DataFrame对象" +msgid "" +"对 Collection,我们可以调用\\ ``persist``\\ 方法,参数为表名。返回一个新" +"的DataFrame对象" msgstr "" "For collection objects, you can use the \\ ``persist``\\ method, which " "takes a table name as the parameter and returns a new DataFrame object." 
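A minimal sketch of the ``persist`` call described here; the filter condition and the target table name are placeholders for this example::

    # write the filtered collection into an ODPS table and keep working
    # with the returned DataFrame
    df2 = iris[iris.sepalwidth < 2.5].persist('pyodps_iris_small')
    print(df2.head(3))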
@@ -1477,9 +1526,11 @@ msgstr "" #: ../../source/df-basic.rst:842 msgid "" -"``persist``\\ 可以传入 partitions 参数。加入该参数后,会创建一个分区表,它的分区字段为 partitions " -"列出的字段, DataFrame 中相应字段的值决定该行将被写入的分区。例如,当 partitions 为 ['name'] 且某行 name " -"的值为 test, 那么该行将被写入分区 ``name=test``。这适用于当分区需要通过计算获取的情形。" +"``persist``\\ 可以传入 partitions 参数。加入该参数后,会创建一个分区表," +"它的分区字段为 partitions 列出的字段, DataFrame 中相应字段的值决定该行将" +"被写入的分区。例如,当 partitions 为 ['name'] 且某行 name 的值为 test, " +"那么该行将被写入分区 ``name=test``。这适用于当分区需要通过计算获取的情形" +"。" msgstr "" "You can pass in a partitions parameter to ``persist``\\. A table is " "created with partition fields specified by the parameter. Data in the " @@ -1507,9 +1558,11 @@ msgstr "" #: ../../source/df-basic.rst:861 msgid "" -"如果想写入已经存在的表的某个分区,``persist``\\ 可以传入 partition 参数,指明写入表的哪个分区(如ds=******)。 " -"这时要注意,该DataFrame的每个字段都必须在该表存在,且类型相同。drop_partition和create_partition参数只有在此时有效," -" 分别表示是否要删除(如果分区存在)或创建(如果分区不存在)该分区。" +"如果想写入已经存在的表的某个分区,``persist``\\ 可以传入 partition 参数," +"指明写入表的哪个分区(如ds=******)。 这时要注意,该DataFrame的每个字段都" +"必须在该表存在,且类型相同。drop_partition和create_partition参数只有在" +"此时有效, 分别表示是否要删除(如果分区存在)或创建(如果分区不存在)该" +"分区。" msgstr "" "To write to a partition of an existing table, you can pass in a partition" " parameter to ``persist``\\ to indicate which partition is the target " @@ -1527,9 +1580,9 @@ msgstr "" #: ../../source/df-basic.rst:869 msgid "" -"persist 时,默认会覆盖原有数据。例如,当 persist " -"到一张分区表,对应分区的数据将会被重写。如果写入一张非分区表,整张表的数据都将被重写。如果你想要追加数据,可以使用参数 " -"``overwrite=False`` 。" +"persist 时,默认会覆盖原有数据。例如,当 persist 到一张分区表,对应分区的" +"数据将会被重写。如果写入一张非分区表,整张表的数据都将被重写。如果你想要" +"追加数据,可以使用参数 ``overwrite=False`` 。" msgstr "" "Persisting a DataFrame will overwrite existing data by default. For " "instance, when persisting into a partitioned table, data in corresponding" @@ -1549,8 +1602,8 @@ msgstr "" #: ../../source/df-basic.rst:877 msgid "" -"如果数据源中没有 ODPS 对象,例如数据源仅为 Pandas,在 persist 时需要手动指定 ODPS 入口对象, " -"或者将需要的入口对象标明为全局对象,如:" +"如果数据源中没有 ODPS 对象,例如数据源仅为 Pandas,在 persist 时需要手动" +"指定 ODPS 入口对象, 或者将需要的入口对象标明为全局对象,如:" msgstr "" "If the data source contains no ODPS objects, for example, only pandas " "data, you need to manually specify the ODPS object or mark the object as " @@ -1577,7 +1630,9 @@ msgid "保存执行结果为 Pandas DataFrame" msgstr "Save results to pandas DataFrame" #: ../../source/df-basic.rst:892 -msgid "我们可以使用 ``to_pandas``\\ 方法,如果wrap参数为True,将返回PyODPS DataFrame对象。" +msgid "" +"我们可以使用 ``to_pandas``\\ 方法,如果wrap参数为True,将返回PyODPS " +"DataFrame对象。" msgstr "" "You can use the ``to_pandas``\\ method. If wrap is set to True, a PyODPS " "DataFrame object is returned." @@ -1592,9 +1647,10 @@ msgstr "" #: ../../source/df-basic.rst:903 msgid "" -"``to_pandas`` 返回的 pandas DataFrame 与直接通过 pandas 创建的 DataFrame 没有任何区别, " -"数据的存储和计算均在本地。如果 ``wrap=True``,生成的即便是 PyODPS DataFrame,数据依然在本地。 " -"如果你的数据很大,或者运行环境的内存限制较为严格,请谨慎使用 ``to_pandas``。" +"``to_pandas`` 返回的 pandas DataFrame 与直接通过 pandas 创建的 DataFrame " +"没有任何区别, 数据的存储和计算均在本地。如果 ``wrap=True``,生成的即便是" +" PyODPS DataFrame,数据依然在本地。 如果你的数据很大,或者运行环境的内存" +"限制较为严格,请谨慎使用 ``to_pandas``。" msgstr "" "There are no differences between pandas DataFrames returned by " "``to_pandas()`` calls and pandas DataFrames originally created by pandas." 
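The ``persist`` variants discussed above can be sketched as follows; the table names and the partition value are hypothetical, and passing the entry object via ``odps`` is only needed when the DataFrame has no ODPS data source (for example, a pure pandas source)::

    # dynamic partitions: one partition per distinct value of the 'name' column
    df.persist('output_table', partitions=['name'])

    # write into a fixed partition of an existing table, creating the
    # partition if it does not exist (partition value is hypothetical)
    df.persist('existing_table', partition='ds=20240101', create_partition=True)

    # append to existing data instead of overwriting it
    df.persist('output_table', overwrite=False)

    # pandas-only source: pass the ODPS entry object explicitly
    local_df.persist('output_table', odps=o)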
@@ -1610,8 +1666,8 @@ msgstr "Set runtime parameters" #: ../../source/df-basic.rst:910 msgid "" -"对于立即执行的方法,比如 ``execute``、``persist``、``to_pandas`` 等,可以设置运行时参数(仅对ODPS " -"SQL后端有效 )。" +"对于立即执行的方法,比如 ``execute``、``persist``、``to_pandas`` 等,可以" +"设置运行时参数(仅对ODPS SQL后端有效 )。" msgstr "" "For actions such as `execute``, ``persist``, and ``to_pandas``, you can " "set runtime parameters. This is only valid for MaxCompute SQL." @@ -1623,7 +1679,9 @@ msgstr "" "parameters `." #: ../../source/df-basic.rst:914 -msgid "也可以在这些立即执行的方法上,使用 ``hints`` 参数。这样,这些参数只会作用于当前的计算过程。" +msgid "" +"也可以在这些立即执行的方法上,使用 ``hints`` 参数。这样,这些参数只会作用" +"于当前的计算过程。" msgstr "" "Additionally, you can use the `hints`` parameter. These parameters are " "only valid for the current calculation." @@ -1699,8 +1757,9 @@ msgstr "Cache intermediate results" #: ../../source/df-basic.rst:972 msgid "" -"DataFrame的计算过程中,一些Collection被多处使用,或者用户需要查看中间过程的执行结果, 这时用户可以使用 ``cache``\\" -" 标记某个collection需要被优先计算。" +"DataFrame的计算过程中,一些Collection被多处使用,或者用户需要查看中间过程" +"的执行结果, 这时用户可以使用 ``cache``\\ 标记某个collection需要被优先" +"计算。" msgstr "" "During the calculation of DataFrame, some Collection objects are used " "multiple times, or you need to check the execution results of an " @@ -1748,11 +1807,11 @@ msgstr "Asynchronous and parallel executions" #: ../../source/df-basic.rst:1000 msgid "" -"DataFrame 支持异步操作,对于立即执行的方法,包括 " -"``execute``、``persist``、``head``、``tail``、``to_pandas`` (其他方法不支持), 传入 " -"``async_`` 参数,即可以将一个操作异步执行,``timeout`` 参数指定超时时间, 异步返回的是 `Future " -"`_ 对象。" +"DataFrame 支持异步操作,对于立即执行的方法,包括 ``execute``、``persist``" +"、``head``、``tail``、``to_pandas`` (其他方法不支持), 传入 ``async_`` " +"参数,即可以将一个操作异步执行,``timeout`` 参数指定超时时间, 异步返回的" +"是 `Future `_ 对象。" msgstr "" "DataFrame supports asynchronous operations. For actions such as " "``execute``, ``persist``, ``head``, ``tail``, and ``to_pandas``, you can " @@ -1781,8 +1840,9 @@ msgstr "" #: ../../source/df-basic.rst:1021 msgid "" -"DataFrame 的并行执行可以使用多线程来并行,单个 expr 的执行可以通过 ``n_parallel`` 参数来指定并发度。 比如,当一个" -" DataFrame 的执行依赖的多个 cache 的 DataFrame 能够并行执行时,该参数就会生效。" +"DataFrame 的并行执行可以使用多线程来并行,单个 expr 的执行可以通过 ``n_" +"parallel`` 参数来指定并发度。 比如,当一个 DataFrame 的执行依赖的多个 " +"cache 的 DataFrame 能够并行执行时,该参数就会生效。" msgstr "" "The parallel execution of DataFrame can be achieved by using multiple " "threads. 
You can use the ``n_parallel`` parameter to specify the degree " @@ -1799,8 +1859,8 @@ msgid "" ">>> expr3 = " "iris.groupby('category').agg(value=iris.petal_length.min()).cache()\n" ">>> expr = expr1.union(expr2).union(expr3)\n" -">>> future = expr.execute(n_parallel=3, async_=True, timeout=2) # " -"并行和异步执行,2秒超时,返回Future对象\n" +">>> future = expr.execute(n_parallel=3, async_=True, timeout=2) # 并行和" +"异步执行,2秒超时,返回Future对象\n" ">>> future.result()\n" " category value\n" "0 Iris-setosa 5.006\n" @@ -1836,7 +1896,9 @@ msgstr "" "8 Iris-virginica 4.500" #: ../../source/df-basic.rst:1044 -msgid "当同时执行多个 expr 时,我们可以用多线程执行,但会面临一个问题, 比如两个 DataFrame 有共同的依赖,这个依赖将会被执行两遍。" +msgid "" +"当同时执行多个 expr 时,我们可以用多线程执行,但会面临一个问题, 比如两个" +" DataFrame 有共同的依赖,这个依赖将会被执行两遍。" msgstr "" "You can use multiple threads to execute multiple expr objects in " "parallel, but you may encounter a problem when two DataFrame objects " @@ -1844,10 +1906,11 @@ msgstr "" #: ../../source/df-basic.rst:1047 msgid "" -"现在我们提供了新的 ``Delay API``, 来将立即执行的操作(包括 " -"``execute``、``persist``、``head``、``tail``、``to_pandas``,其他方法不支持)变成延迟操作, " -"并返回 `Future `_ 对象。 当用户触发delay执行的时候,会去寻找共同依赖,按用户给定的并发度执行,并支持异步执行。" +"现在我们提供了新的 ``Delay API``, 来将立即执行的操作(包括 ``execute``、" +"``persist``、``head``、``tail``、``to_pandas``,其他方法不支持)变成延迟" +"操作, 并返回 `Future `_ 对象。 当用户触发delay执行的时候,会去寻找" +"共同依赖,按用户给定的并发度执行,并支持异步执行。" msgstr "" "A new ``Delay API`` is provided, which can delay the execution of actions" " (including ``execute``, ``persist``, ``head``, ``tail``, and " @@ -1863,8 +1926,8 @@ msgid "" ">>> delay = Delay() # 创建Delay对象\n" ">>>\n" ">>> df = iris[iris.sepal_width < 5].cache() # 有一个共同的依赖\n" -">>> future1 = df.sepal_width.sum().execute(delay=delay) # " -"立即返回future对象,此时并没有执行\n" +">>> future1 = df.sepal_width.sum().execute(delay=delay) # 立即返回future" +"对象,此时并没有执行\n" ">>> future2 = df.sepal_width.mean().execute(delay=delay)\n" ">>> future3 = df.sepal_length.max().execute(delay=delay)\n" ">>> delay.execute(n_parallel=3) # 并发度是3,此时才真正执行。\n" @@ -1895,8 +1958,8 @@ msgstr "" #: ../../source/df-basic.rst:1069 msgid "" -"可以看到上面的例子里,共同依赖的对象会先执行,然后再以并发度为3分别执行future1到future3。 当 ``n_parallel`` " -"为1时,执行时间会达到37s。" +"可以看到上面的例子里,共同依赖的对象会先执行,然后再以并发度为3分别执行" +"future1到future3。 当 ``n_parallel`` 为1时,执行时间会达到37s。" msgstr "" "As you can see in the above example, the shared dependency is executed " "first. Objects future1 to future3 are then executed with the degree of " @@ -1905,8 +1968,8 @@ msgstr "" #: ../../source/df-basic.rst:1072 msgid "" -"``delay.execute`` 也接受 ``async_`` 操作来指定是否异步执行,当异步的时候,也可以指定 ``timeout`` " -"参数来指定超时时间。" +"``delay.execute`` 也接受 ``async_`` 操作来指定是否异步执行,当异步的时候" +",也可以指定 ``timeout`` 参数来指定超时时间。" msgstr "" "You can also pass in the ``async_`` parameter to ``delay.execute`` to " "specify whether asynchronous execution is enabled. 
When enabled, you can " diff --git a/docs/source/locale/en/LC_MESSAGES/df-debug-instruction.po b/docs/source/locale/en/LC_MESSAGES/df-debug-instruction.po index 76eb9dd9..bd95fcd2 100644 --- a/docs/source/locale/en/LC_MESSAGES/df-debug-instruction.po +++ b/docs/source/locale/en/LC_MESSAGES/df-debug-instruction.po @@ -15,7 +15,7 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: Babel 2.5.3\n" +"Generated-By: Babel 2.12.1\n" #: ../../source/df-debug-instruction.rst:5 msgid "调试指南" @@ -26,7 +26,9 @@ msgid "可视化DataFrame" msgstr "Visualized DataFrame" #: ../../source/df-debug-instruction.rst:11 -msgid "由于PyODPS DataFrame本身会对整个操作执行优化,为了能直观地反应整个过程, 我们可以使用可视化的方式显示整个表达式的计算过程。" +msgid "" +"由于PyODPS DataFrame本身会对整个操作执行优化,为了能直观地反应整个过程, " +"我们可以使用可视化的方式显示整个表达式的计算过程。" msgstr "" "Python on MaxCompute (PyODPS) DataFrame can optimize and display the " "entire operation execution. You can use this to visualize the entire " @@ -34,8 +36,8 @@ msgstr "" #: ../../source/df-debug-instruction.rst:14 msgid "" -"值得注意的是,可视化需要依赖 `graphviz 软件 `_ 和 " -"**graphviz** Python 包。" +"值得注意的是,可视化需要依赖 `graphviz 软件 `_ 和 **graphviz** Python 包。" msgstr "" "Note that this visualization depends on `graphviz " "`_ and the **graphviz** Python " @@ -105,9 +107,10 @@ msgstr "Execute local debugging with the pandas computation backend" #: ../../source/df-debug-instruction.rst:62 msgid "" -"对于来自ODPS表的DataFrame,一些操作不会compile到ODPS SQL执行,而是会使用Tunnel下载, " -"这个过程是很快的,且无需等待ODPS SQL任务的调度。 " -"利用这个特性,我们能快速下载小部分ODPS数据到本地,使用pandas计算后端来进行代码编写和调试。" +"对于来自ODPS表的DataFrame,一些操作不会compile到ODPS SQL执行,而是会使用" +"Tunnel下载, 这个过程是很快的,且无需等待ODPS SQL任务的调度。 利用这个" +"特性,我们能快速下载小部分ODPS数据到本地,使用pandas计算后端来进行代码" +"编写和调试。" msgstr "" "The DataFrame application program interfaces (APIs) that are created from" " the MaxCompute table do not compile some operations to MaxCompute SQL " @@ -122,21 +125,27 @@ msgid "这些操作包括:" msgstr "Follow these operations:" #: ../../source/df-debug-instruction.rst:68 -msgid "对非分区表进行选取整个或者有限条数据、或者列筛选的操作(不包括列的各种计算),以及计算其数量" +msgid "" +"对非分区表进行选取整个或者有限条数据、或者列筛选的操作(不包括列的各种" +"计算),以及计算其数量" msgstr "" "Select all or some items of data from a non-partitioned table, or filter " "column data excluding column computation, and then calculate the number " "of specific data items." #: ../../source/df-debug-instruction.rst:69 -msgid "对分区表不选取分区或筛选前几个分区字段,对其进行选取全部或者有限条数据、或者列筛选的操作,以及计算其数量" +msgid "" +"对分区表不选取分区或筛选前几个分区字段,对其进行选取全部或者有限条数据、" +"或者列筛选的操作,以及计算其数量" msgstr "" -"Select all or some items of data from all or the first several partition columns " -"that you have specified in a partitioned table, or filter the column " -"data, and then calculate the number of data items." +"Select all or some items of data from all or the first several partition " +"columns that you have specified in a partitioned table, or filter the " +"column data, and then calculate the number of data items." 
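The debugging workflow described above typically reduces to a pattern like the following sketch; ``DEBUG`` and the row limit are choices of this example, not part of the API::

    DEBUG = True

    if DEBUG:
        # a small slice is downloaded through the tunnel and computed
        # locally on the pandas backend
        df = iris[:100].to_pandas(wrap=True)
    else:
        # the full DataFrame is compiled and executed on ODPS
        df = iris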
#: ../../source/df-debug-instruction.rst:71 -msgid "如我们的iris这个DataFrame的来源ODPS表是非分区表,以下操作会使用tunnel进行下载。" +msgid "" +"如我们的iris这个DataFrame的来源ODPS表是非分区表,以下操作会使用tunnel进行" +"下载。" msgstr "" "If the iris object of DataFrame uses non-partitioned MaxCompute table as " "the source, the following operation uses the Tunnel API to download data:" @@ -148,7 +157,9 @@ msgid "" msgstr "" #: ../../source/df-debug-instruction.rst:78 -msgid "对于分区表,如有个DataFrame来源于分区表(有三个分区字段,ds、hh、mm),以下操作会使用tunnel下载。" +msgid "" +"对于分区表,如有个DataFrame来源于分区表(有三个分区字段,ds、hh、mm)," +"以下操作会使用tunnel下载。" msgstr "" "If DataFrame uses a partitioned table that includes three fields ds, hh, " "and mm, the following operation uses Tunnel commands to download data:" @@ -162,7 +173,9 @@ msgid "" msgstr "" #: ../../source/df-debug-instruction.rst:87 -msgid "因此我们可以使用 ``to_pandas`` 方法来将部分数据下载到本地来进行调试,我们可以写出如下代码:" +msgid "" +"因此我们可以使用 ``to_pandas`` 方法来将部分数据下载到本地来进行调试,我们" +"可以写出如下代码:" msgstr "" "You can use the ``to_pandas`` method to download some data to a local " "directory for debugging. You can write the following code:" @@ -180,7 +193,9 @@ msgid "" msgstr "" #: ../../source/df-debug-instruction.rst:100 -msgid "这样,当我们全部编写完成时,再把 ``DEBUG`` 设置为False就可以在ODPS上执行完整的计算了。" +msgid "" +"这样,当我们全部编写完成时,再把 ``DEBUG`` 设置为False就可以在ODPS上执行" +"完整的计算了。" msgstr "" "At the end of compiling, set ``DEBUG`` to False to execute complete " "computation on MaxCompute." diff --git a/docs/source/locale/en/LC_MESSAGES/df-element.po b/docs/source/locale/en/LC_MESSAGES/df-element.po index 1e72eb8c..f4ea37fe 100644 --- a/docs/source/locale/en/LC_MESSAGES/df-element.po +++ b/docs/source/locale/en/LC_MESSAGES/df-element.po @@ -32,7 +32,9 @@ msgid "" msgstr "" #: ../../source/df-element.rst:16 -msgid "对于一个Sequence来说,对它加上一个常量、或者执行sin函数的这类操作时,是作用于每个元素上的。接下来会详细说明。" +msgid "" +"对于一个Sequence来说,对它加上一个常量、或者执行sin函数的这类操作时,是" +"作用于每个元素上的。接下来会详细说明。" msgstr "" "For a Sequence object, operations such as adding a constant or executing " "the sin function are performed on all elements of the object. For more " @@ -44,8 +46,8 @@ msgstr "NULL-related functions (isnull, notnull, fillna)" #: ../../source/df-element.rst:21 msgid "" -"DataFrame " -"API提供了几个和NULL相关的内置函数,比如isnull来判断是否某字段是NULL,notnull则相反,fillna是将NULL填充为用户指定的值。" +"DataFrame API提供了几个和NULL相关的内置函数,比如isnull来判断是否某字段是" +"NULL,notnull则相反,fillna是将NULL填充为用户指定的值。" msgstr "" "The DataFrame API provides several NULL-related built-in functions. You " "can use isnull or notnull to determine whether or not a field is NULL, " @@ -67,7 +69,9 @@ msgid "逻辑判断(ifelse,switch)" msgstr "Logic functions (ifelse, switch)" #: ../../source/df-element.rst:37 -msgid "``ifelse``\\ 作用于boolean类型的字段,当条件成立时,返回第0个参数,否则返回第1个参数。" +msgid "" +"``ifelse``\\ 作用于boolean类型的字段,当条件成立时,返回第0个参数,否则" +"返回第1个参数。" msgstr "" "The ``ifelse``\\ function acts on boolean fields, and returns the first " "parameter if the condition is true, or the second parameter if the " @@ -353,8 +357,8 @@ msgstr "" #: ../../source/df-element.rst:160 msgid "" -"值得主意的是,DataFrame API不支持连续操作,比如\\ ``3 <= iris.sepallength <= 5``\\ " -",但是提供了between这个函数来进行是否在某个区间的判断。" +"值得主意的是,DataFrame API不支持连续操作,比如\\ ``3 <= iris.sepallength" +" <= 5``\\ ,但是提供了between这个函数来进行是否在某个区间的判断。" msgstr "" "Note that the DataFrame API does not support sequential operations, such " "as \\ ``3 <= iris.sepallength <= 5``\\. 
But you can use the between " @@ -372,7 +376,9 @@ msgid "" msgstr "" #: ../../source/df-element.rst:173 -msgid "默认情况下,between包含两边的区间,如果计算开区间,则需要设inclusive=False。" +msgid "" +"默认情况下,between包含两边的区间,如果计算开区间,则需要设inclusive=" +"False。" msgstr "" "By default, the between function specifies an interval that includes " "endpoints. To specify an open interval, set inclusive to False." @@ -430,7 +436,9 @@ msgid "contains" msgstr "" #: ../../source/df-element.rst:210 -msgid "包含某个字符串,如果 regex 参数为 True,则是包含某个正则表达式,默认为 True" +msgid "" +"包含某个字符串,如果 regex 参数为 True,则是包含某个正则表达式,默认为 " +"True" msgstr "" "Returns whether the given string contains a substring. The substring is a" " regular expression if the regex parameter is set to True. The regex " @@ -465,7 +473,9 @@ msgid "extract" msgstr "" #: ../../source/df-element.rst:214 -msgid "抽取出某个正则表达式,如果 group 不指定,则返回满足整个 pattern 的子串;否则,返回第几个 group" +msgid "" +"抽取出某个正则表达式,如果 group 不指定,则返回满足整个 pattern 的子串;" +"否则,返回第几个 group" msgstr "" "Extracts a regular expression, and if the group has not been specified, " "returns the substrings that satisfy the pattern. If the group has been " @@ -524,7 +534,9 @@ msgid "ljust" msgstr "" #: ../../source/df-element.rst:220 -msgid "若未达到指定的 ``width`` 的长度,则在右侧填充 ``fillchar`` 指定的字符串(默认空格)" +msgid "" +"若未达到指定的 ``width`` 的长度,则在右侧填充 ``fillchar`` 指定的字符串(" +"默认空格)" msgstr "" "Pads the string with the character ``fillchar`` on the right until it " "reaches the specified length ``width``. The space character is used by " @@ -535,7 +547,9 @@ msgid "rjust" msgstr "" #: ../../source/df-element.rst:221 -msgid "若未达到指定的 ``width`` 的长度,则在左侧填充 ``fillchar`` 指定的字符串(默认空格)" +msgid "" +"若未达到指定的 ``width`` 的长度,则在左侧填充 ``fillchar`` 指定的字符串(" +"默认空格)" msgstr "" "Pads the string with the character ``fillchar`` on the left until it " "reaches the specified length ``width``. The space character is used by " @@ -594,7 +608,9 @@ msgid "pad" msgstr "" #: ../../source/df-element.rst:228 -msgid "在指定的位置(left,right 或者 both)用指定填充字符(用 ``fillchar`` 指定,默认空格)来对齐" +msgid "" +"在指定的位置(left,right 或者 both)用指定填充字符(用 ``fillchar`` 指定" +",默认空格)来对齐" msgstr "" "Pads the string with the character ``fillchar`` on the specified position" " which may be left, right or both. The space character is used by " @@ -730,8 +746,8 @@ msgstr "" #: ../../source/df-element.rst:243 msgid "" -"将字符串按分隔符拆分为一个 dict,传入的两个参数分别为项目分隔符和 Key-Value 分隔符(返回 dict" -" 类型)" +"将字符串按分隔符拆分为一个 dict,传入的两个参数分别为项目分隔符和 Key-" +"Value 分隔符(返回 dict 类型)" msgstr "" "Splits the string at the specified separator into a dict. Two parameters," " the project separator and the Key-Value separator are required. Returns " @@ -743,9 +759,9 @@ msgstr "" #: ../../source/df-element.rst:244 msgid "" -"按格式化读取成时间,时间格式和Python标准库相同,详细参考 `Python 时间格式化 " -"`_" +"按格式化读取成时间,时间格式和Python标准库相同,详细参考 `Python 时间" +"格式化 `_" msgstr "" "Converts the string representing a time to a time type according to the " "specified format. The time format is the same as specified in the " @@ -843,8 +859,8 @@ msgstr "" #: ../../source/df-element.rst:281 msgid "" -"格式化时间,时间格式和 Python 标准库相同,详细参考 `Python 时间格式化 " -"`_" msgstr "" "Converts a time to a string type according to the specified format. The " @@ -854,7 +870,9 @@ msgstr "" "behavior>`_." #: ../../source/df-element.rst:284 -msgid "PyODPS 也支持时间的加减操作,比如可以通过以下方法得到前3天的日期。两个日期列相减得到相差的毫秒数。" +msgid "" +"PyODPS 也支持时间的加减操作,比如可以通过以下方法得到前3天的日期。两个" +"日期列相减得到相差的毫秒数。" msgstr "" "PyODPS also supports the addition and subtraction of time. For example, " "you can retrieve the date 3 days before the current date. 
Subtracting one" @@ -899,7 +917,9 @@ msgid "集合类型相关操作" msgstr "Collection type related operations" #: ../../source/df-element.rst:327 -msgid "PyODPS 支持的集合类型有 List 和 Dict。这两个类型都可以使用下标获取集合中的某个项目,另有 len 方法,可获得集合的大小。" +msgid "" +"PyODPS 支持的集合类型有 List 和 Dict。这两个类型都可以使用下标获取集合中" +"的某个项目,另有 len 方法,可获得集合的大小。" msgstr "" "PyODPS sequences supports List and Dict types. You can use subscripts to " "retrieve an item from both types. You can also use ``len`` method to " @@ -907,9 +927,10 @@ msgstr "" #: ../../source/df-element.rst:329 msgid "" -"同时,两种集合均有 explode 方法,用于展开集合中的内容。对于 List,explode 默认返回一列,当传入参数 pos 时, " -"将返回两列,其中一列为值在数组中的编号(类似 Python 的 enumerate 函数)。对于 Dict,explode 会返回两列, 分别表示" -" keys 及 values。explode 中也可以传入列名,作为最后生成的列。" +"同时,两种集合均有 explode 方法,用于展开集合中的内容。对于 List,explode" +" 默认返回一列,当传入参数 pos 时, 将返回两列,其中一列为值在数组中的编号" +"(类似 Python 的 enumerate 函数)。对于 Dict,explode 会返回两列, 分别" +"表示 keys 及 values。explode 中也可以传入列名,作为最后生成的列。" msgstr "" "Additionally, List and Dict types support ``explode`` method, which can " "be used to display the contents of the collection. For List, explode " @@ -1015,7 +1036,9 @@ msgstr "" "4 e2 4" #: ../../source/df-element.rst:381 -msgid "explode 也可以和 :ref:`dflateralview` 结合,以将原有列和 explode 的结果相结合,例子如下:" +msgid "" +"explode 也可以和 :ref:`dflateralview` 结合,以将原有列和 explode 的结果" +"相结合,例子如下:" msgstr "You can also combine explode with the lateral view as follows:" #: ../../source/df-element.rst:383 @@ -1092,7 +1115,9 @@ msgid "其他元素操作(isin,notin,cut)" msgstr "Other operations" #: ../../source/df-element.rst:424 -msgid "``isin``\\ 给出Sequence里的元素是否在某个集合元素里。\\ ``notin``\\ 是相反动作。" +msgid "" +"``isin``\\ 给出Sequence里的元素是否在某个集合元素里。\\ ``notin``\\ 是" +"相反动作。" msgstr "" "``isin``\\ or \\ ``notin``\\ returns whether or not the elements in a " "Sequence object exist in a collection element." @@ -1148,7 +1173,9 @@ msgid "使用自定义函数" msgstr "Use custom functions" #: ../../source/df-element.rst:467 -msgid "DataFrame函数支持对Sequence使用map,它会对它的每个元素调用自定义函数。比如:" +msgid "" +"DataFrame函数支持对Sequence使用map,它会对它的每个元素调用自定义函数。" +"比如:" msgstr "" "DataFrame allows you to call the map method on a Sequence object so as to" " call custom functions on all of its elements." @@ -1165,7 +1192,9 @@ msgid "" msgstr "" #: ../../source/df-element.rst:480 -msgid "目前,受限于 Python UDF,自定义函数无法支持将 list / dict 类型作为输入或输出。" +msgid "" +"目前,受限于 Python UDF,自定义函数无法支持将 list / dict 类型作为输入或" +"输出。" msgstr "" "Custom functions are currently not allowed to use lists or dicts as " "inputs or outputs because of Python UDF limitations." @@ -1188,7 +1217,9 @@ msgid "" msgstr "" #: ../../source/df-element.rst:494 -msgid "如果在函数中包含闭包,需要注意的是,函数外闭包变量值的变化会引起函数内该变量值的变化。例如," +msgid "" +"如果在函数中包含闭包,需要注意的是,函数外闭包变量值的变化会引起函数内该" +"变量值的变化。例如," msgstr "" "If a function contains a closure, note that if the value of a closure " "variable changes outside the function, the value of this variable within " @@ -1203,8 +1234,8 @@ msgstr "" #: ../../source/df-element.rst:502 msgid "" -"结果为 dfs 中每个 SequenceExpr 均为 ``df.sepal_length + " -"9``。为解决此问题,可以将函数作为另一函数的返回值,或者使用 partial,如" +"结果为 dfs 中每个 SequenceExpr 均为 ``df.sepal_length + 9``。为解决此问题" +",可以将函数作为另一函数的返回值,或者使用 partial,如" msgstr "" "Each SequenceExpr object in dfs is ``df.sepal_length + 9`` now. 
To solve " "this problem, you can use the function as the return of another function," @@ -1233,7 +1264,9 @@ msgid "" msgstr "" #: ../../source/df-element.rst:522 -msgid "map也支持使用现有的UDF函数,传入的参数是str类型(函数名)或者 :ref:`Function对象 ` 。" +msgid "" +"map也支持使用现有的UDF函数,传入的参数是str类型(函数名)或者 :ref:`" +"Function对象 ` 。" msgstr "" "The map method also supports existing UDFs. You can pass in str type " "parameters, which represent function names, or :ref:`Function objects " @@ -1241,8 +1274,9 @@ msgstr "" #: ../../source/df-element.rst:524 msgid "" -"map传入Python函数的实现使用了ODPS Python UDF,因此,如果用户所在的Project不支持Python " -"UDF,则map函数无法使用。除此以外,所有 Python UDF 的限制在此都适用。" +"map传入Python函数的实现使用了ODPS Python UDF,因此,如果用户所在的Project" +"不支持Python UDF,则map函数无法使用。除此以外,所有 Python UDF 的限制在此" +"都适用。" msgstr "" "The implementation of ``map`` depends on MaxCompute Python UDF. If your " "project does not support Python UDF, you cannot use map. All Python UDF " @@ -1250,8 +1284,8 @@ msgstr "" #: ../../source/df-element.rst:527 msgid "" -"目前,第三方库(包含C)只能使用\\ ``numpy``\\ ,第三方库使用参考 :ref:`使用第三方Python库 " -"`。" +"目前,第三方库(包含C)只能使用\\ ``numpy``\\ ,第三方库使用参考 :ref:`" +"使用第三方Python库 `。" msgstr "" "The only builtin third-party library in MaxCompute is ``numpy``. If you " "need to use other libraries, you need to upload these libraries yourself." @@ -1260,8 +1294,9 @@ msgstr "" #: ../../source/df-element.rst:531 ../../source/df-element.rst:600 msgid "" -"由于字节码定义的差异,Python 3 下使用新语言特性(例如 ``yield from`` )时,代码在使用 Python 2.7 的 ODPS" -" Worker 上执行时会发生错误。因而建议在 Python 3 下使用 MapReduce API 编写生产作业前,先确认相关代码是否能正常 " +"由于字节码定义的差异,Python 3 下使用新语言特性(例如 ``yield from`` )时" +",代码在使用 Python 2.7 的 ODPS Worker 上执行时会发生错误。因而建议在 " +"Python 3 下使用 MapReduce API 编写生产作业前,先确认相关代码是否能正常 " "执行。" msgstr "" "Due to the differences in bytecode definitions, new features supported by" @@ -1272,9 +1307,10 @@ msgstr "" #: ../../source/df-element.rst:537 msgid "" -"由于 PyODPS DataFrame 默认 Collection / Sequence " -"等对象均为分布式对象,故不支持在自定义函数内部引用这些对象。 请考虑改用 :ref:`Join 等方法 ` 引用多个 " -"DataFrame 的数据,或者引用 Collection 作为资源,如下文所述。" +"由于 PyODPS DataFrame 默认 Collection / Sequence 等对象均为分布式对象,故" +"不支持在自定义函数内部引用这些对象。 请考虑改用 :ref:`Join 等方法 <" +"dfmerge>` 引用多个 DataFrame 的数据,或者引用 Collection 作为资源,如下文" +"所述。" msgstr "" "PyODPS DataFrame recognizes all collections and sequences as distributed " "objects, and does not support referencing these objects inside user-" @@ -1288,8 +1324,8 @@ msgstr "Reference Resources" #: ../../source/df-element.rst:545 msgid "" -"自定义函数也能读取ODPS上的资源(表资源或文件资源),或者引用一个collection作为资源。 " -"此时,自定义函数需要写成函数闭包或callable的类。" +"自定义函数也能读取ODPS上的资源(表资源或文件资源),或者引用一个" +"collection作为资源。 此时,自定义函数需要写成函数闭包或callable的类。" msgstr "" "Custom functions can also read MaxCompute resources, such as table and " "file resources, or reference Collection objects as resources. To do that," @@ -1374,8 +1410,9 @@ msgstr "Use third-party Python libraries" #: ../../source/df-element.rst:591 msgid "" -"现在用户可以把第三方 Wheel 包作为资源上传到 MaxCompute。在全局或者在立即执行的方法时,指定需要使用的包文件, " -"即可以在自定义函数中使用第三方库。值得注意的是,第三方库的依赖库也必须指定,否则依然会有导入错误。" +"现在用户可以把第三方 Wheel 包作为资源上传到 MaxCompute。在全局或者在立即" +"执行的方法时,指定需要使用的包文件, 即可以在自定义函数中使用第三方库。" +"值得注意的是,第三方库的依赖库也必须指定,否则依然会有导入错误。" msgstr "" "You can upload third-party wheel packages as resources to MaxCompute. 
You" " need to specify the package files globally or in methods that execute " @@ -1384,19 +1421,21 @@ msgstr "" #: ../../source/df-element.rst:594 msgid "" -"如果你的 MaxCompute 服务支持在执行 SQL 时使用镜像,可以在 execute / persist / to_pandas 方法指定 " -"``image`` 参数以使用镜像。与此同时,DataFrame 的 execute / persist / to_pandas 方法支持增加 " -"``libraries`` 参数以将资源作为三方包。 PyODPS 提供了 ``pyodps-pack`` 工具,可在安装完 PyODPS " -"后打包三方包及其依赖。如何制作及使用三方包的说明请参考 :ref:`这里 `。" -msgstr "" -"If your MaxCompute service supports specifying images when executing SQL" -" statements, you may specify ``image`` argument with ``execute``, ``persist``" -" or ``to_pandas`` to use these images. Meanwhile ``libraries`` argument " -"can be used with ``execute``, ``persist`` or ``to_pandas`` to specify " -"resources as thirdparty libraries. PyODPS installation provides ``pyodps-" -"pack`` tool for packing third-party libraries. You may take a look at " -":ref:`documents here ` to see how to build and use these " -"third-party libraries." +"如果你的 MaxCompute 服务支持在执行 SQL 时使用镜像,可以在 execute / " +"persist / to_pandas 方法指定 ``image`` 参数以使用镜像。与此同时," +"DataFrame 的 execute / persist / to_pandas 方法支持增加 ``libraries`` " +"参数以将资源作为三方包。 PyODPS 提供了 ``pyodps-pack`` 工具,可在安装完 " +"PyODPS 后打包三方包及其依赖。如何制作及使用三方包的说明请参考 :ref:`这里 " +"`。" +msgstr "" +"If your MaxCompute service supports specifying images when executing SQL " +"statements, you may specify ``image`` argument with ``execute``, " +"``persist`` or ``to_pandas`` to use these images. Meanwhile ``libraries``" +" argument can be used with ``execute``, ``persist`` or ``to_pandas`` to " +"specify resources as thirdparty libraries. PyODPS installation provides " +"``pyodps-pack`` tool for packing third-party libraries. You may take a " +"look at :ref:`documents here ` to see how to build and use " +"these third-party libraries." #: ../../source/df-element.rst:605 msgid "使用计数器" @@ -1425,8 +1464,8 @@ msgstr "Call MaxCompute built-in functions or UDFs" #: ../../source/df-element.rst:625 msgid "" -"要想调用 ODPS 上的内建或者已定义函数,来生成列,我们可以使用 ``func`` 接口,该接口默认函数返回值为 String, 可以用 " -"rtype 参数指定返回值。" +"要想调用 ODPS 上的内建或者已定义函数,来生成列,我们可以使用 ``func`` " +"接口,该接口默认函数返回值为 String, 可以用 rtype 参数指定返回值。" msgstr "" "You can use the ``func`` interface to call MaxCompute built-in functions " "or UDFs to generate columns. This interface returns a string type by " diff --git a/docs/source/locale/en/LC_MESSAGES/df-merge.po b/docs/source/locale/en/LC_MESSAGES/df-merge.po index 80fe57e4..2d6d634f 100644 --- a/docs/source/locale/en/LC_MESSAGES/df-merge.po +++ b/docs/source/locale/en/LC_MESSAGES/df-merge.po @@ -15,7 +15,7 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: Babel 2.9.1\n" +"Generated-By: Babel 2.12.1\n" #: ../../source/df-merge.rst:4 msgid "数据合并" @@ -66,8 +66,8 @@ msgstr "Join operation" #: ../../source/df-merge.rst:52 msgid "" -"DataFrame 也支持对两个 Collection 执行 join 的操作,如果不指定 join 的条件,那么 DataFrame " -"API会寻找名字相同的列,并作为 join 的条件。" +"DataFrame 也支持对两个 Collection 执行 join 的操作,如果不指定 join 的" +"条件,那么 DataFrame API会寻找名字相同的列,并作为 join 的条件。" msgstr "" "DataFrame supports the join operation for two Collection objects. 
If you " "do not specify the join conditions, the DataFrame application program " @@ -112,7 +112,9 @@ msgid "" msgstr "" #: ../../source/df-merge.rst:73 -msgid "在join时,on条件两边的字段名称相同时,只会选择一个,其他类型的join则会被重命名。" +msgid "" +"在join时,on条件两边的字段名称相同时,只会选择一个,其他类型的join则会被" +"重命名。" msgstr "" "During the join operation, if the field name in the on condition for both" " Collection objects is the same, the system selects one field name. In " @@ -138,8 +140,9 @@ msgstr "" #: ../../source/df-merge.rst:84 msgid "" -"可以看到,\\ ``movie_id``\\ 被重命名为movie\\_id\\_x,以及movie\\_id\\_y,这和\\ " -"``suffixes``\\ 参数有关(默认是\\ ``('_x', '_y')``\\ ), 当遇到重名的列时,就会被重命名为指定的后缀。" +"可以看到,\\ ``movie_id``\\ 被重命名为movie\\_id\\_x,以及movie\\_id\\_y" +",这和\\ ``suffixes``\\ 参数有关(默认是\\ ``('_x', '_y')``\\ ), 当遇到" +"重名的列时,就会被重命名为指定的后缀。" msgstr "" "In this code, \\ ``movie_id``\\ is renamed as movie\\_id\\_x and " "movie\\_id\\_y. This renaming rule depends on the \\ ``suffixes``\\ " @@ -218,17 +221,20 @@ msgstr "" #: ../../source/df-merge.rst:129 msgid "" -"除了\\ ``join``\\ 以外,DataFrame还支持\\ ``left_join``\\ ,\\ ``right_join``\\ " -",和\\ ``outer_join``\\ 。在执行上述外连接操作时, 默认会将重名列加上 _x 和 _y 后缀,可通过在 suffixes " -"参数中传入一个二元 tuple 来自定义后缀。" +"除了\\ ``join``\\ 以外,DataFrame还支持\\ ``left_join``\\ ,\\ ``right_" +"join``\\ ,和\\ ``outer_join``\\ 。在执行上述外连接操作时, 默认会将重" +"名列加上 _x 和 _y 后缀,可通过在 suffixes 参数中传入一个二元 tuple 来自" +"定义后缀。" msgstr "" "In addition to \\ ``join``\\, DataFrame supports \\ ``left_join``\\ , \\ " "``right_join``\\ , and \\ ``outer_join``\\ . In these join operations, " -"renamed columns are suffixed with _x and _y by default. You can use a " -"2‑tuple to define the suffix in the suffixes parameter." +"renamed columns are suffixed with _x and _y by default. You can use a 2‑" +"tuple to define the suffix in the suffixes parameter." #: ../../source/df-merge.rst:132 -msgid "如果需要在外连接中避免对谓词中相等的列取重复列,可以指定 merge_columns 选项,该选项会自动选择两列中的非空值作为新列的值:" +msgid "" +"如果需要在外连接中避免对谓词中相等的列取重复列,可以指定 merge_columns " +"选项,该选项会自动选择两列中的非空值作为新列的值:" msgstr "" "During the outer_join operation, to avoid repeated columns of the same " "equal predicate, set the merge_columns option to True. Therefore, the " @@ -240,7 +246,9 @@ msgid ">>> movies.left_join(ratings, on='movie_id', merge_columns=True)" msgstr "" #: ../../source/df-merge.rst:138 -msgid "要使用 **mapjoin**\\ 需要将 mapjoin 设为 True ,执行时会对右表做 mapjoin 操作,例如" +msgid "" +"要使用 **mapjoin**\\ 需要将 mapjoin 设为 True ,执行时会对右表做 mapjoin " +"操作,例如" msgstr "" "To use the **mapjoin** operation, set mapjoin to True. Therefore, the " "system executes the mapjoin operation for the right DataFrame object." @@ -250,7 +258,9 @@ msgid ">>> movies.left_join(ratings, on='movie_id', mapjoin=True)" msgstr "" #: ../../source/df-merge.rst:144 -msgid "要使用 **skewjoin**\\ 需要将 skewjoin 设为 True ,执行时会对右表做 skewjoin 操作,例如" +msgid "" +"要使用 **skewjoin**\\ 需要将 skewjoin 设为 True ,执行时会对右表做 " +"skewjoin 操作,例如" msgstr "" "To use the **skewjoin** operation, set skewjoin to True. Therefore, the " "system executes the skewjoin operation for the right DataFrame object." @@ -260,10 +270,12 @@ msgid ">>> movies.left_join(ratings, on='movie_id', skewjoin=True)" msgstr "" #: ../../source/df-merge.rst:150 -msgid "如果需要指定特定列有偏斜,可以将 skewjoin 设为对应列,如果有多列应指定为 list,例如:" +msgid "" +"如果需要指定特定列有偏斜,可以将 skewjoin 设为对应列,如果有多列应指定为 " +"list,例如:" msgstr "" -"To specify skewness on specific columns, you may set skewjoin with a list of " -"names of these columns." +"To specify skewness on specific columns, you may set skewjoin with a list" +" of names of these columns." 
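A sketch of the join variants covered above, reusing the ``movies`` and ``ratings`` collections from the examples::

    joined = movies.join(ratings, on='movie_id')                      # inner join
    left = movies.left_join(ratings, on='movie_id',
                            suffixes=('_m', '_r'))                    # custom suffixes for name clashes
    merged = movies.left_join(ratings, on='movie_id',
                              merge_columns=True)                     # merge equal predicate columns
    mapped = movies.left_join(ratings, on='movie_id', mapjoin=True)   # mapjoin on the right table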
#: ../../source/df-merge.rst:152 msgid "" @@ -273,11 +285,11 @@ msgstr "" #: ../../source/df-merge.rst:156 msgid "" -"如果已知列上的某些值(或者组合)有偏斜,可以将 skewjoin 设为列上对应值组成的 dict,如有多个值应当指定为这些 dict 组成的 " -"list,例如" +"如果已知列上的某些值(或者组合)有偏斜,可以将 skewjoin 设为列上对应值" +"组成的 dict,如有多个值应当指定为这些 dict 组成的 list,例如" msgstr "" -"When you are aware of the skew values, you may specify combination of these values by passing " -"a list of dict of values to skewjoin." +"When you are aware of the skew values, you may specify combination of " +"these values by passing a list of dict of values to skewjoin." #: ../../source/df-merge.rst:158 msgid "" @@ -289,14 +301,19 @@ msgid "" msgstr "" #: ../../source/df-merge.rst:165 -msgid "MaxCompute 推荐为偏斜列指定具体的值,如不指定会带来额外的统计开销。在指定值时,需要保证列每组数据的列都相同。" +msgid "" +"MaxCompute 推荐为偏斜列指定具体的值,如不指定会带来额外的统计开销。在指定" +"值时,需要保证列每组数据的列都相同。" msgstr "" -"It is recommended by MaxCompute to set specific values for skew columns, or extra computation cost " -"for column statistics might be introduced. When specifying skew values, make sure all columns are " -"included for every pair of values." +"It is recommended by MaxCompute to set specific values for skew columns, " +"or extra computation cost for column statistics might be introduced. When" +" specifying skew values, make sure all columns are included for every " +"pair of values." #: ../../source/df-merge.rst:167 -msgid "用户也能join分别来自ODPS和pandas的Collection,或者join分别来自ODPS和数据库的Collection,此时计算会在ODPS上执行。" +msgid "" +"用户也能join分别来自ODPS和pandas的Collection,或者join分别来自ODPS和" +"数据库的Collection,此时计算会在ODPS上执行。" msgstr "" "You can also join Collection objects respectively from MaxCompute and " "pandas, or join those respectively from MaxCompute and a database. " @@ -307,7 +324,9 @@ msgid "Union操作" msgstr "Union operation" #: ../../source/df-merge.rst:172 -msgid "现在有两张表,字段和类型都一致(可以顺序不同),我们可以使用union或者concat来把它们合并成一张表。" +msgid "" +"现在有两张表,字段和类型都一致(可以顺序不同),我们可以使用union或者" +"concat来把它们合并成一张表。" msgstr "" "For two tables of consistent fields and types, which are automatically " "ordered, you can use the union or concat operation to combine both tables" @@ -327,7 +346,9 @@ msgid "" msgstr "" #: ../../source/df-merge.rst:185 -msgid "用户也能union分别来自ODPS和pandas的Collection,或者union分别来自ODPS和数据库的Collection,此时计算会在ODPS上执行。" +msgid "" +"用户也能union分别来自ODPS和pandas的Collection,或者union分别来自ODPS和" +"数据库的Collection,此时计算会在ODPS上执行。" msgstr "" "You can execute the union operation for Collection objects respectively " "from MaxCompute and pandas, or for those respectively from MaxCompute and" diff --git a/docs/source/locale/en/LC_MESSAGES/df-plot.po b/docs/source/locale/en/LC_MESSAGES/df-plot.po index 1b3871a6..72b6ec25 100644 --- a/docs/source/locale/en/LC_MESSAGES/df-plot.po +++ b/docs/source/locale/en/LC_MESSAGES/df-plot.po @@ -15,14 +15,16 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: Babel 2.5.3\n" +"Generated-By: Babel 2.12.1\n" #: ../../source/df-plot.rst:4 msgid "绘图" msgstr "Plotting" #: ../../source/df-plot.rst:6 -msgid "PyODPS DataFrame提供了绘图的方法。如果要使用绘图,需要 **pandas** 和 **matplotlib** 的安装。" +msgid "" +"PyODPS DataFrame提供了绘图的方法。如果要使用绘图,需要 **pandas** 和 **" +"matplotlib** 的安装。" msgstr "" "PyODPS DataFrame provides plotting. To enable plotting, install the " "**pandas** and **matplotlib** libraries." 
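For the union operation described a little earlier, a minimal sketch; ``mofav`` and ``wfav`` are hypothetical collections that share the same fields and types::

    combined = mofav.union(wfav)    # column order may differ between the two
    stacked = mofav.concat(wfav)    # concat combines the two collections the same way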
@@ -175,8 +177,8 @@ msgstr "" #: ../../source/df-plot.rst:63 msgid "" -"详细参数可以参考Pandas文档:http://pandas.pydata.org/pandas-" -"docs/stable/generated/pandas.DataFrame.plot.html" +"详细参数可以参考Pandas文档:http://pandas.pydata.org/pandas-docs/stable/" +"generated/pandas.DataFrame.plot.html" msgstr "" "For more information, see pandas.DataFrame.plot: http://pandas.pydata.org" "/pandas-docs/stable/generated/pandas.DataFrame.plot.html" diff --git a/docs/source/locale/en/LC_MESSAGES/df-quickstart.po b/docs/source/locale/en/LC_MESSAGES/df-quickstart.po index 37f528da..5db4c7f9 100644 --- a/docs/source/locale/en/LC_MESSAGES/df-quickstart.po +++ b/docs/source/locale/en/LC_MESSAGES/df-quickstart.po @@ -23,10 +23,10 @@ msgstr "Quick start" #: ../../source/df-quickstart.rst:7 msgid "" -"在本例子中,我们拿 `movielens 100K " -"`_ 来做例子。现在我们已经有三张表了,分别是\\" -" ``pyodps_ml_100k_movies``\\ (电影相关的数据),\\ ``pyodps_ml_100k_users``\\ " -"(用户相关的数据),\\ ``pyodps_ml_100k_ratings``\\ (评分有关的数据)。" +"在本例子中,我们拿 `movielens 100K `_ 来做例子。现在我们已经有三张表了,分别是\\ ``pyodps_ml" +"_100k_movies``\\ (电影相关的数据),\\ ``pyodps_ml_100k_users``\\ (用户" +"相关的数据),\\ ``pyodps_ml_100k_ratings``\\ (评分有关的数据)。" msgstr "" "Here, `movielens 100K `_ " "is used as an example. Assume that three tables already exist, which are " @@ -42,8 +42,10 @@ msgstr "Create a MaxCompute object before starting the following steps:" msgid "" ">>> import os\n" ">>> from odps import ODPS\n" -">>> # 保证 ALIBABA_CLOUD_ACCESS_KEY_ID 环境变量设置为用户 Access Key ID,\n" -">>> # ALIBABA_CLOUD_ACCESS_KEY_SECRET 环境变量设置为用户 Access Key Secret\n" +">>> # 保证 ALIBABA_CLOUD_ACCESS_KEY_ID 环境变量设置为用户 Access Key ID," +"\n" +">>> # ALIBABA_CLOUD_ACCESS_KEY_SECRET 环境变量设置为用户 Access Key " +"Secret\n" ">>> # 不建议直接使用 Access Key ID / Access Key Secret 字符串\n" ">>> o = ODPS(\n" "... os.getenv('ALIBABA_CLOUD_ACCESS_KEY_ID'),\n" @@ -54,12 +56,12 @@ msgid "" msgstr "" ">>> import os\n" ">>> from odps import ODPS\n" -"# Make sure environment variable ALIBABA_CLOUD_ACCESS_KEY_ID " -"already set to Access Key ID of user\n" -"# while environment variable ALIBABA_CLOUD_ACCESS_KEY_SECRET " -"set to Access Key Secret of user.\n" -"# Not recommended to hardcode Access Key ID or Access Key Secret" -" in your code.\n" +"# Make sure environment variable ALIBABA_CLOUD_ACCESS_KEY_ID already set " +"to Access Key ID of user\n" +"# while environment variable ALIBABA_CLOUD_ACCESS_KEY_SECRET set to " +"Access Key Secret of user.\n" +"# Not recommended to hardcode Access Key ID or Access Key Secret in your " +"code.\n" ">>> o = ODPS(\n" "... os.getenv('ALIBABA_CLOUD_ACCESS_KEY_ID'),\n" "... os.getenv('ALIBABA_CLOUD_ACCESS_KEY_SECRET'),\n" @@ -152,7 +154,9 @@ msgid "" msgstr "" #: ../../source/df-quickstart.rst:88 -msgid "又或者,排除掉一些字段的同时,得通过计算得到一些新的列,比如我想将sex为M的置为True,否则为False,并取名叫sex\\_bool。" +msgid "" +"又或者,排除掉一些字段的同时,得通过计算得到一些新的列,比如我想将sex为M" +"的置为True,否则为False,并取名叫sex\\_bool。" msgstr "" "When excluding some fields, you may want to obtain new columns through " "computation. For example, add the sex\\_bool attribute and set it to True" @@ -223,8 +227,9 @@ msgstr "" #: ../../source/df-quickstart.rst:134 msgid "" -"DataFrame API提供了value\\_counts这个方法来快速达到同样的目的。注意该方法返回的行数受到 " -"``options.df.odps.sort.limit`` 的限制,详见 :ref:`配置选项 ` 。" +"DataFrame API提供了value\\_counts这个方法来快速达到同样的目的。注意该方法" +"返回的行数受到 ``options.df.odps.sort.limit`` 的限制,详见 :ref:`配置选项" +" ` 。" msgstr "" "DataFrame APIs provide the value\\_counts method to quickly achieve the " "same result. An example is shown below. Note that the number of records " @@ -291,7 +296,9 @@ msgid ".. 
image:: _static/df-age-hist.png" msgstr "" #: ../../source/df-quickstart.rst:178 -msgid "好了,现在我们把这三张表联合起来,只需要使用join就可以了。join完成后我们把它保存成一张新的表。" +msgid "" +"好了,现在我们把这三张表联合起来,只需要使用join就可以了。join完成后我们" +"把它保存成一张新的表。" msgstr "" "Use join to join the three tables and save the joined tables as a new " "table. For example:" @@ -380,8 +387,8 @@ msgstr "" #: ../../source/df-quickstart.rst:230 msgid "" -">>> cut_lens.groupby('年龄分组').agg(cut_lens.rating.count().rename('评分总数'), " -"cut_lens.rating.mean().rename('评分均值'))\n" +">>> cut_lens.groupby('年龄分组').agg(cut_lens.rating.count().rename('评分" +"总数'), cut_lens.rating.mean().rename('评分均值'))\n" " 年龄分组 评分均值 评分总数\n" "0 0-9 3.767442 43\n" "1 10-19 3.486126 8181\n" diff --git a/docs/source/locale/en/LC_MESSAGES/df-sort-distinct-apply.po b/docs/source/locale/en/LC_MESSAGES/df-sort-distinct-apply.po index d15c7f98..dc030dc1 100644 --- a/docs/source/locale/en/LC_MESSAGES/df-sort-distinct-apply.po +++ b/docs/source/locale/en/LC_MESSAGES/df-sort-distinct-apply.po @@ -99,8 +99,8 @@ msgstr "" #: ../../source/df-sort-distinct-apply.rst:65 msgid "" -"多字段排序时,如果是升序降序不同,\\ ``ascending``\\ " -"参数可以传入一个列表,长度必须等同于排序的字段,它们的值都是boolean类型" +"多字段排序时,如果是升序降序不同,\\ ``ascending``\\ 参数可以传入一个列表" +",长度必须等同于排序的字段,它们的值都是boolean类型" msgstr "" "To sort multiple fields in both ascending and descending orders, use the " "\\ ``ascending``\\ parameter with a list of boolean values. The length of" @@ -135,8 +135,9 @@ msgstr "" #: ../../source/df-sort-distinct-apply.rst:92 msgid "" -"由于 ODPS 要求排序必须指定个数,所以在 ODPS 后端执行时, 会通过 ``options.df.odps.sort.limit`` " -"指定排序个数,这个值默认是 10000, 如果要排序尽量多的数据,可以把这个值设到较大的值。不过注意,此时可能会导致 OOM。" +"由于 ODPS 要求排序必须指定个数,所以在 ODPS 后端执行时, 会通过 ``options" +".df.odps.sort.limit`` 指定排序个数,这个值默认是 10000, 如果要排序尽量多" +"的数据,可以把这个值设到较大的值。不过注意,此时可能会导致 OOM。" msgstr "" "MaxCompute requires the number of items to be picked from the top of the " "sorted list. You can use ``options.df.odps.sort.limit`` to specify the " @@ -181,7 +182,9 @@ msgid "" msgstr "" #: ../../source/df-sort-distinct-apply.rst:125 -msgid "在Sequence上,用户可以调用unique,但是记住,调用unique的Sequence不能用在列选择中。" +msgid "" +"在Sequence上,用户可以调用unique,但是记住,调用unique的Sequence不能用在" +"列选择中。" msgstr "" "To deduplicate Sequence objects, use the unique method. However, the " "Sequence object for the unique method cannot be used in column selection." @@ -208,15 +211,17 @@ msgid "采样" msgstr "Sampling" #: ../../source/df-sort-distinct-apply.rst:147 -msgid "要对一个 collection 的数据采样,可以调用 ``sample`` 方法。PyODPS 支持四种采样方式。" +msgid "" +"要对一个 collection 的数据采样,可以调用 ``sample`` 方法。PyODPS 支持四种" +"采样方式。" msgstr "" "To sample data from a Collection object, use the ``sample`` method. " "Python on MaxCompute (PyODPS) supports the following sampling means:" #: ../../source/df-sort-distinct-apply.rst:150 msgid "" -"除了按份数采样外,其余方法如果要在 ODPS DataFrame 上执行,需要 Project 支持 XFlow,否则,这些方法只能在 " -"Pandas DataFrame 后端上执行。" +"除了按份数采样外,其余方法如果要在 ODPS DataFrame 上执行,需要 Project " +"支持 XFlow,否则,这些方法只能在 Pandas DataFrame 后端上执行。" msgstr "" "Except for sampling by parts, other sampling methods require XFlow to " "execute on the MaxCompute DataFrame. 
If they do not support XFlow, these " @@ -238,8 +243,8 @@ msgid "" ">>> iris.sample(parts=10) # 分成10份,默认取第0份\n" ">>> iris.sample(parts=10, i=0) # 手动指定取第0份\n" ">>> iris.sample(parts=10, i=[2, 5]) # 分成10份,取第2和第5份\n" -">>> iris.sample(parts=10, columns=['name', 'sepalwidth']) # " -"根据name和sepalwidth的值做采样" +">>> iris.sample(parts=10, columns=['name', 'sepalwidth']) # 根据name和" +"sepalwidth的值做采样" msgstr "" ">>> iris.sample(parts=10) # split into 10 parts and take the 0th by " "default\n" @@ -255,7 +260,9 @@ msgid "按比例 / 条数采样" msgstr "Sampling by percentage or items" #: ../../source/df-sort-distinct-apply.rst:166 -msgid "在这种采样方式下,用户指定需要采样的数据条数或采样比例。指定 ``replace`` 参数为 True 可启用放回采样。" +msgid "" +"在这种采样方式下,用户指定需要采样的数据条数或采样比例。指定 ``replace`` " +"参数为 True 可启用放回采样。" msgstr "" "You need to specify the number of data items or the percentage of data " "that you want to sample using this method. To enable sampling with " @@ -274,7 +281,9 @@ msgid "按权重列采样" msgstr "Sampling by weight" #: ../../source/df-sort-distinct-apply.rst:175 -msgid "在这种采样方式下,用户指定权重列和数据条数 / 采样比例。指定 ``replace`` 参数为 True 可启用放回采样。" +msgid "" +"在这种采样方式下,用户指定权重列和数据条数 / 采样比例。指定 ``replace`` " +"参数为 True 可启用放回采样。" msgstr "" "You need to specify the weight column and the number of data items or the" " proportion of data that you want sample in this way. To enable sampling " @@ -292,8 +301,8 @@ msgstr "Stratified sampling" #: ../../source/df-sort-distinct-apply.rst:184 msgid "" -"在这种采样方式下,用户指定用于分层的标签列,同时为需要采样的每个标签指定采样比例( ``frac`` 参数)或条数 ( ``n`` " -"参数)。暂不支持放回采样。" +"在这种采样方式下,用户指定用于分层的标签列,同时为需要采样的每个标签指定" +"采样比例( ``frac`` 参数)或条数 ( ``n`` 参数)。暂不支持放回采样。" msgstr "" "In this sampling, you need to specify the label column that you want to " "stratify, and specify the sampling proportion (the ``frac`` parameter) or" @@ -347,8 +356,8 @@ msgstr "" #: ../../source/df-sort-distinct-apply.rst:220 msgid "" -"min_max_scale 还支持使用 feature_range 参数指定输出值的范围,例如,如果我们需要使输出值在 (-1, 1) " -"范围内,可使用" +"min_max_scale 还支持使用 feature_range 参数指定输出值的范围,例如,如果" +"我们需要使输出值在 (-1, 1) 范围内,可使用" msgstr "" "min_max_scale can work with the feature_range parameter to specify the " "output range. The following is an example of how to output values in the " @@ -368,13 +377,14 @@ msgstr "" #: ../../source/df-sort-distinct-apply.rst:234 msgid "" -"如果需要保留原始值,可以使用 preserve 参数。此时,缩放后的数据将会以新增列的形式追加到数据中, " -"列名默认为原列名追加“_scaled”后缀,该后缀可使用 suffix 参数更改。例如," +"如果需要保留原始值,可以使用 preserve 参数。此时,缩放后的数据将会以新增" +"列的形式追加到数据中, 列名默认为原列名追加“_scaled”后缀,该后缀可使用 " +"suffix 参数更改。例如," msgstr "" "To keep original values, use the preserve parameter. Then, scaled data is" " added to a new column that is named as the original column name suffixed" -" with “_scaled”. To change the suffix, use the suffix parameter, as shown" -" in the following code:" +" with “_scaled”. 
To change the suffix, use the suffix parameter, as " +"shown in the following code:" #: ../../source/df-sort-distinct-apply.rst:237 msgid "" @@ -389,7 +399,9 @@ msgid "" msgstr "" #: ../../source/df-sort-distinct-apply.rst:248 -msgid "min_max_scale 也支持使用 group 参数指定一个或多个分组列,在分组列中分别取最值进行缩放。例如," +msgid "" +"min_max_scale 也支持使用 group 参数指定一个或多个分组列,在分组列中分别取" +"最值进行缩放。例如," msgstr "" "min_max_scale can also work with the group parameter to specify one or " "more group columns and to retrieve the minimum and maximum from the " @@ -432,7 +444,9 @@ msgid "" msgstr "" #: ../../source/df-sort-distinct-apply.rst:276 -msgid "std_scale 同样支持 preserve 参数保留原始列以及使用 group 进行分组,具体请参考 min_max_scale,此处不再赘述。" +msgid "" +"std_scale 同样支持 preserve 参数保留原始列以及使用 group 进行分组,具体请" +"参考 min_max_scale,此处不再赘述。" msgstr "" "This method also supports using the preserve parameter to keep the " "original column and the group parameter, in order to group data. For more" @@ -548,7 +562,9 @@ msgid "" msgstr "" #: ../../source/df-sort-distinct-apply.rst:355 -msgid "特别地,DataFrame 提供了向前 / 向后填充的功能。通过指定 method 参数为下列值可以达到目的:" +msgid "" +"特别地,DataFrame 提供了向前 / 向后填充的功能。通过指定 method 参数为下列" +"值可以达到目的:" msgstr "" "DataFrame also provides backward filling and forward filling by using the" " method parameter, as follows:" @@ -605,8 +621,8 @@ msgstr "" #: ../../source/df-sort-distinct-apply.rst:387 msgid "" -"你也可以使用 ffill / bfill 函数来简化代码。ffill 等价于 fillna(method='ffill'), bfill 等价于 " -"fillna(method='bfill')" +"你也可以使用 ffill / bfill 函数来简化代码。ffill 等价于 fillna(method='" +"ffill'), bfill 等价于 fillna(method='bfill')" msgstr "" "You can use the ffill / bfill function to simplify code, where ffill " "equals to fillna(method='ffill') and bfill equals to " @@ -621,13 +637,17 @@ msgid "对一行数据使用自定义函数" msgstr "Using custom functions for one row" #: ../../source/df-sort-distinct-apply.rst:398 -msgid "要对一行数据使用自定义函数,可以使用 apply 方法,axis 参数必须为 1,表示在行上操作。" +msgid "" +"要对一行数据使用自定义函数,可以使用 apply 方法,axis 参数必须为 1,表示" +"在行上操作。" msgstr "" "To use custom functions for one row, you can use the apply method. The " "axis parameter must be 1 to indicate that the operation works on the row." #: ../../source/df-sort-distinct-apply.rst:400 -msgid "apply 的自定义函数接收一个参数,为上一步 Collection 的一行数据,用户可以通过属性、或者偏移取得一个字段的数据。" +msgid "" +"apply 的自定义函数接收一个参数,为上一步 Collection 的一行数据,用户可以" +"通过属性、或者偏移取得一个字段的数据。" msgstr "" "The apply method calls your function. You can specify the type or offset " "in the function to retrieve data in a field from the row in the preceding" @@ -645,8 +665,9 @@ msgstr "" #: ../../source/df-sort-distinct-apply.rst:410 msgid "" -"``reduce``\\ 为 True 时,表示返回结果为Sequence,否则返回结果为Collection。 ``names``\\ 和 " -"``types``\\ 参数分别指定返回的Sequence或Collection的字段名和类型。 如果类型不指定,将会默认为string类型。" +"``reduce``\\ 为 True 时,表示返回结果为Sequence,否则返回结果为Collection" +"。 ``names``\\ 和 ``types``\\ 参数分别指定返回的Sequence或Collection的" +"字段名和类型。 如果类型不指定,将会默认为string类型。" msgstr "" "If ``reduce``\\ is True, the system returns the Sequence object. If not, " "the system returns the Collection object. Use the ``names``\\ and " @@ -655,7 +676,9 @@ msgstr "" "specified." #: ../../source/df-sort-distinct-apply.rst:414 -msgid "在 apply 的自定义函数中,reduce 为 False 时,也可以使用 ``yield``\\ 关键字来返回多行结果。" +msgid "" +"在 apply 的自定义函数中,reduce 为 False 时,也可以使用 ``yield``\\ " +"关键字来返回多行结果。" msgstr "" "If reduce is False in your function, you can use the ``yield``\\ keyword " "to return multiple rows." 
@@ -745,8 +768,8 @@ msgstr "" #: ../../source/df-sort-distinct-apply.rst:477 msgid "" -"使用 apply 对行操作,且 ``reduce``\\ 为 False 时,可以使用 :ref:`dflateralview` " -"与已有的行结合,用于后续聚合等操作。" +"使用 apply 对行操作,且 ``reduce``\\ 为 False 时,可以使用 :ref:`" +"dflateralview` 与已有的行结合,用于后续聚合等操作。" msgstr "" "When you use the apply method in a row and ``reduce``\\ is False, combine" " this row with an existing row by using :ref:`dflateralview` to prepare " @@ -771,7 +794,9 @@ msgid "对所有列调用自定义聚合" msgstr "Using custom aggregations for all columns" #: ../../source/df-sort-distinct-apply.rst:493 -msgid "调用apply方法,当我们不指定axis,或者axis为0的时候,我们可以通过传入一个自定义聚合类来对所有sequence进行聚合操作。" +msgid "" +"调用apply方法,当我们不指定axis,或者axis为0的时候,我们可以通过传入一个" +"自定义聚合类来对所有sequence进行聚合操作。" msgstr "" "When you use the apply method and axis is not specified or is 0, call a " "custom aggregation to aggregate all Sequence objects." @@ -807,16 +832,19 @@ msgid "" msgstr "" #: ../../source/df-sort-distinct-apply.rst:523 -msgid "目前,受限于 Python UDF,自定义函数无法支持将 list / dict 类型作为初始输入或最终输出结果。" +msgid "" +"目前,受限于 Python UDF,自定义函数无法支持将 list / dict 类型作为初始" +"输入或最终输出结果。" msgstr "" "Limited by Python UDFs, custom aggregations cannot specify the input or " "the output result type as the list or dict type." #: ../../source/df-sort-distinct-apply.rst:527 msgid "" -"由于 PyODPS DataFrame 默认 Collection / Sequence " -"等对象均为分布式对象,故不支持在自定义函数内部引用这些对象。 请考虑改用 :ref:`Join 等方法 ` 引用多个 " -"DataFrame 的数据,或者引用 Collection 作为资源,如下文所述。" +"由于 PyODPS DataFrame 默认 Collection / Sequence 等对象均为分布式对象,故" +"不支持在自定义函数内部引用这些对象。 请考虑改用 :ref:`Join 等方法 <" +"dfmerge>` 引用多个 DataFrame 的数据,或者引用 Collection 作为资源,如下文" +"所述。" msgstr "" "PyODPS DataFrame recognizes all collections and sequences as distributed " "objects, and does not support referencing these objects inside user-" @@ -831,16 +859,16 @@ msgstr "Referring to resources" #: ../../source/df-sort-distinct-apply.rst:533 msgid "" -"类似于对 :ref:`map ` " -"方法的resources参数,每个resource可以是ODPS上的资源(表资源或文件资源),或者引用一个collection作为资源。" +"类似于对 :ref:`map ` 方法的resources参数,每个resource可以是ODPS上的" +"资源(表资源或文件资源),或者引用一个collection作为资源。" msgstr "" "Similar to the resources parameter for :ref:`map ` , resources can " "be tables, files, or Collection objects that you refer to in MaxCompute." #: ../../source/df-sort-distinct-apply.rst:535 msgid "" -"对于axis为1,也就是在行上操作,我们需要写一个函数闭包或者callable的类。 而对于列上的聚合操作,我们只需在 " -"\\_\\_init\\_\\_ 函数里读取资源即可。" +"对于axis为1,也就是在行上操作,我们需要写一个函数闭包或者callable的类。 " +"而对于列上的聚合操作,我们只需在 \\_\\_init\\_\\_ 函数里读取资源即可。" msgstr "" "If axis is 1 in a row operation, write a function closure or callable " "class. For column aggregation, you can read resources by using the " @@ -873,7 +901,9 @@ msgid "" msgstr "" #: ../../source/df-sort-distinct-apply.rst:563 -msgid "可以看到这里的stop_words是存放于本地,但在真正执行时会被上传到ODPS作为资源引用。" +msgid "" +"可以看到这里的stop_words是存放于本地,但在真正执行时会被上传到ODPS作为" +"资源引用。" msgstr "" "In this example, ``stop_words`` is a local variable which is referenced " "as a resource in MaxCompute during execution." @@ -886,8 +916,9 @@ msgstr "Using a third-party Python library" #: ../../source/df-sort-distinct-apply.rst:568 #: ../../source/df-sort-distinct-apply.rst:688 msgid "" -"现在用户可以把第三方 Wheel 包作为资源上传到 MaxCompute。在全局或者在立即执行的方法时,指定需要使用的包文件, " -"即可以在自定义函数中使用第三方库。值得注意的是,第三方库的依赖库也必须指定,否则依然会有导入错误。" +"现在用户可以把第三方 Wheel 包作为资源上传到 MaxCompute。在全局或者在立即" +"执行的方法时,指定需要使用的包文件, 即可以在自定义函数中使用第三方库。" +"值得注意的是,第三方库的依赖库也必须指定,否则依然会有导入错误。" msgstr "" "You can upload third-party wheel packages as resources to MaxCompute. 
You" " need to specify the package files globally or in methods that execute " @@ -897,25 +928,28 @@ msgstr "" #: ../../source/df-sort-distinct-apply.rst:571 #: ../../source/df-sort-distinct-apply.rst:691 msgid "" -"如果你的 MaxCompute 服务支持在执行 SQL 时使用镜像,可以在 execute / persist / to_pandas 方法指定 " -"``image`` 参数以使用镜像。与此同时,DataFrame 的 execute / persist / to_pandas 方法支持增加 " -"``libraries`` 参数以将资源作为三方包。 PyODPS 提供了 ``pyodps-pack`` 工具,可在安装完 PyODPS " -"后打包三方包及其依赖。如何制作及使用三方包的说明请参考 :ref:`这里 `。" -msgstr "" -"If your MaxCompute service supports specifying images when executing SQL" -" statements, you may specify ``image`` argument with ``execute``, ``persist``" -" or ``to_pandas`` to use these images. Meanwhile ``libraries`` argument " -"can be used with ``execute``, ``persist`` or ``to_pandas`` to specify " -"resources as thirdparty libraries. PyODPS installation provides ``pyodps-" -"pack`` tool for packing third-party libraries. You may take a look at " -":ref:`documents here ` to see how to build and use these " -"third-party libraries." +"如果你的 MaxCompute 服务支持在执行 SQL 时使用镜像,可以在 execute / " +"persist / to_pandas 方法指定 ``image`` 参数以使用镜像。与此同时," +"DataFrame 的 execute / persist / to_pandas 方法支持增加 ``libraries`` " +"参数以将资源作为三方包。 PyODPS 提供了 ``pyodps-pack`` 工具,可在安装完 " +"PyODPS 后打包三方包及其依赖。如何制作及使用三方包的说明请参考 :ref:`这里 " +"`。" +msgstr "" +"If your MaxCompute service supports specifying images when executing SQL " +"statements, you may specify ``image`` argument with ``execute``, " +"``persist`` or ``to_pandas`` to use these images. Meanwhile ``libraries``" +" argument can be used with ``execute``, ``persist`` or ``to_pandas`` to " +"specify resources as thirdparty libraries. PyODPS installation provides " +"``pyodps-pack`` tool for packing third-party libraries. You may take a " +"look at :ref:`documents here ` to see how to build and use " +"these third-party libraries." #: ../../source/df-sort-distinct-apply.rst:577 #: ../../source/df-sort-distinct-apply.rst:698 msgid "" -"由于字节码定义的差异,Python 3 下使用新语言特性(例如 ``yield from`` )时,代码在使用 Python 2.7 的 ODPS" -" Worker 上执行时会发生错误。因而建议在 Python 3 下使用 MapReduce API 编写生产作业前,先确认相关代码是否能正常 " +"由于字节码定义的差异,Python 3 下使用新语言特性(例如 ``yield from`` )时" +",代码在使用 Python 2.7 的 ODPS Worker 上执行时会发生错误。因而建议在 " +"Python 3 下使用 MapReduce API 编写生产作业前,先确认相关代码是否能正常 " "执行。" msgstr "" "Due to the difference in bytecode definition, when you write code with " @@ -931,9 +965,9 @@ msgstr "" #: ../../source/df-sort-distinct-apply.rst:585 msgid "" -"PyODPS DataFrame也支持MapReduce " -"API,用户可以分别编写map和reduce函数(map_reduce可以只有mapper或者reducer过程)。 " -"我们来看个简单的wordcount的例子。" +"PyODPS DataFrame也支持MapReduce API,用户可以分别编写map和reduce函数(map" +"_reduce可以只有mapper或者reducer过程)。 我们来看个简单的wordcount的例子" +"。" msgstr "" "PyODPS DataFrame supports the MapReduce API. You can write the map and " "reduce functions respectively, because map_reduce may include the mapper " @@ -946,7 +980,8 @@ msgid "" ">>> yield word.lower(), 1\n" ">>>\n" ">>> def reducer(keys):\n" -">>> # 这里使用 list 而不是 cnt = 0,否则 h 内的 cnt 会被认为是局部变量,其中的赋值无法输出\n" +">>> # 这里使用 list 而不是 cnt = 0,否则 h 内的 cnt 会被认为是" +"局部变量,其中的赋值无法输出\n" ">>> cnt = [0]\n" ">>> def h(row, done): # done表示这个key已经迭代结束\n" ">>> cnt[0] += row[1]\n" @@ -1009,8 +1044,9 @@ msgstr "" #: ../../source/df-sort-distinct-apply.rst:621 msgid "" -"其中对于reducer来说,会稍微有些不同。它需要接收聚合的keys初始化,并能继续处理按这些keys聚合的每行数据。 " -"第2个参数表示这些keys相关的所有行是不是都迭代完成。" +"其中对于reducer来说,会稍微有些不同。它需要接收聚合的keys初始化,并能继续" +"处理按这些keys聚合的每行数据。 第2个参数表示这些keys相关的所有行是不是都" +"迭代完成。" msgstr "" "The reducer process is different. 
The reducer receives aggregated initial" " values of keys, and then processes each row aggregated by keys. The " @@ -1057,7 +1093,8 @@ msgid "" ">>>\n" ">>> @output(['word', 'cnt'], ['string', 'int'])\n" ">>> def reducer(keys):\n" -">>> # 这里使用 list 而不是 cnt = 0,否则 h 内的 cnt 会被认为是局部变量,其中的赋值无法输出\n" +">>> # 这里使用 list 而不是 cnt = 0,否则 h 内的 cnt 会被认为是" +"局部变量,其中的赋值无法输出\n" ">>> cnt = [0]\n" ">>> def h(row, done): # done表示这个key已经迭代结束\n" ">>> cnt[0] += row.cnt\n" @@ -1109,9 +1146,10 @@ msgstr "" #: ../../source/df-sort-distinct-apply.rst:669 msgid "" -"有时候我们在迭代的时候需要按某些列排序,则可以使用 ``sort``\\ 参数,来指定按哪些列排序,升序降序则通过 ``ascending``\\" -" 参数指定。 ``ascending`` 参数可以是一个bool值,表示所有的 ``sort``\\ 字段是相同升序或降序, " -"也可以是一个列表,长度必须和 ``sort``\\ 字段长度相同。" +"有时候我们在迭代的时候需要按某些列排序,则可以使用 ``sort``\\ 参数,来" +"指定按哪些列排序,升序降序则通过 ``ascending``\\ 参数指定。 ``ascending``" +" 参数可以是一个bool值,表示所有的 ``sort``\\ 字段是相同升序或降序, 也" +"可以是一个列表,长度必须和 ``sort``\\ 字段长度相同。" msgstr "" "Sometimes, when you need to sort the data by column during iteration, you" " can use the ``sort``\\ parameter to specify columns for sorting, and set" @@ -1126,9 +1164,9 @@ msgstr "Specifying the combiner" #: ../../source/df-sort-distinct-apply.rst:677 msgid "" -"combiner表示在map_reduce " -"API里表示在mapper端,就先对数据进行聚合操作,它的用法和reducer是完全一致的,但不能引用资源。 " -"并且,combiner的输出的字段名和字段类型必须和mapper完全一致。" +"combiner表示在map_reduce API里表示在mapper端,就先对数据进行聚合操作,它" +"的用法和reducer是完全一致的,但不能引用资源。 并且,combiner的输出的" +"字段名和字段类型必须和mapper完全一致。" msgstr "" "If the combiner has been expressed in the map_reduce API, the mapper " "aggregates data first. The reducer has the same usage but cannot " @@ -1136,7 +1174,9 @@ msgstr "" "must be consistent with the mapper." #: ../../source/df-sort-distinct-apply.rst:680 -msgid "上面的例子,我们就可以使用reducer作为combiner来先在mapper端对数据做初步的聚合,减少shuffle出去的数据量。" +msgid "" +"上面的例子,我们就可以使用reducer作为combiner来先在mapper端对数据做初步的" +"聚合,减少shuffle出去的数据量。" msgstr "" "In the preceding example, you can use the reducer as the combiner to " "aggregate data in the mapper to reduce shuffled data." @@ -1147,9 +1187,9 @@ msgstr "" #: ../../source/df-sort-distinct-apply.rst:704 msgid "" -"由于 PyODPS DataFrame 默认 Collection / Sequence " -"等对象均为分布式对象,故不支持在自定义函数内部引用这些对象。 请考虑改用 :ref:`Join 等方法 ` 引用多个 " -"DataFrame 的数据,或者引用 Collection 作为资源。" +"由于 PyODPS DataFrame 默认 Collection / Sequence 等对象均为分布式对象,故" +"不支持在自定义函数内部引用这些对象。 请考虑改用 :ref:`Join 等方法 <" +"dfmerge>` 引用多个 DataFrame 的数据,或者引用 Collection 作为资源。" msgstr "" "PyODPS DataFrame recognizes all collections and sequences as distributed " "objects, and does not support referencing these objects inside user-" @@ -1164,7 +1204,9 @@ msgstr "" "reducer in MapReduce API." #: ../../source/df-sort-distinct-apply.rst:712 -msgid "如下面的例子,我们对mapper里的单词做停词过滤,在reducer里对白名单的单词数量加5。" +msgid "" +"如下面的例子,我们对mapper里的单词做停词过滤,在reducer里对白名单的单词" +"数量加5。" msgstr "" "In the following example, you filter stop words in the mapper, and the " "number of whitelisted words in the reducer is plus 5." @@ -1216,7 +1258,9 @@ msgid "重排数据" msgstr "Reshuffling data" #: ../../source/df-sort-distinct-apply.rst:757 -msgid "有时候我们的数据在集群上分布可能是不均匀的,我们需要对数据重排。调用 ``reshuffle`` 接口即可。" +msgid "" +"有时候我们的数据在集群上分布可能是不均匀的,我们需要对数据重排。调用 ``" +"reshuffle`` 接口即可。" msgstr "" "If data is unevenly distributed in a cluster, you need to reshuffle the " "data by using the ``reshuffle`` API." 
@@ -1226,7 +1270,9 @@ msgid ">>> df1 = df.reshuffle()" msgstr "" #: ../../source/df-sort-distinct-apply.rst:765 -msgid "默认会按随机数做哈希来分布。也可以指定按那些列做分布,且可以指定重排后的排序顺序。" +msgid "" +"默认会按随机数做哈希来分布。也可以指定按那些列做分布,且可以指定重排后的" +"排序顺序。" msgstr "" "By default, data is hashed as random numbers. You can also distribute the" " data by a specified column, and sort the reshuffled data in a specified " @@ -1248,8 +1294,9 @@ msgstr "" #: ../../source/df-sort-distinct-apply.rst:779 msgid "" -"给定某个collection,和它的某个列计算的sequence1,我们能对另外一个sequence2进行布隆过滤,sequence1不在sequence2中的一定会过滤," -" 但可能不能完全过滤掉不存在于sequence2中的数据,这也是一种近似的方法。" +"给定某个collection,和它的某个列计算的sequence1,我们能对另外一个" +"sequence2进行布隆过滤,sequence1不在sequence2中的一定会过滤, 但可能不能" +"完全过滤掉不存在于sequence2中的数据,这也是一种近似的方法。" msgstr "" "When you have specified a Collection object and its sequence1 for column " "calculation, you can apply the Bloom filter to sequence2. Therefore, the " @@ -1265,9 +1312,10 @@ msgstr "" #: ../../source/df-sort-distinct-apply.rst:784 msgid "" -"这在大规模join的时候,一边数据量远大过另一边数据,而大部分并不会join上的场景很有用。 " -"比如,我们在join用户的浏览数据和交易数据时,用户的浏览大部分不会带来交易,我们可以利用交易数据先对浏览数据进行布隆过滤, " -"然后再join能很好提升性能。" +"这在大规模join的时候,一边数据量远大过另一边数据,而大部分并不会join上的" +"场景很有用。 比如,我们在join用户的浏览数据和交易数据时,用户的浏览大部分" +"不会带来交易,我们可以利用交易数据先对浏览数据进行布隆过滤, 然后再join能" +"很好提升性能。" msgstr "" "The Bloom filter is particularly suitable for the large-scale join " "operation used between large volumes of data and small amounts of data. " @@ -1291,7 +1339,8 @@ msgid "" ">>> df2\n" " a\n" "0 name1\n" -">>> df1.bloom_filter('a', df2.a) # 这里第0个参数可以是个计算表达式如: df1.a + '1'\n" +">>> df1.bloom_filter('a', df2.a) # 这里第0个参数可以是个计算表达式如: df1" +".a + '1'\n" " a b\n" "0 name1 1\n" "1 name1 4" @@ -1315,7 +1364,9 @@ msgstr "" "1 name1 4" #: ../../source/df-sort-distinct-apply.rst:806 -msgid "这里由于数据量很小,df1中的a为name2和name3的行都被正确过滤掉了,当数据量很大的时候,可能会有一定的数据不能被过滤。" +msgid "" +"这里由于数据量很小,df1中的a为name2和name3的行都被正确过滤掉了,当数据量" +"很大的时候,可能会有一定的数据不能被过滤。" msgstr "" "The preceding code processes a small volume of data. Therefore, the rows " "that contain name2 and name3 in column a of df1 are filtered out. " @@ -1323,7 +1374,9 @@ msgstr "" " out some data that meets the specified condition." #: ../../source/df-sort-distinct-apply.rst:808 -msgid "如之前提的join场景中,少量不能过滤并不能并不会影响正确性,但能较大提升join的性能。" +msgid "" +"如之前提的join场景中,少量不能过滤并不能并不会影响正确性,但能较大提升" +"join的性能。" msgstr "" "As shown in the preceding join operation, some data that cannot be " "filtered out does not affect the program correctness, but can enhance the" @@ -1331,14 +1384,16 @@ msgstr "" #: ../../source/df-sort-distinct-apply.rst:810 msgid "" -"我们可以传入 ``capacity`` 和 ``error_rate`` 来设置数据的量以及错误率,默认值是 ``3000`` 和 " -"``0.01``。" +"我们可以传入 ``capacity`` 和 ``error_rate`` 来设置数据的量以及错误率," +"默认值是 ``3000`` 和 ``0.01``。" msgstr "" "You can use ``capacity`` and ``error_rate`` to specify the data capacity " "and error rate, ``3000`` and ``0.01`` by default." #: ../../source/df-sort-distinct-apply.rst:813 -msgid "要注意,调大 ``capacity`` 或者减小 ``error_rate`` 会增加内存的使用,所以应当根据实际情况选择一个合理的值。" +msgid "" +"要注意,调大 ``capacity`` 或者减小 ``error_rate`` 会增加内存的使用,所以" +"应当根据实际情况选择一个合理的值。" msgstr "" "Tuning the ``capacity`` value up or the ``error_rate`` value down " "increases the usage of memory. 
Therefore, you need to select a proper " @@ -1370,7 +1425,9 @@ msgid "" msgstr "" #: ../../source/df-sort-distinct-apply.rst:839 -msgid "最简单的透视表必须提供一个 ``rows`` 参数,表示按一个或者多个字段做取平均值的操作。" +msgid "" +"最简单的透视表必须提供一个 ``rows`` 参数,表示按一个或者多个字段做取" +"平均值的操作。" msgstr "" "The simplest pivot_table must provide the ``rows`` parameter to retrieve " "the mean from one or more fields." @@ -1434,7 +1491,9 @@ msgid "" msgstr "" #: ../../source/df-sort-distinct-apply.rst:884 -msgid "我们也可以把原始数据的某一列的值,作为新的collection的列。 **这也是透视表最强大的地方。**" +msgid "" +"我们也可以把原始数据的某一列的值,作为新的collection的列。 **这也是透视表" +"最强大的地方。**" msgstr "" "You can also use the values of a column in original data as a new " "Collection column. **This is the most powerful benefit of pivot_table.**" @@ -1469,7 +1528,9 @@ msgid "Key-Value 字符串转换" msgstr "Key-value string transformation" #: ../../source/df-sort-distinct-apply.rst:910 -msgid "DataFrame 提供了将 Key-Value 对展开为列,以及将普通列转换为 Key-Value 列的功能。" +msgid "" +"DataFrame 提供了将 Key-Value 对展开为列,以及将普通列转换为 Key-Value 列" +"的功能。" msgstr "" "DataFrame can extract key-value pairs into a column, and transform a " "common column to a key-value column." @@ -1504,9 +1565,11 @@ msgstr "" #: ../../source/df-sort-distinct-apply.rst:934 msgid "" -"其中,需要展开的字段名由 columns 指定,Key 和 Value 之间的分隔符,以及 Key-Value 对之间的分隔符分别由 " -"kv_delim 和 item_delim 这两个参数指定,默认分别为半角冒号和半角逗号。输出的字段名为原字段名和 Key " -"值的组合,通过“_”相连。缺失值默认为 None,可通过 ``fill_value`` 选择需要填充的值。例如,相同的 df," +"其中,需要展开的字段名由 columns 指定,Key 和 Value 之间的分隔符,以及 " +"Key-Value 对之间的分隔符分别由 kv_delim 和 item_delim 这两个参数指定," +"默认分别为半角冒号和半角逗号。输出的字段名为原字段名和 Key 值的组合,通过" +"“_”相连。缺失值默认为 None,可通过 ``fill_value`` 选择需要填充的值。" +"例如,相同的 df," msgstr "" "In this code, use columns to specify the field name that you want to " "extract. The separators for Key, Value, and key-value pairs are specified" @@ -1528,7 +1591,9 @@ msgid "" msgstr "" #: ../../source/df-sort-distinct-apply.rst:947 -msgid "extract_kv 默认输出类型为 ``float``\\ 。如果需要输出其他类型,可以指定 ``dtype`` 参数,例如" +msgid "" +"extract_kv 默认输出类型为 ``float``\\ 。如果需要输出其他类型,可以指定 ``" +"dtype`` 参数,例如" msgstr "" "The default output type of ``extract_kv`` is ``float``\\ . If you need to" " output with other data types, please specify ``dtype`` argument. For " diff --git a/docs/source/locale/en/LC_MESSAGES/df-window.po b/docs/source/locale/en/LC_MESSAGES/df-window.po index 24a405f3..5e184b83 100644 --- a/docs/source/locale/en/LC_MESSAGES/df-window.po +++ b/docs/source/locale/en/LC_MESSAGES/df-window.po @@ -15,7 +15,7 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: Babel 2.5.3\n" +"Generated-By: Babel 2.12.1\n" #: ../../source/df-window.rst:6 msgid "from odps.df import DataFrame" @@ -202,11 +202,13 @@ msgid "qcut" msgstr "" #: ../../source/df-window.rst:70 -msgid "将分组数据按顺序切分成n片,并返回当前切片值,如果切片不均匀,默认增加第一个切片的分布" +msgid "" +"将分组数据按顺序切分成n片,并返回当前切片值,如果切片不均匀,默认增加" +"第一个切片的分布" msgstr "" -"Cuts data in the group into n slices in order and returns the number of the cut" -" containing the current data. If data are not distributed evenly in cuts, extra" -" data will be put in the first cut." +"Cuts data in the group into n slices in order and returns the number of " +"the cut containing the current data. If data are not distributed evenly " +"in cuts, extra data will be put in the first cut." 
#: ../../source/df-window.rst:71 msgid "nth_value" @@ -222,7 +224,9 @@ msgstr "" #: ../../source/df-window.rst:72 msgid "计算分组中值小于等于当前值的行数占分组总行数的比例" -msgstr "Calculates the ratio of lines whose line numbers are less than the current line" +msgstr "" +"Calculates the ratio of lines whose line numbers are less than the " +"current line" #: ../../source/df-window.rst:75 msgid "其中,rank、dense_rank、percent_rank 和 row_number 支持下列参数:" @@ -281,8 +285,8 @@ msgstr "" #: ../../source/df-window.rst:93 msgid "" -"而 cumsum、cummax、cummin、cummean、cummedian、cumcount 和 cumstd 除 rank " -"的上述参数外,还支持下列参数:" +"而 cumsum、cummax、cummin、cummean、cummedian、cumcount 和 cumstd 除 rank" +" 的上述参数外,还支持下列参数:" msgstr "" "In addition to the rank parameter, the cumsum, cummax, cummin, cummean, " "cummedian, cumcount, and cumstd window functions support the following " diff --git a/docs/source/locale/en/LC_MESSAGES/df.po b/docs/source/locale/en/LC_MESSAGES/df.po index 26ebe780..38228885 100644 --- a/docs/source/locale/en/LC_MESSAGES/df.po +++ b/docs/source/locale/en/LC_MESSAGES/df.po @@ -15,7 +15,7 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: Babel 2.9.1\n" +"Generated-By: Babel 2.12.1\n" #: ../../source/df.rst:5 msgid "DataFrame" @@ -23,8 +23,8 @@ msgstr "DataFrame" #: ../../source/df.rst:8 msgid "" -"PyODPS 提供了 DataFrame API,它提供了类似 pandas 的接口,但是能充分利用 ODPS 的计算能力; " -"同时能在本地使用同样的接口,用 pandas 进行计算。" +"PyODPS 提供了 DataFrame API,它提供了类似 pandas 的接口,但是能充分利用 " +"ODPS 的计算能力; 同时能在本地使用同样的接口,用 pandas 进行计算。" msgstr "" "PyODPS provides a pandas-like interface, PyODPS DataFrame, which operates" " on MaxCompute tables and can make full use of MaxCompute's computing " @@ -33,12 +33,14 @@ msgstr "" #: ../../source/df.rst:13 msgid "" -"PyODPS DataFrame 尽管看起来和 pandas 形似,但并不是 pandas。pandas 的功能,例如完整的 Series " -"支持、Index 支持、按行读取数据、多 DataFrame 按 iloc 横向合并等,PyODPS DataFrame " -"并不支持。因而使用前请参考文档确定你的写法是否被支持。" +"PyODPS DataFrame 尽管看起来和 pandas 形似,但并不是 pandas。pandas 的功能" +",例如完整的 Series 支持、Index 支持、按行读取数据、多 DataFrame 按 iloc " +"横向合并等,PyODPS DataFrame 并不支持。因而使用前请参考文档确定你的写法" +"是否被支持。" msgstr "" -"Though PyODPS DataFrame looks like pandas DataFrame, it is not pandas though. " -"Features of pandas, such as full supports for Series and Index, reading data by rows, " -"concatenation of multiple DataFrame by columns are not supported in PyODPS DataFrame. " -"Therefore please check this document before code to make sure methods are supported." +"Though PyODPS DataFrame looks like pandas DataFrame, it is not pandas " +"though. Features of pandas, such as full supports for Series and Index, " +"reading data by rows, concatenation of multiple DataFrame by columns are " +"not supported in PyODPS DataFrame. Therefore please check this document " +"before code to make sure methods are supported." diff --git a/docs/source/locale/en/LC_MESSAGES/faq-ext.po b/docs/source/locale/en/LC_MESSAGES/faq-ext.po index 118c3e91..27dfc8cd 100644 --- a/docs/source/locale/en/LC_MESSAGES/faq-ext.po +++ b/docs/source/locale/en/LC_MESSAGES/faq-ext.po @@ -22,7 +22,9 @@ msgid "安装失败 / 出现问题" msgstr "Installation failure/error" #: ../../source/faq-ext.rst:3 -msgid "请参考 `PyODPS 安装常见问题解决 `_ 。" +msgid "" +"请参考 `PyODPS 安装常见问题解决 `_" +" 。" msgstr "" "For more information, see `PyODPS installation FAQ (Chinese version only)" " `_ ." 
@@ -33,9 +35,9 @@ msgstr "Project not found error" #: ../../source/faq-ext.rst:7 msgid "" -"Endpoint配置不对,详细配置参考 `MaxCompute 开通 Region 和服务连接对照表 " -"`_ 。 此外还需要注意 ODPS 入口对象参数位置是否填写正确。" +"Endpoint配置不对,详细配置参考 `MaxCompute 开通 Region 和服务连接对照表 <" +"https://help.aliyun.com/document_detail/34951.html#h2-maxcompute-region-3" +">`_ 。 此外还需要注意 ODPS 入口对象参数位置是否填写正确。" msgstr "" "This error is caused by an error in the configuration of Endpoint. For " "more information, see `MaxCompute activation and service connections by " @@ -48,7 +50,9 @@ msgid "如何手动指定 Tunnel Endpoint" msgstr "" #: ../../source/faq-ext.rst:14 -msgid "可以使用下面的方法创建带有 Tunnel Endpoint 的 ODPS 入口(参数值请自行替换,不包含星号):" +msgid "" +"可以使用下面的方法创建带有 Tunnel Endpoint 的 ODPS 入口(参数值请自行替换" +",不包含星号):" msgstr "" "You can create your MaxCompute (ODPS) entrance object with an extra " "```tunnel_endpoint``` parameter, as shown in the following code. " diff --git a/docs/source/locale/en/LC_MESSAGES/faq.po b/docs/source/locale/en/LC_MESSAGES/faq.po index ab6a6e5c..2752f5eb 100644 --- a/docs/source/locale/en/LC_MESSAGES/faq.po +++ b/docs/source/locale/en/LC_MESSAGES/faq.po @@ -8,14 +8,14 @@ msgid "" msgstr "" "Project-Id-Version: PyODPS 0.7.16\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2023-03-27 15:53+0800\n" +"POT-Creation-Date: 2024-07-28 10:50+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: Babel 2.9.1\n" +"Generated-By: Babel 2.12.1\n" #: ../../source/faq.rst:5 msgid "常见问题" @@ -31,15 +31,16 @@ msgid "" "print(odps.__version__)" msgstr "" +#: ../../source/faq.rst:-1 msgid "怎么配置 SQL / DataFrame 的执行选项?" msgstr "How to configure execution options in SQL or DataFrame" #: ../../source/faq.rst:20 msgid "" -"ODPS SQL 的执行选项可在 `这里 " -"`_ 找到。设置时,可将该选项设置到 ``options.sql.settings``,即" +"ODPS SQL 的执行选项可在 `这里 `_ 找到。设置时,可将该选项设置到 ``options.sql." +"settings``,即" msgstr "" "You can find a list of options for MaxCompute SQL `here " "', hints={'': ''})" +#: ../../source/faq.rst:-1 msgid "读取数据时报\"project is protected\"" msgstr "" "An error occurred while reading data: \"project is protected\". How can I" " deal with this error?" #: ../../source/faq.rst:49 -msgid "Project 上的安全策略禁止读取表中的数据,此时,如果想使用全部数据,有以下选项可用:" +msgid "" +"Project 上的安全策略禁止读取表中的数据,此时,如果想使用全部数据,有以下" +"选项可用:" msgstr "" "The project security policy disables reading data from tables. To " "retrieve all the data, you can apply the following solutions:" @@ -116,7 +120,9 @@ msgid "联系 Project Owner 增加例外规则" msgstr "Contact the Project Owner to add exceptions." #: ../../source/faq.rst:52 -msgid "使用 DataWorks 或其他脱敏工具先对数据进行脱敏,导出到非保护 Project,再进行读取" +msgid "" +"使用 DataWorks 或其他脱敏工具先对数据进行脱敏,导出到非保护 Project,再" +"进行读取" msgstr "" "Use DataWorks or other masking tool to mask the data and export the data " "as an unprotected project before reading." @@ -141,8 +147,8 @@ msgstr "" #: ../../source/faq.rst:61 msgid "" -"如果 ``from odps import errors`` 也不行,则是缺少 ipython 组件,执行 ``pip install -U " -"jupyter`` 解决。" +"如果 ``from odps import errors`` 也不行,则是缺少 ipython 组件,执行 ``" +"pip install -U jupyter`` 解决。" msgstr "" "If running ``from odps import errors`` does not fix the error, you need " "to execute ``pip install -U jupyter`` to install the ipython component." 
@@ -155,14 +161,16 @@ msgstr "" #: ../../source/faq.rst:65 msgid "" -"使用 ``create table as select ...`` 把SQL的结果保存成表,再使用 :ref:`table.open_reader" -" ` 来读取。" +"使用 ``create table as select ...`` 把SQL的结果保存成表,再使用 :ref:`" +"table.open_reader ` 来读取。" msgstr "" "Use ``create table as select ...`` to save the SQL execution result to a " "table, and use :ref:`table.open_reader ` to read data." #: ../../source/faq.rst:68 -msgid "上传 pandas DataFrame 到 ODPS 时报错:ODPSError: ODPS entrance should be provided" +msgid "" +"上传 pandas DataFrame 到 ODPS 时报错:ODPSError: ODPS entrance should be " +"provided" msgstr "" "An error occurred while uploading pandas DataFrame to MaxCompute ODPS: " "ODPSError: ODPS entrance should be provided. How can I deal with this " @@ -235,8 +243,8 @@ msgstr "" #: ../../source/faq.rst:95 msgid "" -"执行 SQL 时报 Please add put { \"odps.sql.submit.mode\" : \"script\"} for " -"multi-statement query in settings" +"执行 SQL 时报 Please add put { \"odps.sql.submit.mode\" : \"script\"} for" +" multi-statement query in settings" msgstr "" "Error \"Please add put { \"odps.sql.submit.mode\" : \"script\"} for " "multi-statement query in settings\" occurred when executing SQL scripts" @@ -247,16 +255,19 @@ msgstr "" "Please read :ref:`set runtime parameters ` for more " "information." +#: ../../source/faq.rst:-1 msgid "如何遍历 PyODPS DataFrame 中的每行数据" msgstr "How to enumerate rows in PyODPS DataFrame" #: ../../source/faq.rst:101 msgid "" -"PyODPS DataFrame 不支持遍历每行数据。这样设计的原因是由于 PyODPS DataFrame 面向大规模数据设计,在这种场景下, " -"数据遍历是非常低效的做法。我们建议使用 DataFrame 提供的 ``apply`` 或 ``map_reduce`` " -"接口将原本串行的遍历操作并行化, 具体可参见 `这篇文章 `_ " -"。如果确认你的场景必须要使用数据遍历, 而且遍历的代价可以接受,可以使用 ``to_pandas`` 方法将 DataFrame 转换为 " -"Pandas DataFrame,或者将 DataFrame 存储为表后使用 ``read_table`` 或者 Tunnel 读取数据。" +"PyODPS DataFrame 不支持遍历每行数据。这样设计的原因是由于 PyODPS " +"DataFrame 面向大规模数据设计,在这种场景下, 数据遍历是非常低效的做法。" +"我们建议使用 DataFrame 提供的 ``apply`` 或 ``map_reduce`` 接口将原本串行" +"的遍历操作并行化, 具体可参见 `这篇文章 `_ 。如果确认你的场景必须要使用数据遍历, 而且遍历的代价可以接受," +"可以使用 ``to_pandas`` 方法将 DataFrame 转换为 Pandas DataFrame,或者将 " +"DataFrame 存储为表后使用 ``read_table`` 或者 Tunnel 读取数据。" msgstr "" "We do not support enumerating over every row in PyODPS DataFrame. As " "PyODPS DataFrame mainly focuses on handling huge amount of data, " @@ -269,6 +280,7 @@ msgstr "" "DataFrame into Pandas, or persist your DataFrame into a MaxCompute table " "and read it via ```read_table``` method or table tunnel." +#: ../../source/faq.rst:-1 msgid "为何调用 to_pandas 后内存使用显著大于表的大小?" msgstr "" "Why memory usage after calling to_pandas is significantly larger than the" @@ -276,9 +288,11 @@ msgstr "" #: ../../source/faq.rst:110 msgid "" -"有两个原因可能导致这个现象发生。首先,MaxCompute 在存储数据时会对数据进行压缩,你看到的表大小应当是压缩后的大小。 其次,Python " -"中的值存在额外的存储开销。例如,对于字符串类型而言,每个 Python 字符串都会额外占用近 40 字节空间, 即便该字符串为空串,这可以通过调用" -" ``sys.getsizeof(\"\")`` 发现。" +"有两个原因可能导致这个现象发生。首先,MaxCompute 在存储数据时会对数据进行" +"压缩,你看到的表大小应当是压缩后的大小。 其次,Python 中的值存在额外的" +"存储开销。例如,对于字符串类型而言,每个 Python 字符串都会额外占用近 40 " +"字节空间, 即便该字符串为空串,这可以通过调用 ``sys.getsizeof(\"\")`` " +"发现。" msgstr "" "Two possible reasons might cause this issue. First, MaxCompute compresses" " table data, and the size you see is the size after compression. 
Second, " @@ -289,21 +303,25 @@ msgstr "" #: ../../source/faq.rst:114 msgid "" -"需要注意的是,使用 Pandas 的 ``info`` 或者 ``memory_usage`` 方法获得的 Pandas DataFrame " -"内存使用可能是不准确的,因为这些方法默认不计算 string 或者其他 object 类型对象的实际内存占用。使用 " -"``df.memory_usage(deep=True).sum()`` 获得的大小更接近实际内存使用,具体可参考 `这篇 Pandas 文档 " -"`_" -" 。" +"需要注意的是,使用 Pandas 的 ``info`` 或者 ``memory_usage`` 方法获得的 " +"Pandas DataFrame 内存使用可能是不准确的,因为这些方法默认不计算 string " +"或者其他 object 类型对象的实际内存占用。使用 ``df.memory_usage(deep=True)" +".sum()`` 获得的大小更接近实际内存使用,具体可参考 `这篇 Pandas 文档 <" +"https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.memory_" +"usage.html>`_ 。" msgstr "" "Note that when using ``info`` or ``memory_usage`` of Pandas to calculate " "the size of your DataFrame might not be accurate, as these methods does " "not take string types or objects into account by default. To get sizes of" -" DataFrames with more accuracy, ``df.memory_usage(deep=True).sum()`` might" -" be used. Details can be seen `in this Pandas document " +" DataFrames with more accuracy, ``df.memory_usage(deep=True).sum()`` " +"might be used. Details can be seen `in this Pandas document " "`_." #: ../../source/faq.rst:119 -msgid "为减小读取数据时的内存开销,可以考虑使用 Arrow 格式,具体可以参考 :ref:`这里 `。" +#, fuzzy +msgid "" +"为减小读取数据时的内存开销,可以考虑使用 Arrow 格式,具体可以参考 :ref:`" +"这里 `。" msgstr "" "To reduce memory usage when reading data, you might try Arrow format. " "Details can be found :ref:`here `." diff --git a/docs/source/locale/en/LC_MESSAGES/index.po b/docs/source/locale/en/LC_MESSAGES/index.po index 5372603e..e369b24d 100644 --- a/docs/source/locale/en/LC_MESSAGES/index.po +++ b/docs/source/locale/en/LC_MESSAGES/index.po @@ -23,9 +23,10 @@ msgstr "" #: ../../source/index.rst:9 msgid "" -"`PyODPS `_ 是 MaxCompute" -" (ODPS) 的 Python 版本 SDK。它提供了对 MaxCompute 对象的基本操作,并提供了 DataFrame 框架,能轻松在 " -"MaxCompute (ODPS) 上进行数据分析。" +"`PyODPS `_ 是 " +"MaxCompute (ODPS) 的 Python 版本 SDK。它提供了对 MaxCompute 对象的" +"基本操作,并提供了 DataFrame 框架,能轻松在 MaxCompute (ODPS) 上进行" +"数据分析。" msgstr "" "`PyODPS `_ is the " "Python SDK of MaxCompute. It supports basic actions on MaxCompute objects" @@ -36,7 +37,9 @@ msgid "安装" msgstr "Installation" #: ../../source/index.rst:18 -msgid "PyODPS 支持 Python 2.7 以上的 Python 版本,包括 Python 3。系统安装了 pip 后,只需运行:" +msgid "" +"PyODPS 支持 Python 2.7 以上的 Python 版本,包括 Python 3。系统安装了 pip " +"后,只需运行:" msgstr "" "PyODPS supports Python 2.7 and later versions (including Python 3). After" " installing PIP in the system, you only need to run" @@ -54,7 +57,9 @@ msgid "快速开始" msgstr "Quick start" #: ../../source/index.rst:31 -msgid "你可以使用阿里云 Access ID / Key 来初始化一个 MaxCompute 的入口(参数值请自行替换,不包含星号)。" +msgid "" +"你可以使用阿里云 Access ID / Key 来初始化一个 MaxCompute 的入口(参数值请" +"自行替换,不包含星号)。" msgstr "" "You can use access id and key of an Alibaba Cloud account to initialize a" " MaxCompute (ODPS) entrance object, as shown in the following code. 
" @@ -98,8 +103,8 @@ msgstr "" #: ../../source/index.rst:49 msgid "" -"如果你使用 `STS Token `_ " -"访问 MaxCompute,可以使用下面的语句初始化 MaxCompute 入口对象:" +"如果你使用 `STS Token `_ 访问 MaxCompute,可以使用下面的语句初始化 MaxCompute 入口对象:" msgstr "" "If you need to use `STS Token `_ to access MaxCompute, you may use code " @@ -149,14 +154,16 @@ msgstr "" #: ../../source/index.rst:72 msgid "" -"在主入口,我们对于主要的 MaxCompute 对象都提供了最基本的几个操作,包括 ``list``、``get``、\\ " -"``exist``、``create``、``delete``。" +"在主入口,我们对于主要的 MaxCompute 对象都提供了最基本的几个操作,包括 ``" +"list``、``get``、\\ ``exist``、``create``、``delete``。" msgstr "" "We provide elementary functions for major MaxCompute objects, including " "``list``, ``get``, ``exist``, ``create`` and ``delete``." #: ../../source/index.rst:75 -msgid "我们会对这几部分来分别展开说明。后文中的 o 对象如无说明均指的是 MaxCompute 入口对象。" +msgid "" +"我们会对这几部分来分别展开说明。后文中的 o 对象如无说明均指的是 " +"MaxCompute 入口对象。" msgstr "" "We will elaborate every object in the next chapters. If not mentioned, " "the variable ``o`` represents the MaxCompute (ODPS) entrance object." diff --git a/docs/source/locale/en/LC_MESSAGES/installation-ext.po b/docs/source/locale/en/LC_MESSAGES/installation-ext.po index 81774d94..94b66304 100644 --- a/docs/source/locale/en/LC_MESSAGES/installation-ext.po +++ b/docs/source/locale/en/LC_MESSAGES/installation-ext.po @@ -23,9 +23,10 @@ msgstr "Installation instructions" #: ../../source/installation-ext.rst:7 msgid "" -"如果能访问外网,推荐使用 pip 安装。较新版本的 Python 通常自带 pip。如果你的 Python 不包含 pip,可以参考 `地址 " -"`_ 安装,推荐使用 `阿里云镜像 " -"`_ 加快下载速度。" +"如果能访问外网,推荐使用 pip 安装。较新版本的 Python 通常自带 pip。如果你" +"的 Python 不包含 pip,可以参考 `地址 `_ 安装,推荐使用 `阿里云镜像 `_ 加快下载速度。" msgstr "" "We recommend that you use pip to install Python on MaxCompute (PyODPS) " "when you have access to the Internet. pip is often installed with latest " @@ -37,8 +38,8 @@ msgstr "" #: ../../source/installation-ext.rst:11 msgid "接着确保 setuptools 的版本:" msgstr "" -"Make sure that you install the correct versions of setuptools by the" -" following command." +"Make sure that you install the correct versions of setuptools by the " +"following command." #: ../../source/installation-ext.rst:13 msgid "pip install setuptools>=3.0" diff --git a/docs/source/locale/en/LC_MESSAGES/options.po b/docs/source/locale/en/LC_MESSAGES/options.po index 91f82f01..3ff402ae 100644 --- a/docs/source/locale/en/LC_MESSAGES/options.po +++ b/docs/source/locale/en/LC_MESSAGES/options.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: PyODPS 0.8.0\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2024-02-16 09:45+0800\n" +"POT-Creation-Date: 2024-07-27 19:27+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -22,7 +22,9 @@ msgid "配置选项" msgstr "Configuration" #: ../../source/options.rst:8 -msgid "PyODPS 提供了一系列的配置选项,可通过 ``odps.options`` 获得,如下面的例子:" +msgid "" +"PyODPS 提供了一系列的配置选项,可通过 ``odps.options`` 获得,如下面的例子" +":" msgstr "" "PyODPS provides a series of configuration options, which can be obtained " "through ``odps.options``. 
Here is a simple example:" @@ -32,10 +34,11 @@ msgid "" "from odps import options\n" "# 设置所有输出表的生命周期(lifecycle 选项)\n" "options.lifecycle = 30\n" -"# 使用 Tunnel 下载 string 类型时使用 bytes(tunnel.string_as_binary 选项)\n" +"# 使用 Tunnel 下载 string 类型时使用 bytes(tunnel.string_as_binary 选项" +")\n" "options.tunnel.string_as_binary = True\n" -"# PyODPS DataFrame 用 ODPS 执行时,参照下面 dataframe 相关配置,sort 时设置 limit 到一个比较大的值" -"\n" +"# PyODPS DataFrame 用 ODPS 执行时,参照下面 dataframe 相关配置,sort 时" +"设置 limit 到一个比较大的值\n" "options.df.odps.sort.limit = 100000000" msgstr "" "from odps import options\n" @@ -107,12 +110,22 @@ msgstr "LogView holding time (hours)" msgid "24" msgstr "" +#: ../../source/options.rst:1 +msgid "quota_name" +msgstr "" + +#: ../../source/options.rst:1 +msgid "提交任务时使用的计算 Quota 名称" +msgstr "Name of computational quota used when submitting tasks" + #: ../../source/options.rst:1 msgid "local_timezone" msgstr "" #: ../../source/options.rst:1 -msgid "使用的时区,None 表示不处理,True 表示本地时区,False 表示 UTC,也可用 pytz 的时区" +msgid "" +"使用的时区,None 表示不处理,True 表示本地时区,False 表示 UTC,也可用 " +"pytz 的时区" msgstr "" "Used time zone. None indicates that PyODPS takes no actions, True " "indicates local time, and False indicates UTC. The time zone of pytz " @@ -303,14 +316,23 @@ msgid "启用 MaxCompute 2.0 语言扩展" msgstr "Enable MaxCompute 2.0 language extension" #: ../../source/options.rst:1 -msgid "sql.always_enable_schema" +msgid "sql.enable_schema" msgstr "" #: ../../source/options.rst:1 msgid "在任何情形下启用 MaxCompute Schema" msgstr "Enable Schema level under any scenario" -#: ../../source/options.rst:56 +#: ../../source/options.rst:1 +msgid "pythonpack.settings" +msgstr "" + +#: ../../source/options.rst:1 +#, fuzzy +msgid "PythonPack运行全局hints" +msgstr "Global hints for PythonPack" + +#: ../../source/options.rst:58 msgid "数据上传/下载配置" msgstr "Data upload/download configurations" @@ -366,7 +388,15 @@ msgstr "Buffer size for block tunnel writers" msgid "20 * 1024 ** 2" msgstr "" -#: ../../source/options.rst:70 +#: ../../source/options.rst:1 +msgid "tunnel.tags" +msgstr "" + +#: ../../source/options.rst:1 +msgid "配置使用 Tunnel 所需的标签" +msgstr "Tags when calling tunnel service" + +#: ../../source/options.rst:73 msgid "DataFrame 配置" msgstr "DataFrame configurations" diff --git a/docs/source/locale/en/LC_MESSAGES/platform-d2.po b/docs/source/locale/en/LC_MESSAGES/platform-d2.po index 3efbecb5..42dcee3a 100644 --- a/docs/source/locale/en/LC_MESSAGES/platform-d2.po +++ b/docs/source/locale/en/LC_MESSAGES/platform-d2.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: PyODPS 0.7.16\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2023-07-19 13:45+0800\n" +"POT-Creation-Date: 2024-04-04 18:39+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -41,8 +41,8 @@ msgstr "Use the ODPS object" #: ../../source/platform-d2.rst:21 msgid "" -"DataWorks 的 PyODPS 节点中,将会包含一个全局的变量 ``odps`` 或者 ``o`` ,即 ODPS 入口。用户不需要手动定义" -" ODPS 入口。" +"DataWorks 的 PyODPS 节点中,将会包含一个全局的变量 ``odps`` 或者 ``o`` ," +"即 ODPS 入口。用户不需要手动定义 ODPS 入口。" msgstr "" "The PyODPS node in DataWorks includes global variable ``odps`` or ``o``, " "which is the ODPS object. 
You do not need to manually define the ODPS " @@ -54,8 +54,9 @@ msgstr "" #: ../../source/platform-d2.rst:29 msgid "" -"o 变量在 PyODPS 节点执行前已经提前赋值。除非确定必须这么做,请不要手动设置该变量,这将导致 ODPS 入口被改写, " -"并可能使节点在生产调度时由于默认账号发生变化从而导致权限错误。" +"o 变量在 PyODPS 节点执行前已经提前赋值。除非确定必须这么做,请不要手动" +"设置该变量,这将导致 ODPS 入口被改写, 并可能使节点在生产调度时由于默认" +"账号发生变化从而导致权限错误。" msgstr "" "Value of variable ``o`` is already set before your PyODPS node is " "executed. Please do not set this variable manually unless you have to, or" @@ -68,14 +69,14 @@ msgid "执行SQL" msgstr "Execute SQL statements" #: ../../source/platform-d2.rst:35 -msgid "可以参考 :ref:`执行SQL文档 ` 。" +msgid "可以参考\\ :ref:`执行SQL文档 `\\ 。" msgstr "For more information, see :ref:`Execute SQL statements ` ." #: ../../source/platform-d2.rst:38 msgid "" -"Dataworks 上默认没有打开 instance tunnel,即 instance.open_reader 默认走 result " -"接口(最多一万条)。 打开 instance tunnel,通过 reader.count 能取到记录数,如果要迭代获取全部数据,则需要关闭 " -"limit 限制。" +"Dataworks 上默认没有打开 instance tunnel,即 instance.open_reader 默认走 " +"result 接口(最多一万条)。 打开 instance tunnel,通过 reader.count 能取" +"到记录数,如果要迭代获取全部数据,则需要关闭 limit 限制。" msgstr "" "Instance tunnel is not enabled by default on Dataworks, thus 10000 " "records can be fetched at most. When instance tunnel is enabled, " @@ -103,8 +104,9 @@ msgstr "" #: ../../source/platform-d2.rst:52 msgid "" -"或者通过在 open_reader 上添加 ``tunnel=True``,来仅对这次 open_reader 打开 instance " -"tunnel; 添加 ``limit=False``,来关闭 limit 限制从而能下载全部数据。" +"或者通过在 open_reader 上添加 ``tunnel=True``,来仅对这次 open_reader " +"打开 instance tunnel; 添加 ``limit=False``,来关闭 limit 限制从而能下载" +"全部数据。" msgstr "" "Also you can add ``tunnel=True`` to open_reader to enable instance tunnel" " for this reader only, and add ``limit=False`` to disable limitation and " @@ -120,8 +122,9 @@ msgstr "" #: ../../source/platform-d2.rst:61 msgid "" -"需要注意的是,部分 Project 可能对 Tunnel 下载全部数据设置了限制,因而打开这些选项可能会导致权限错误。 此时应当与 Project" -" Owner 协调,或者考虑将数据处理放在 MaxCompute 中,而不是下载到本地。" +"需要注意的是,部分 Project 可能对 Tunnel 下载全部数据设置了限制,因而打开" +"这些选项可能会导致权限错误。 此时应当与 Project Owner 协调,或者考虑将" +"数据处理放在 MaxCompute 中,而不是下载到本地。" msgstr "" "Note that some project may limit downloading all data from tables, " "therefore you may get a permission error after configuring these options." @@ -138,8 +141,8 @@ msgstr "Execution" #: ../../source/platform-d2.rst:70 msgid "" -"在 DataWorks 的环境里, :ref:`DataFrame ` 的执行需要显式调用 " -":ref:`立即执行的方法(如execute,head等) ` 。" +"在 DataWorks 的环境里,\\ :ref:`DataFrame ` 的执行需要显式调用\\ :ref" +":`立即执行的方法(如execute,head等) `\\ 。" msgstr "" "To execute :ref:`DataFrame ` in DataWorks, you need to explicitly " "call :ref:`automatically executed actions such as execute and head " @@ -150,7 +153,8 @@ msgid "" "from odps.df import DataFrame\n" "\n" "iris = DataFrame(o.get_table('pyodps_iris'))\n" -"for record in iris[iris.sepal_width < 3].execute(): # 调用立即执行的方法\n" +"for record in iris[iris.sepal_width < 3].execute(): # 调用立即执行的方法" +"\n" " # 处理每条record" msgstr "" "from odps.df import DataFrame\n" @@ -191,8 +195,8 @@ msgstr "Print details" #: ../../source/platform-d2.rst:97 msgid "" -"通过设置 ``options.verbose`` 选项。在 DataWorks 上,默认已经处于打开状态,运行过程会打印 logview " -"等详细过程。" +"通过设置 ``options.verbose`` 选项。在 DataWorks 上,默认已经处于打开状态" +",运行过程会打印 logview 等详细过程。" msgstr "" "To print details, you need to set ``options.verbose``. By default, this " "parameter is set to True in DataWorks. 
The system prints the logview and " @@ -204,9 +208,10 @@ msgstr "Obtain scheduling parameters" #: ../../source/platform-d2.rst:103 msgid "" -"与 DataWorks 中的 SQL 节点不同,为了避免侵入代码,PyODPS 节点 **不会** 在代码中替换 ${param_name} " -"这样的字符串,而是在执行代码前,在全局变量中增加一个名为 ``args`` 的 dict,调度参数可以在此获取。例如, 在节点基本属性 -> " -"参数中设置 ``ds=${yyyymmdd}`` ,则可以通过下面的方式在代码中获取此参数" +"与 DataWorks 中的 SQL 节点不同,为了避免侵入代码,PyODPS 节点 **不会** 在" +"代码中替换 ${param_name} 这样的字符串,而是在执行代码前,在全局变量中增加" +"一个名为 ``args`` 的 dict,调度参数可以在此获取。例如, 在节点基本属性 ->" +" 参数中设置 ``ds=${yyyymmdd}`` ,则可以通过下面的方式在代码中获取此参数" msgstr "" "Different from SQL nodes in DataWorks, to avoid invading your Python code" " which might lead to unpredictable consequences, PyODPS nodes DOES NOT " @@ -242,22 +247,26 @@ msgstr "" #: ../../source/platform-d2.rst:123 msgid "" -"关于如何使用调度参数的详细例子可以参考 `DataWorks 文档 " -"`_ 。" +"关于如何使用调度参数的详细例子可以参考 `DataWorks 文档 `_ 。" msgstr "" "More examples of using schedule parameters can be seen in `DataWorks " "documentation `_ ." #: ../../source/platform-d2.rst:126 -msgid "args 变量在 PyODPS 节点执行前已经提前赋值,请不要手动设置该变量,这将导致调度参数被改写。" +msgid "" +"args 变量在 PyODPS 节点执行前已经提前赋值,请不要手动设置该变量,这将导致" +"调度参数被改写。" msgstr "" "Value of ``arg`` is already set before your PyODPS node is executed. " "Please do not set this variable manually or schedule parameters can be " "overwritten." #: ../../source/platform-d2.rst:128 -msgid "SQL 节点中可用的 ${param_name} 写法不能在 PyODPS 节点中使用, 即便在某些情况下它似乎输出了正确的结果。" +msgid "" +"SQL 节点中可用的 ${param_name} 写法不能在 PyODPS 节点中使用, 即便在某些" +"情况下它似乎输出了正确的结果。" msgstr "" "${param_name} style parameters in SQL nodes shall never be used in PyODPS" " nodes, even if it seems that they produce `correct` results in some " @@ -383,89 +392,128 @@ msgstr "" #: ../../source/platform-d2.rst:150 msgid "" -"如果你需要使用上面列表中不存在的包,DataWorks 节点提供了 ``load_resource_package`` 方法,支持从 " -"MaxCompute 资源下载三方包。使用 ``pyodps-pack`` 打包后,可以直接使用 " -"``load_resource_package`` 方法加载三方包,此后就可以 import 包中的内容。关于 ``pyodps-pack`` " -"的文档可见 :ref:`制作和使用三方包 `。" -msgstr "" -"If you need to use packages not listed above, ``load_resource_package`` " -"method can be used. After calling ``pyodps-pack`` to pack your " -"dependencies, you can call ``load_resource_package`` to install third-" -"party libraries and then use import statement to use them. Details about " -"``pyodps-pack`` can be found :ref:`here `." +"如果你需要使用上面列表中不存在的包,0.12.0 以上版本的 DataWorks PyODPS " +"Python 3 节点提供了 ``resource_pack`` 注释,支持从 MaxCompute 资源下载" +"三方包。使用 ``pyodps-pack`` 打包后,可以直接使用 ``resource_pack`` 注释" +"加载三方包,此后就可以 import 包中的内容。关于 ``pyodps-pack`` 的文档可见" +"\\ :ref:`制作和使用三方包 `。" +msgstr "" +"If you need to use packages not listed above, ``resource_pack`` comment " +"annotation can be used when you are using PyODPS Python 3 node in " +"DataWorks and the version is above 0.12.0. After calling ``pyodps-pack`` " +"to pack your dependencies, you can add a comment with ``resource_pack`` " +"to install third-party libraries and then use import statement to use " +"them. Details about ``pyodps-pack`` can be found :ref:`here " +"`." #: ../../source/platform-d2.rst:156 -msgid "如果为 Python 2 节点打包,请在打包时为 ``pyodps-pack`` 增加 ``--dwpy27`` 参数。" +msgid "" +"如果为 Python 2 节点打包,请在打包时为 ``pyodps-pack`` 增加 ``--dwpy27`` " +"参数。" msgstr "" "If you are creating packages for Python2 nodes, please add ``--dwpy27`` " "argument when calling ``pyodps-pack``." 
#: ../../source/platform-d2.rst:158 +msgid "" +"建议使用 PyODPS 包版本至少为 0.11.3,否则部分生成的包可能无法正常加载。" +"关于 PyODPS 包及节点执行组件的升级可参考\\ :ref:`这个章节 `。" +msgstr "" +"We propose using PyODPS later than 0.11.3 to load third-party libraries " +"with DataWorks, or some packages may not be imported properly. For more " +"information about upgrading PyODPS and node execution component can be " +"seen in :ref:`this chapter `." + +#: ../../source/platform-d2.rst:161 msgid "例如,使用下面的命令打包" msgstr "For instance, we use command below to create package." -#: ../../source/platform-d2.rst:160 +#: ../../source/platform-d2.rst:163 msgid "pyodps-pack -o ipaddress-bundle.tar.gz ipaddress" msgstr "" -#: ../../source/platform-d2.rst:164 +#: ../../source/platform-d2.rst:167 msgid "" -"并上传 / 提交 ``ipaddress-bundle.tar.gz`` 为资源后,可以在 PyODPS 3 节点中按照下面的方法使用 " -"ipaddress 包:" +"并上传 / 提交 ``ipaddress-bundle.tar.gz`` 为资源后,可以在 PyODPS 3 节点" +"中按照下面的方法使用 ipaddress 包(注意注释是必须的):" msgstr "" "After uploading and submitting ``ipaddress-bundle.tar.gz`` as a resource," -" you can use ``ipaddress`` package with code below." +" you can use ``ipaddress`` package with code below. Note that you need to" +" keep the comment line in your code." -#: ../../source/platform-d2.rst:167 +#: ../../source/platform-d2.rst:170 msgid "" -"load_resource_package(\"ipaddress-bundle.tar.gz\")\n" +"# -*- resource_pack: ipaddress-bundle.tar.gz\n" "import ipaddress" msgstr "" -#: ../../source/platform-d2.rst:172 +#: ../../source/platform-d2.rst:175 msgid "" -"DataWorks 限制下载的包总大小为 100MB。如果你需要跳过预装包的打包,可以在打包时使用 ``pyodps-pack`` 提供的 " -"``--exclude`` 参数。例如,下面的打包方法排除了 DataWorks 环境中存在的 numpy 和 pandas 包。" +"DataWorks 限制下载的包总大小为 100MB。如果你需要跳过预装包的打包,可以在" +"打包时使用 ``pyodps-pack`` 提供的 ``--exclude`` 参数。例如,下面的打包" +"方法排除了 DataWorks 环境中存在的 numpy 和 pandas 包。" msgstr "" "DataWorks limits total package size to 100MB. If you want to exclude " "these preinstalled packages, you can add ``--exclude`` argument when " "calling ``pyodps-pack``. For instance, command below excludes numpy and " "scipy which exists in DataWorks environment." -#: ../../source/platform-d2.rst:175 +#: ../../source/platform-d2.rst:178 msgid "" "pyodps-pack -o bundle.tar.gz --exclude numpy --exclude pandas " "" msgstr "" -#: ../../source/platform-d2.rst:180 -msgid "使用其他账号" -msgstr "Use other accounts" +#: ../../source/platform-d2.rst:182 +msgid "你可以在 ``resource_pack`` 中通过逗号分割的方式引入多个包。" +msgstr "" +"You may add multiple packages with ``resource_pack`` by separating them " +"with commas." #: ../../source/platform-d2.rst:184 msgid "" -"``as_account`` 方法从 PyODPS 0.11.3 开始支持。如果你的 DataWorks 未部署该版本,则无法使用该方法。 " -"如果你使用的是独享资源组,可以考虑升级资源组中的 PyODPS 版本,具体可见 `该文档 " -"`_ 。" +"对于 0.11.3 以上版本的 DataWorks PyODPS Python 3 节点,你也可以使用 ``" +"pyodps-pack`` 打包,并在包加载前\\ 使用 ``load_resource_package`` 方法" +"引入三方包:" msgstr "" -"``as_account`` method is supported since PyODPS 0.11.3. The method is not" -" available when DataWorks you are using does not deploy this version. If " -"you are using exclusive resource groups, you may consider upgrading " -"PyODPS to the latest version. Details can be seen in `this document " -"`_." +"For node execution component later tha 0.11.3, you can also create your " +"package with ``pyodps-pack`` and call ``load_resource_package`` before " +"importing your package." 
#: ../../source/platform-d2.rst:187 msgid "" -"在某些情形下你可能希望使用其他账号(而非平台提供的账号)访问 MaxCompute。此时,可以使用 ODPS 入口对象的 " -"``as_account`` 方法创建一个使用新账号的入口对象,该对象与系统默认提供的 ``o`` 实例彼此独立。例如:" +"load_resource_package('ipaddress-bundle.tar.gz')\n" +"import ipaddress" +msgstr "" + +#: ../../source/platform-d2.rst:192 +msgid "" +"需要注意的是,如果你需要使用的三方包已经在预装三方包中,使用 ``load_" +"resource_package`` 可能无法加载所需\\ 的版本,此时建议使用 ``resource_" +"pack`` 注释的方式。" +msgstr "" +"Note that if the third-party library you need is already pre-installed, " +"the expected version may not be imported with ``load_resource_package``. " +"In this case you may use ``resource_pack`` comment." + +#: ../../source/platform-d2.rst:196 +msgid "使用其他账号" +msgstr "Use other accounts" + +#: ../../source/platform-d2.rst:197 +msgid "" +"在某些情形下你可能希望使用其他账号(而非平台提供的账号)访问 MaxCompute。" +"从 PyODPS 0.11.3 开始,可以使用 MaxCompute 入口对象的 ``as_account`` 方法" +"创建一个使用新账号的入口对象,该对象与系统默认提供的 ``o`` 实例彼此独立。" +"例如:" msgstr "" "In some cases you may want to use another account to access MaxCompute " -"instead of the one provided by the platform. You may use ``as_account`` " -"method of ``ODPS`` entrance object to create a new entrance object " -"independent with the variable ``o`` provided by the platform. For " -"instance," +"instead of the one provided by the platform. Since PyODPS 0.11.3, you may" +" use ``as_account`` method of MaxCompute entrance object to create a new " +"entrance object independent with the variable ``o`` provided by the " +"platform. For instance," -#: ../../source/platform-d2.rst:190 +#: ../../source/platform-d2.rst:200 msgid "" "import os\n" "# 确保 ALIBABA_CLOUD_ACCESS_KEY_ID 环境变量设置为 Access Key ID,\n" @@ -488,41 +536,45 @@ msgstr "" " os.getenv('ALIBABA_CLOUD_ACCESS_KEY_SECRET'),\n" ")" -#: ../../source/platform-d2.rst:202 +#: ../../source/platform-d2.rst:212 msgid "问题诊断" -msgstr "" +msgstr "Diagnose problems" -#: ../../source/platform-d2.rst:203 -msgid "如果你的代码在执行中卡死且没有任何输出,你可以在代码头部增加以下注释,DataWorks 每隔 30 秒将输出所有线程的堆栈:" +#: ../../source/platform-d2.rst:213 +msgid "" +"如果你的代码在执行中卡死且没有任何输出,你可以在代码头部增加以下注释," +"0.11.3 以上版本的 DataWorks PyODPS Python 3 节点每隔 30 秒将输出所有线程" +"的堆栈:" msgstr "" "If your code stuck on execution and no outputs emitted, you can add " -"comment shown below to let DataWorks dump stack trace of all threads " -"every 30 seconds." +"comment shown below to let DataWorks PyODPS Python 3 node dumps stack " +"trace of all threads every 30 seconds." -#: ../../source/platform-d2.rst:205 +#: ../../source/platform-d2.rst:216 msgid "# -*- dump_traceback: true -*-" msgstr "" -#: ../../source/platform-d2.rst:210 +#: ../../source/platform-d2.rst:221 msgid "受限功能" msgstr "Feature restriction" -#: ../../source/platform-d2.rst:212 +#: ../../source/platform-d2.rst:223 msgid "由于缺少 matplotlib 等包,所以如下功能可能会受限。" msgstr "" "DataWorks does not have the ``matplotlib`` library. 
Therefore, the " "following features may be restricted:" -#: ../../source/platform-d2.rst:214 +#: ../../source/platform-d2.rst:225 msgid "DataFrame的plot函数" msgstr "DataFrame plot function" -#: ../../source/platform-d2.rst:216 +#: ../../source/platform-d2.rst:227 msgid "" -"DataFrame 自定义函数需要提交到 MaxCompute 执行。由于 Python 沙箱的原因,第三方库只支持所有的纯 Python 库以及" -" Numpy, 因此不能直接使用 Pandas,可参考 :ref:`第三方库支持 ` " -"上传和使用所需的库。DataWorks 中执行的非自定义函数代码可以使用平台预装的 Numpy 和 " -"Pandas。其他带有二进制代码的三方包不被支持。" +"DataFrame 自定义函数需要提交到 MaxCompute 执行。由于 Python 沙箱的原因," +"第三方库只支持所有的纯 Python 库以及 Numpy, 因此不能直接使用 Pandas,可" +"参考\\ :ref:`第三方库支持 `\\ 上传和使用所需的库。" +"DataWorks 中执行的非自定义函数代码可以使用平台预装的 Numpy 和 Pandas。" +"其他带有二进制代码的三方包不被支持。" msgstr "" "Custom functions in DataFrame need to be submitted to MaxCompute before " "execution. Due to Python sandbox, third-party libraries which are written" @@ -533,45 +585,108 @@ msgstr "" "can use pre-installed Numpy and Pandas in DataWorks. Other third-party " "libraries with binary codes are not supported currently." -#: ../../source/platform-d2.rst:220 +#: ../../source/platform-d2.rst:231 msgid "" -"由于兼容性的原因,在 DataWorks 中,`options.tunnel.use_instance_tunnel` 默认设置为 " -"False。如果需要全局开启 Instance Tunnel, 需要手动将该值设置为 True。" +"由于兼容性的原因,在 DataWorks 中,`options.tunnel.use_instance_tunnel` " +"默认设置为 False。如果需要全局开启 Instance Tunnel, 需要手动将该值设置为" +" True。" msgstr "" "For compatibility reasons, `options.tunnel.use_instance_tunnel` in " "DataWorks is set to False by default. To enable Instance Tunnel globally," " you need to manually set `options.tunnel.use_instance_tunnel` to True." -#: ../../source/platform-d2.rst:223 -msgid "由于实现的原因,Python 的 atexit 包不被支持,请使用 try - finally 结构实现相关功能。" +#: ../../source/platform-d2.rst:234 +msgid "" +"由于实现的原因,Python 的 atexit 包不被支持,请使用 try - finally 结构" +"实现相关功能。" msgstr "" "For implementation reasons, the Python atexit package is not supported. " "You need to use the try - finally structure to implement related " "features." -#: ../../source/platform-d2.rst:226 +#: ../../source/platform-d2.rst:237 msgid "使用限制" msgstr "Usage restrictions" -#: ../../source/platform-d2.rst:228 +#: ../../source/platform-d2.rst:239 msgid "" -"在 DataWorks 上使用 PyODPS,为了防止对 DataWorks 的 gateway 造成压力,对内存和 CPU 都有限制。这个限制由" -" DataWorks 统一管理。" +"在 DataWorks 上使用 PyODPS,为了防止对 DataWorks 的 gateway 造成压力,对" +"内存和 CPU 都有限制。这个限制由 DataWorks 统一管理。" msgstr "" "To avoid pressure on the gateway of DataWorks when running PyODPS in " "DataWorks, the CPU and memory usage is restricted. DataWorks provides " "central management for this restriction." -#: ../../source/platform-d2.rst:230 -msgid "如果看到 **Got killed** ,即内存使用超限,进程被 kill。因此,尽量避免本地的数据操作。" +#: ../../source/platform-d2.rst:241 +msgid "" +"如果看到 **Got killed** ,即内存使用超限,进程被 kill。因此,尽量避免本地" +"的数据操作。" msgstr "" "If the system displays **Got killed**, this indicates an out-of-memory " "error and that the process has been terminated. Therefore, we do not " "recommend starting local data operations." -#: ../../source/platform-d2.rst:232 +#: ../../source/platform-d2.rst:243 msgid "通过 PyODPS 起的 SQL 和 DataFrame 任务(除 to_pandas) 不受此限制。" msgstr "" "However, the preceding restriction does not work on SQL and DataFrame " "tasks (except to_pandas) that are initiated by PyODPS." 
+#: ../../source/platform-d2.rst:248 +msgid "升级" +msgstr "Upgrade" + +#: ../../source/platform-d2.rst:250 +msgid "" +"共享资源组中的 DataWorks PyODPS 节点执行组件及 PyODPS 包版本由阿里云维护" +",并会随着 PyODPS 更新而更新。\\ 独享资源组中的节点执行组件及 PyODPS 包则" +"可能在资源组生成时即固定下来。如果你需要使用更新版本 PyODPS 包中\\ 提供的" +"功能(通常指本文档以外的 API),可以参考\\ `该文档 `_\\ 自行升级所需的 PyODPS 版本。需要注意" +"的是,下列功能由 PyODPS 节点执行组件而非 PyODPS 包本身提供。无法通过\\ " +"自行升级进行安装:" +msgstr "" +"PyODPS and DataWorks PyODPS node execution component in shared resource " +"groups is maintained by Alibaba Cloud and will be upgraded when PyODPS " +"package is upgraded. PyODPS and DataWorks PyODPS node execution component" +" in private resource groups are fixed once the resource groups are " +"created. If you need to use functionalities provided by PyODPS package, " +"i.e., functionalities described outside this document, you may take a " +"look at `this document " +"`_" +" to upgrade your PyODPS package. Note that features listed below are " +"provided by the node execution component, not PyODPS package, and cannot " +"be upgraded via self-assisting approach." + +#: ../../source/platform-d2.rst:256 +msgid "调度参数" +msgstr "Scheduling parameters" + +#: ../../source/platform-d2.rst:257 +msgid "通过代码注释提供的能力,例如 ``dump_traceback`` 等" +msgstr "Capabilities provided by comments, for instance, ``dump_traceback``" + +#: ../../source/platform-d2.rst:258 +msgid "``load_resource_package``" +msgstr "" + +#: ../../source/platform-d2.rst:259 +msgid "错误信息自动提示" +msgstr "Automatic hints for errors" + +#: ../../source/platform-d2.rst:261 +msgid "" +"对于 0.11.5 及后续版本的 PyODPS 节点执行组件,当版本与 PyODPS 版本不一致" +"时,会在执行时在日志中同时显示两个\\ 版本号。阿里云将会不定期更新 PyODPS " +"节点执行组件,更新时间点相比共享资源组存在一定的延后。如你对节点执行组件" +"有\\ 更新需求,可以通过工单联系阿里云寻求升级支持。" +msgstr "" +"For node execution components later than 0.11.5, when its version is " +"different from the version of PyODPS, both versions will be shown in the " +"execution log. Alibaba Cloud will upgrade node execution components of " +"node execution component in private resource groups if needed, and the " +"upgrade will be later than shared resource groups. If you need to upgrade" +" node execution component within your private resource group right now, " +"please submit a supporting ticket to seek for assistance from Alibaba " +"Cloud." 
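When the execution-component version and the PyODPS package version may differ (as described above), a quick sketch to confirm which PyODPS package the node actually imports — it only assumes the package exposes ``__version__``:

.. code-block:: python

    import odps

    # version of the PyODPS package loaded in this environment; compare it
    # with the execution-component version printed in the execution log
    print(odps.__version__)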
+ diff --git a/docs/source/locale/en/LC_MESSAGES/platform-migrate-ext.po b/docs/source/locale/en/LC_MESSAGES/platform-migrate-ext.po index b1444510..fdbea9b9 100644 --- a/docs/source/locale/en/LC_MESSAGES/platform-migrate-ext.po +++ b/docs/source/locale/en/LC_MESSAGES/platform-migrate-ext.po @@ -15,7 +15,7 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: Babel 2.5.3\n" +"Generated-By: Babel 2.12.1\n" #: ../../source/platform-migrate-ext.rst:2 msgid "从平台到自行部署" @@ -23,8 +23,8 @@ msgstr "Migration from a platform to local PyODPS" #: ../../source/platform-migrate-ext.rst:3 msgid "" -"如果你需要在本地调试 PyODPS,或者平台中没有提供你所需的包,或者平台的资源限制不能满足你的要求,此时你可能需要 从平台迁移到自己部署的 " -"PyODPS 环境。" +"如果你需要在本地调试 PyODPS,或者平台中没有提供你所需的包,或者平台的资源" +"限制不能满足你的要求,此时你可能需要 从平台迁移到自己部署的 PyODPS 环境。" msgstr "" "If Python on MaxCompute (PyODPS) does not have the required packages or " "resources, and you need to debug PyODPS locally, then you need to migrate" @@ -32,8 +32,9 @@ msgstr "" #: ../../source/platform-migrate-ext.rst:6 msgid "" -"安装 PyODPS 的步骤可以参考 :ref:`安装指南 `。你需要手动创建先前平台为你创建的 ODPS " -"入口对象。可以在先前的平台 使用下列语句生成创建 ODPS 入口对象所需要的语句模板,然后手动修改为可用的代码:" +"安装 PyODPS 的步骤可以参考 :ref:`安装指南 `。你需要手动创建先前" +"平台为你创建的 ODPS 入口对象。可以在先前的平台 使用下列语句生成创建 ODPS " +"入口对象所需要的语句模板,然后手动修改为可用的代码:" msgstr "" "For more information about installing PyODPS, see :ref:`Installation " "instruction `. You need to use PyODPS to manually create the " @@ -50,10 +51,11 @@ msgstr "" #: ../../source/platform-migrate-ext.rst:13 msgid "" -"其中,\\ 和 \\ 需要手动替换成可用的值。access-key 可在 DataWorks " -"中点击右上角图标 -> 用户信息, 再点击“点击查看”获得。Endpoint 可在 `MaxCompute开通Region和服务连接对照表 " -"`_ 获得, 或者联系项目管理员。" +"其中,\\ 和 \\ 需要手动替换成可用的值。access-key " +"可在 DataWorks 中点击右上角图标 -> 用户信息, 再点击“点击查看”获得。" +"Endpoint 可在 `MaxCompute开通Region和服务连接对照表 `_ 获得, 或者联系" +"项目管理员。" msgstr "" "In this code, replace \\ and \\ with applicable " "values. To obtain AccessKeys, move the pointer to the user icon in the " diff --git a/docs/source/locale/en/LC_MESSAGES/platform.po b/docs/source/locale/en/LC_MESSAGES/platform.po index b75b514e..e129b39d 100644 --- a/docs/source/locale/en/LC_MESSAGES/platform.po +++ b/docs/source/locale/en/LC_MESSAGES/platform.po @@ -15,7 +15,7 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: Babel 2.5.3\n" +"Generated-By: Babel 2.12.1\n" #: ../../source/platform.rst:5 msgid "工具平台使用指南" @@ -23,7 +23,14 @@ msgstr "Platform instructions" #: ../../source/platform.rst:7 msgid "" -"PyODPS 可在 DataWorks 等数据开发平台中作为节点调用。这些平台提供了 PyODPS 运行环境,**不需要** 手动创建 ODPS " -"入口对象,免除了手动配置的麻烦,而且还提供了调度执行的能力。对于想从平台迁移到自行部署 PyODPS 环境的用户,下面也提供了迁移注意事项。" +"PyODPS 可在 DataWorks 等数据开发平台中作为节点调用。这些平台提供了 PyODPS" +" 运行环境,**不需要** 手动创建 ODPS 入口对象,免除了手动配置的麻烦,而且" +"还提供了调度执行的能力。对于想从平台迁移到自行部署 PyODPS 环境的用户," +"下面也提供了迁移注意事项。" msgstr "" -"Python on MaxCompute (PyODPS) can be used as a node on data development platforms such as DataWorks. These platforms provide the PyODPS running environment and allow scheduling and execution. Therefore, you **do not need** to manually create the ODPS object. To migrate from these platforms to local PyODPS, see the following migration instructions:" +"Python on MaxCompute (PyODPS) can be used as a node on data development " +"platforms such as DataWorks. These platforms provide the PyODPS running " +"environment and allow scheduling and execution. Therefore, you **do not " +"need** to manually create the ODPS object. 
To migrate from these " +"platforms to local PyODPS, see the following migration instructions:" + diff --git a/docs/source/locale/en/LC_MESSAGES/pyodps-pack-minikube.po b/docs/source/locale/en/LC_MESSAGES/pyodps-pack-minikube.po index 518cd48a..08754a94 100644 --- a/docs/source/locale/en/LC_MESSAGES/pyodps-pack-minikube.po +++ b/docs/source/locale/en/LC_MESSAGES/pyodps-pack-minikube.po @@ -15,7 +15,7 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: Babel 2.9.1\n" +"Generated-By: Babel 2.12.1\n" #: ../../source/pyodps-pack-minikube.rst:4 msgid "使用 Minikube 作为 Docker 环境" @@ -23,9 +23,9 @@ msgstr "Use Minikube as Docker environment" #: ../../source/pyodps-pack-minikube.rst:5 msgid "" -"`Minikube `_ 是一种常用的 Docker Desktop " -"替代环境。与 Docker Desktop 或者 Rancher Desktop 直接通过图形界面启动不同,Minikube " -"需要通过命令行启动并手动配置环境。" +"`Minikube `_ 是一种常用的 Docker " +"Desktop 替代环境。与 Docker Desktop 或者 Rancher Desktop 直接通过图形界面" +"启动不同,Minikube 需要通过命令行启动并手动配置环境。" msgstr "" "`Minikube `_ is a common alternative " "to Docker Desktop. Instead of graphical user interfaces in Docker Desktop" @@ -34,8 +34,8 @@ msgstr "" #: ../../source/pyodps-pack-minikube.rst:8 msgid "" -"依照 `这篇文档 `_ 完成安装 Minikube 后,启动 " -"Minikube:" +"依照 `这篇文档 `_ 完成安装 " +"Minikube 后,启动 Minikube:" msgstr "" "After installing minikube according to `this article " "`_, launch it with command " @@ -56,7 +56,9 @@ msgid "eval $(minikube -p minikube docker-env)" msgstr "" #: ../../source/pyodps-pack-minikube.rst:20 -msgid "此后,即可在当前 Shell 会话中使用 pyodps-pack 进行打包。如果启动新的 Shell 会话,你可能需要重新配置环境变量。" +msgid "" +"此后,即可在当前 Shell 会话中使用 pyodps-pack 进行打包。如果启动新的 " +"Shell 会话,你可能需要重新配置环境变量。" msgstr "" "Then use ``pyodps-pack`` to pack in current shell. If you want to pack " "under new shells, you might need to configure environment variables " @@ -64,8 +66,8 @@ msgstr "" #: ../../source/pyodps-pack-minikube.rst:22 msgid "" -"对于 Windows 用户,可能需要使用 HyperV 作为默认 VM 驱动,参见 `这篇文档 " -"`_:" +"对于 Windows 用户,可能需要使用 HyperV 作为默认 VM 驱动,参见 `这篇文档 <" +"https://minikube.sigs.k8s.io/docs/drivers/hyperv/>`_:" msgstr "" "Windows users might need to use HyperV as default VM driver, see `this " "document `_ for more " @@ -98,9 +100,9 @@ msgstr "" #: ../../source/pyodps-pack-minikube.rst:40 msgid "" -"关于如何使用 Minikube 的进一步信息请参考 `Minikube 文档 " -"`_ 。" +"关于如何使用 Minikube 的进一步信息请参考 `Minikube 文档 `_ 。" msgstr "" -"Please take a look at `minikube documents `_" -" for more details about Minikube." +"Please take a look at `minikube documents " +"`_ for more details about Minikube." 
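Tying back to the migration notes in ``platform-migrate-ext.po`` above: after generating the statement template on the platform, the hand-edited local code usually reduces to constructing the entry object directly. A minimal sketch, with credentials read from environment variables as recommended earlier and placeholder project / endpoint values:

.. code-block:: python

    import os
    from odps import ODPS

    # <your-project> and <your-endpoint> are placeholders to be replaced
    # with values from your administrator or the endpoint reference table
    o = ODPS(
        os.getenv("ALIBABA_CLOUD_ACCESS_KEY_ID"),
        os.getenv("ALIBABA_CLOUD_ACCESS_KEY_SECRET"),
        project="<your-project>",
        endpoint="<your-endpoint>",
    )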
diff --git a/docs/source/locale/en/LC_MESSAGES/pyodps-pack.po b/docs/source/locale/en/LC_MESSAGES/pyodps-pack.po index 4e84c64e..fa82c816 100644 --- a/docs/source/locale/en/LC_MESSAGES/pyodps-pack.po +++ b/docs/source/locale/en/LC_MESSAGES/pyodps-pack.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: PyODPS 0.11.3\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2024-07-08 17:12+0800\n" +"POT-Creation-Date: 2024-08-29 19:10+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -27,10 +27,11 @@ msgstr "Create third-party libraries" #: ../../source/pyodps-pack.rst:8 msgid "" -"PyODPS 自 0.11.3 起提供了 ``pyodps-pack`` 命令行工具,用于制作符合 PyODPS 及 DataWorks " -"PyODPS 节点标准的三方包,使用方法类似 ``pip`` 命令。你可以使用该工具将所有依赖项目制作成一个 ``.tar.gz`` " -"压缩包,其中包含所有依照 MaxCompute / DataWorks 环境编译并打包的项目依赖。如果你的项目有自行创建的 Python " -"包,也可以使用该工具进行打包。" +"PyODPS 自 0.11.3 起提供了 ``pyodps-pack`` 命令行工具,用于制作符合 PyODPS" +" 及 DataWorks PyODPS 节点标准的三方包,使用方法类似 ``pip`` 命令。你可以" +"使用该工具将所有依赖项目制作成一个 ``.tar.gz`` 压缩包,其中包含所有依照 " +"MaxCompute / DataWorks 环境编译并打包的项目依赖。如果你的项目有自行创建的" +" Python 包,也可以使用该工具进行打包。" msgstr "" "PyODPS provides a pip-like command line tool, ``pyodps-pack``, to support" " creating third-party library bundles that can be used in PyODPS and " @@ -49,16 +50,17 @@ msgstr "Docker mode" #: ../../source/pyodps-pack.rst:17 msgid "" -"你需要安装 Docker 以顺利使用 Docker 模式运行 ``pyodps-pack``。 ``pyodps-pack`` 会自动调用已安装的" -" Docker,你不需要手动在 Docker 中运行 ``pyodps-pack``。对于 Linux 环境,可以参考 `Docker 官方文档 " -"`_ 安装 Docker。对于 MacOS / " -"Windows,个人开发者可以使用 `Docker Desktop `_ 。对于没有购买过授权的企业用户,推荐使用开源的 `Rancher Desktop " -"`_ ( `中国内地镜像 " -"`_ )。你也可以考虑使用 `minikube `_, " -"但需要一些额外的步骤,见 :ref:`这份文档 `。我们没有在其他 Docker 环境中测试 ``pyodps-" -"pack`` ,不保证在这些环境中的可用性。" +"你需要安装 Docker 以顺利使用 Docker 模式运行 ``pyodps-pack``。 ``pyodps-" +"pack`` 会自动调用已安装的 Docker,你不需要手动在 Docker 中运行 ``pyodps-" +"pack``。对于 Linux 环境,可以参考 `Docker 官方文档 `_ 安装 Docker。对于 MacOS / Windows,个人开发者可以" +"使用 `Docker Desktop `_ " +"。对于没有购买过授权的企业用户,推荐使用开源的 `Rancher Desktop `_ ( `中国内地镜像 `_ )。你也可以考虑使用 `" +"minikube `_, 但需要一些额外的步骤," +"见 :ref:`这份文档 `。我们没有在其他 Docker 环境中测试 ``" +"pyodps-pack`` ,不保证在这些环境中的可用性。" msgstr "" "You need to install Docker to run ``pyodps-pack`` correctly in Docker " "mode. You don't need to run ``pyodps-pack`` manually inside a Docker " @@ -76,10 +78,10 @@ msgstr "" #: ../../source/pyodps-pack.rst:26 msgid "" -"对于期望在版本较老的专有云中的 MaxCompute / DataWorks 使用 ``--legacy-image`` 选项打包的用户,在 " -"Windows / MacOS 或者部分内核的 Linux 系统中可能出现无法打包的错误,请参考 `本文 " -"`_ 配置合适的打包环境。" +"对于期望在版本较老的专有云中的 MaxCompute / DataWorks 使用 ``--legacy-" +"image`` 选项打包的用户,在 Windows / MacOS 或者部分内核的 Linux 系统中" +"可能出现无法打包的错误,请参考 `本文 `_ 配置合适的打包环境。" msgstr "" "For users who want to create packages for legacy MaxCompute / DataWorks " "in private clouds, ``--legacy-image`` option might be used. In Windows, " @@ -90,10 +92,12 @@ msgstr "" #: ../../source/pyodps-pack.rst:31 msgid "" -"对于 Windows 用户,可能你的 Docker 服务需要依赖 Windows 系统的 Server 服务才能启动,而 Server " -"服务由于安全问题在很多企业被禁止启动。 在遇到问题时,请改用 Linux 打包或者设法启用 Server 服务。Rancher Desktop 在" -" Windows 10 下可能无法使用 ``containerd`` 作为容器引擎,可以尝试改用 ``dockerd`` ,具体参考 `该文档 " -"`_ 进行配置。" +"对于 Windows 用户,可能你的 Docker 服务需要依赖 Windows 系统的 Server " +"服务才能启动,而 Server 服务由于安全问题在很多企业被禁止启动。 在遇到问题" +"时,请改用 Linux 打包或者设法启用 Server 服务。Rancher Desktop 在 Windows" +" 10 下可能无法使用 ``containerd`` 作为容器引擎,可以尝试改用 ``dockerd`` " +",具体参考 `该文档 `_ 进行配置。" msgstr "" "For Windows users, it is possible that your Docker service depends on " "Server service of Windows system. 
However, this service is often " @@ -106,9 +110,10 @@ msgstr "" #: ../../source/pyodps-pack.rst:36 msgid "" -"如果你的 MaxCompute / DataWorks 基于 ARM64 机型部署(通常是专有云),你需要额外增加 ``--arch " -"aarch64`` 参数指定打包需要的架构。通常 Docker Desktop / Rancher Desktop 已经安装了跨平台打包所需的 " -"``binfmt`` 相关组件,你也可以使用命令" +"如果你的 MaxCompute / DataWorks 基于 ARM64 机型部署(通常是专有云),你" +"需要额外增加 ``--arch aarch64`` 参数指定打包需要的架构。通常 Docker " +"Desktop / Rancher Desktop 已经安装了跨平台打包所需的 ``binfmt`` 相关组件" +",你也可以使用命令" msgstr "" "If your MaxCompute or DataWorks are deployed on ARM64 architecture " "(usually within proprietary clouds), you need to add an extra ``--arch " @@ -123,8 +128,8 @@ msgstr "" #: ../../source/pyodps-pack.rst:43 msgid "" -"安装相关的虚拟环境。该命令要求 Linux Kernel 版本高于 4.8,具体可以参考 `该页面 " -"`_。" +"安装相关的虚拟环境。该命令要求 Linux Kernel 版本高于 4.8,具体可以参考 `" +"该页面 `_。" msgstr "" "This command requires version of Linux kernel above 4.8. Details of the " "command can be found in `this article " @@ -135,7 +140,9 @@ msgid "无 Docker 模式" msgstr "Non-Docker mode" #: ../../source/pyodps-pack.rst:55 -msgid "我们建议在打包时,尽量使用 Docker 模式。非 Docker 模式仅用于 Docker 不可用的场景,且生成的包有可能不可用。" +msgid "" +"我们建议在打包时,尽量使用 Docker 模式。非 Docker 模式仅用于 Docker 不可" +"用的场景,且生成的包有可能不可用。" msgstr "" "We recommend using Docker mode to create packages if possible. Non-Docker" " mode might be used only when Docker is not available. It is also " @@ -143,16 +150,17 @@ msgstr "" #: ../../source/pyodps-pack.rst:57 msgid "" -"如果你安装 Docker 遇到困难,你可以尝试使用非 Docker 模式。使用方式为新增一个 ``--without-docker`` " -"参数。该模式需要你的 Python 环境中已经安装 pip。如果使用该模式出现错误,请改用 Docker 模式。Windows 用户需要安装 " -"Git bash 以使用该模式,Git bash 包含在 `Git for Windows " -"`_ 中。" +"如果你安装 Docker 遇到困难,你可以尝试使用非 Docker 模式。使用方式为新增" +"一个 ``--no-docker`` 参数。该模式需要你的 Python 环境中已经安装 pip。如果" +"使用该模式出现错误,请改用 Docker 模式。Windows 用户需要安装 Git bash 以" +"使用该模式,Git bash 包含在 `Git for Windows `" +"_ 中。" msgstr "" "When you have problems installing Docker, you might try non-Docker mode " -"by adding a ``--without-docker`` argument. When using non-Docker mode, " -"``pip`` is needed in your Python installation. Windows users need to " -"install Git bash to use non-Docker mode, which is included in `Git for " -"Windows `_." +"by adding a ``--no-docker`` argument. When using non-Docker mode, ``pip``" +" is needed in your Python installation. Windows users need to install Git" +" bash to use non-Docker mode, which is included in `Git for Windows " +"`_." #: ../../source/pyodps-pack.rst:62 msgid "打包所有依赖" @@ -160,8 +168,9 @@ msgstr "Pack all dependencies" #: ../../source/pyodps-pack.rst:65 msgid "" -"MaxCompute 建议除非不得已,新项目请尽量使用 Python 3。我们不保证下面的打包步骤对 Python 2 的可用性。 " -"旧项目在可能的情况下请尽量迁移到 Python 3 以减少后续维护的难度。" +"MaxCompute 建议除非不得已,新项目请尽量使用 Python 3。我们不保证下面的" +"打包步骤对 Python 2 的可用性。 旧项目在可能的情况下请尽量迁移到 Python 3 " +"以减少后续维护的难度。" msgstr "" "It is recommended to use Python 3 for new projects. We do not guarantee " "availability of methods below for Python 2. You might try your best " @@ -169,19 +178,25 @@ msgstr "" "maintenance in future." #: ../../source/pyodps-pack.rst:68 -msgid "在 Linux 中使用下列命令时,请使用 ``sudo`` 调用 ``pyodps-pack`` 以保证 Docker 正常运行。" +msgid "" +"在 Linux 中使用下列命令时,请使用 ``sudo`` 调用 ``pyodps-pack`` 以保证 " +"Docker 正常运行。" msgstr "" "Please add ``sudo`` when calling ``pyodps-pack`` in Linux to make sure " "Docker is called correctly." 
#: ../../source/pyodps-pack.rst:70 -msgid "在 macOS 中使用下列命令时,\\ **不建议**\\ 使用 ``sudo``,这可能会导致无法预期的权限问题。" +msgid "" +"在 macOS 中使用下列命令时,\\ **不建议**\\ 使用 ``sudo``,这可能会导致" +"无法预期的权限问题。" msgstr "" "Please avoid using ``sudo`` when calling ``pyodps-pack`` in macOS to " "avoid potential permission errors." #: ../../source/pyodps-pack.rst:72 -msgid "安装完 PyODPS 后,你可以使用下面的命令为 Python 3 打包 pandas 及所有 pandas 的依赖项:" +msgid "" +"安装完 PyODPS 后,你可以使用下面的命令为 Python 3 打包 pandas 及所有 " +"pandas 的依赖项:" msgstr "" "After PyODPS is installed, you can use command below to pack pandas and " "all its dependencies." @@ -195,7 +210,7 @@ msgid "使用非 Docker 模式打包,可以用" msgstr "If you want to use non-Docker mode to pack, you can use" #: ../../source/pyodps-pack.rst:80 -msgid "pyodps-pack --without-docker pandas" +msgid "pyodps-pack --no-docker pandas" msgstr "" #: ../../source/pyodps-pack.rst:84 @@ -224,7 +239,9 @@ msgid "" msgstr "" #: ../../source/pyodps-pack.rst:102 -msgid "并在当前目录中生成一个 ``packages.tar.gz`` 文件,其中包括上面列出的所有依赖项目。" +msgid "" +"并在当前目录中生成一个 ``packages.tar.gz`` 文件,其中包括上面列出的所有" +"依赖项目。" msgstr "" "and generates a ``packages.tar.gz`` with all dependency items listed " "above." @@ -232,9 +249,10 @@ msgstr "" #: ../../source/pyodps-pack.rst:104 msgid "" "如果你希望为 Python 2.7 打包,请确定你的包要在 MaxCompute 还是 DataWorks " -"中使用。如果你不确定你的包将在哪个环境中使用, 请参考 `这篇文章 `_ 。 如果要在 " -"MaxCompute 中使用 Python 2.7 包,可以使用下面的打包命令:" +"中使用。如果你不确定你的包将在哪个环境中使用, 请参考 `这篇文章 `_ 。 如果要在 MaxCompute 中使用 Python 2.7 包,可以使用下面" +"的打包命令:" msgstr "" "If you need to create packages for Python 2.7, please check which " "environment your package will work with, MaxCompute or DataWorks. If you " @@ -263,9 +281,10 @@ msgstr "Pack custom source code" #: ../../source/pyodps-pack.rst:120 msgid "" -"``pyodps-pack`` 支持打包使用 ``setup.py`` 或者 ``pyproject.toml`` 组织的用户自定义 Python" -" project。如果你之前从未 接触过相关知识,可以参考 `这个链接 " -"`_ 获取更多信息。" +"``pyodps-pack`` 支持打包使用 ``setup.py`` 或者 ``pyproject.toml`` 组织的" +"用户自定义 Python project。如果你之前从未 接触过相关知识,可以参考 `这个" +"链接 `_ 获取更多" +"信息。" msgstr "" "``pyodps-pack`` supports packing user-defined source code organized with " "``setup.py`` or ``pyproject.toml``. If you want to know how to build " @@ -274,7 +293,9 @@ msgstr "" "information." #: ../../source/pyodps-pack.rst:123 -msgid "下面用基于 ``pyproject.toml`` 组织的项目举例介绍一下如何使用 ``pyodps-pack`` 打包。假定项目的目录结构如下:" +msgid "" +"下面用基于 ``pyproject.toml`` 组织的项目举例介绍一下如何使用 ``pyodps-" +"pack`` 打包。假定项目的目录结构如下:" msgstr "" "We show how to pack custom code by creating a custom package with " "``pyproject.toml`` and packing with ``pyodps-pack``. Assuming that the " @@ -309,8 +330,8 @@ msgstr "" #: ../../source/pyodps-pack.rst:148 msgid "" -"完成包的开发后,使用下面的命令可以将此包和所有依赖打包进 ``packages.tar.gz`` 文件中( ``path_to_package``" -" 为 ``test_package_root`` 的上级路径):" +"完成包的开发后,使用下面的命令可以将此包和所有依赖打包进 ``packages.tar." +"gz`` 文件中( ``path_to_package`` 为 ``test_package_root`` 的上级路径):" msgstr "" "After development of the package, we can pack this package and all the " "dependencies into ``packages.tar.gz``. (``path_to_package`` is the parent" @@ -326,8 +347,8 @@ msgstr "Pack code in a Git repository" #: ../../source/pyodps-pack.rst:157 msgid "" -"``pyodps-pack`` 支持打包远程 Git 代码仓库(例如 Github)中的代码。以 PyODPS " -"本身为例,可以使用下面的命令执行打包:" +"``pyodps-pack`` 支持打包远程 Git 代码仓库(例如 Github)中的代码。以 " +"PyODPS 本身为例,可以使用下面的命令执行打包:" msgstr "" "Packing remote Git repositories is supported in ``pyodps-pack``. 
We take " "PyODPS repository as an example to show how to pack a remote Git " @@ -349,9 +370,10 @@ msgstr "" #: ../../source/pyodps-pack.rst:169 msgid "" -"如果打包前需要安装一些打包依赖(例如 ``cython``),可以使用 ``--install-requires`` 参数增加安装时依赖。 " -"也可以编写一个格式与 ``requirements.txt`` 相同的 ``install-requires.txt`` 文件,并使用 " -"``--install-requires-file`` 选项指定。例如,如果需要先安装 ``Cython`` 再打包 PyODPS,可以写" +"如果打包前需要安装一些打包依赖(例如 ``cython``),可以使用 ``--install-" +"requires`` 参数增加安装时依赖。 也可以编写一个格式与 ``requirements.txt``" +" 相同的 ``install-requires.txt`` 文件,并使用 ``--install-requires-file``" +" 选项指定。例如,如果需要先安装 ``Cython`` 再打包 PyODPS,可以写" msgstr "" "If you want to install dependencies on build, for instance, ``cython``, " "you can use ``--install-requires`` argument to specify a build-time " @@ -392,9 +414,10 @@ msgstr "A more complicated case: adding binary dependencies" #: ../../source/pyodps-pack.rst:195 msgid "" -"一部分包包含额外的二进制依赖,例如需要编译 / 安装的外部动态链接库。``pyodps-pack`` 提供了 ``--run-before`` " -"参数用以指定打包前需要执行的步骤,该步骤中可以安装所需的二进制依赖。 我们用地理信息库 `GDAL `_ " -"来说明如何打包。" +"一部分包包含额外的二进制依赖,例如需要编译 / 安装的外部动态链接库。``" +"pyodps-pack`` 提供了 ``--run-before`` 参数用以指定打包前需要执行的步骤," +"该步骤中可以安装所需的二进制依赖。 我们用地理信息库 `GDAL `_ 来说明如何打包。" msgstr "" "Some third-party libraries depend on extra binary dependencies, for " "instance, extra dynamically-linked libraries needed to be built and " @@ -405,10 +428,11 @@ msgstr "" #: ../../source/pyodps-pack.rst:199 msgid "" -"首先确定打包时需要安装的二进制依赖。根据 GDAL 3.6.0.1 在 `PyPI 上的文档 " -"`_ ,我们需要安装 3.6.0 以上版本的 libgdal。 `libgdal " -"的编译说明 `_ 则指出,该包依赖 6.0 以上的 " -"PROJ 包,这两个二进制包均使用 CMake 打包。据此,编写二进制包安装文件并保存为 ``install-gdal.sh``:" +"首先确定打包时需要安装的二进制依赖。根据 GDAL 3.6.0.1 在 `PyPI 上的文档 <" +"https://pypi.org/project/GDAL/>`_ ,我们需要安装 3.6.0 以上版本的 libgdal" +"。 `libgdal 的编译说明 `_ " +"则指出,该包依赖 6.0 以上的 PROJ 包,这两个二进制包均使用 CMake 打包。" +"据此,编写二进制包安装文件并保存为 ``install-gdal.sh``:" msgstr "" "First, we need to find which dependencies needed to install. Given `the " "document of GDAL 3.6.0.1 on PyPI `_, we " @@ -455,10 +479,12 @@ msgstr "" #: ../../source/pyodps-pack.rst:232 msgid "" -"在某些情况下,二进制依赖被通过动态链接(例如使用 ``ctypes.cdll.LoadLibrary`` )引入到 Python " -"中。此时,你可以使用 ``--dynlib`` 参数指定需要包含在包中的二进制依赖路径(或者 /lib 下的包名),该依赖将被打包到 " -"``packages/dynlibs`` 路径下。例如,Python 库 ``unrar`` 动态链接了 ``libunrar`` " -"这个二进制库,我们使用下面的 ``install-libunrar.sh`` 代码编译和安装:" +"在某些情况下,二进制依赖被通过动态链接(例如使用 ``ctypes.cdll." +"LoadLibrary`` )引入到 Python 中。此时,你可以使用 ``--dynlib`` 参数指定" +"需要包含在包中的二进制依赖路径(或者 /lib 下的包名),该依赖将被打包到 ``" +"packages/dynlibs`` 路径下。例如,Python 库 ``unrar`` 动态链接了 ``" +"libunrar`` 这个二进制库,我们使用下面的 ``install-libunrar.sh`` 代码编译" +"和安装:" msgstr "" "In some scenarios binary dependencies are dynamically linked (for " "instance, with ``ctypes.cdll.LoadLibrary``) into Python. You may use " @@ -497,8 +523,9 @@ msgstr "" #: ../../source/pyodps-pack.rst:256 msgid "" -"在上述命令中, ``--dynlib`` 的值 ``unrar`` 省略了 lib 前缀, ``pyodps-pack`` 实际找到的是 " -"``/lib/libunrar.so`` 。如果有多个动态链接库, ``--dynlib`` 可被指定多次。" +"在上述命令中, ``--dynlib`` 的值 ``unrar`` 省略了 lib 前缀, ``pyodps-" +"pack`` 实际找到的是 ``/lib/libunrar.so`` 。如果有多个动态链接库, ``--" +"dynlib`` 可被指定多次。" msgstr "" "In the above command, the value ``unrar`` for ``--dynlib`` omits prefix " "``lib``, and what ``pyodps-pack`` actually finds is ``/lib/libunrar.so``." @@ -506,7 +533,9 @@ msgstr "" "specify ``--dynlib`` multiple times." 
#: ../../source/pyodps-pack.rst:259 -msgid "由于动态链接库的复杂性,你可能需要在 import 你的三方库前手动加载动态链接库,例如" +msgid "" +"由于动态链接库的复杂性,你可能需要在 import 你的三方库前手动加载" +"动态链接库,例如" msgstr "" "Due to complexity of dynamically-linked libraries, you may need to load " "these libraries manually before actually importing your Python library. " @@ -522,8 +551,8 @@ msgstr "" #: ../../source/pyodps-pack.rst:267 msgid "" -"对 ``LoadLibrary`` 路径的具体说明请参考 :ref:`Python UDF 使用三方包 ` " -"中的说明。" +"对 ``LoadLibrary`` 路径的具体说明请参考 :ref:`Python UDF 使用三方包 <" +"pyodps_pack_udf>` 中的说明。" msgstr "" "Detail information about path used in ``LoadLibrary`` in code above can " "be seen in directions in :ref:`using third-party libraries in Python UDF " @@ -558,7 +587,9 @@ msgid "``--install-requires ``" msgstr "" #: ../../source/pyodps-pack.rst:284 -msgid "指定打包时所需的 PyPI 依赖,可指定多个。这些依赖 **不一定** 会包含在最终的包中。" +msgid "" +"指定打包时所需的 PyPI 依赖,可指定多个。这些依赖 **不一定** 会包含在最终" +"的包中。" msgstr "" "Specify build-time requirements, might not be included in the final " "package." @@ -568,7 +599,9 @@ msgid "``--install-requires-file ``" msgstr "" #: ../../source/pyodps-pack.rst:288 -msgid "指定打包时所需的 PyPI 依赖定义文件,可指定多个。这些依赖 **不一定** 会包含在最终的包中。" +msgid "" +"指定打包时所需的 PyPI 依赖定义文件,可指定多个。这些依赖 **不一定** 会" +"包含在最终的包中。" msgstr "" "Specify build-time requirements in files, might not be included in the " "final package." @@ -616,7 +649,9 @@ msgid "``--proxy ``" msgstr "" #: ../../source/pyodps-pack.rst:308 -msgid "指定打包所用的代理服务器,以 scheme://[user:passwd@]proxy.server:port 这样的形式。" +msgid "" +"指定打包所用的代理服务器,以 scheme://[user:passwd@]proxy.server:port " +"这样的形式。" msgstr "Specify a proxy in the form scheme://[user:passwd@]proxy.server:port." #: ../../source/pyodps-pack.rst:310 @@ -643,8 +678,8 @@ msgstr "" #: ../../source/pyodps-pack.rst:320 msgid "" -"指定打包时所需的仓库索引 URL。如果缺省,会使用 ``pip config list`` 命令返回的 ``global.index-url`` " -"值,该值通常配置在 ``pip.conf`` 配置文件中。" +"指定打包时所需的仓库索引 URL。如果缺省,会使用 ``pip config list`` 命令" +"返回的 ``global.index-url`` 值,该值通常配置在 ``pip.conf`` 配置文件中。" msgstr "" "Specify URL of package indexes of PyPI package. If absent, will use " "``global.index-url`` in ``pip config list`` command by default." @@ -654,7 +689,9 @@ msgid "``--extra-index-url ``" msgstr "" #: ../../source/pyodps-pack.rst:325 -msgid "指定除 ``--index-url`` 之外需要使用的仓库索引 URL,规则与 ``--index-url`` 类似。" +msgid "" +"指定除 ``--index-url`` 之外需要使用的仓库索引 URL,规则与 ``--index-url``" +" 类似。" msgstr "" "Extra URLs of package indexes to use in addition to ``--index-url``. " "Should follow the same rules as ``--index-url``." @@ -674,7 +711,9 @@ msgid "``-l``, ``--legacy-image``" msgstr "" #: ../../source/pyodps-pack.rst:333 -msgid "指定后,将使用 CentOS 5 镜像进行打包,这使得包可以被用在旧版专有云等环境中。" +msgid "" +"指定后,将使用 CentOS 5 镜像进行打包,这使得包可以被用在旧版专有云等环境" +"中。" msgstr "" "If specified, will use CentOS 5 to pack, making the final package " "available under old environments such as legacy proprietary clouds." @@ -684,7 +723,9 @@ msgid "``--mcpy27``" msgstr "" #: ../../source/pyodps-pack.rst:337 -msgid "指定后,将为 MaxCompute 内的 Python 2.7 制作三方包。如果启用,将默认 ``--legacy-image`` 选项开启。" +msgid "" +"指定后,将为 MaxCompute 内的 Python 2.7 制作三方包。如果启用,将默认 ``--" +"legacy-image`` 选项开启。" msgstr "" "If specified, will build packages for Python 2.7 on MaxCompute and assume" " ``--legacy-image`` is enabled." 
@@ -694,7 +735,9 @@ msgid "``--dwpy27``" msgstr "" #: ../../source/pyodps-pack.rst:341 -msgid "指定后,将为 DataWorks 内的 Python 2.7 制作三方包。如果启用,将默认 ``--legacy-image`` 选项开启。" +msgid "" +"指定后,将为 DataWorks 内的 Python 2.7 制作三方包。如果启用,将默认 ``--" +"legacy-image`` 选项开启。" msgstr "" "If specified, will build packages for Python 2.7 on DataWorks and assume " "``--legacy-image`` is enabled." @@ -715,8 +758,9 @@ msgstr "" #: ../../source/pyodps-pack.rst:349 msgid "" -"指定目标包面向的硬件架构,目前仅支持 x86\\_64 和 aarch64(或 arm64),默认为 x86\\_64。如果你并不在专有云使用 " -"MaxCompute 或 DataWorks,**不要指定这个参数**。" +"指定目标包面向的硬件架构,目前仅支持 x86\\_64 和 aarch64(或 arm64)," +"默认为 x86\\_64。如果你并不在专有云使用 MaxCompute 或 DataWorks,**不要" +"指定这个参数**。" msgstr "" "Specify the hardware architecture for the package. Currently only " "x86\\_64 and aarch64 (or equivalently arm64) supported. x86_64 by " @@ -729,8 +773,8 @@ msgstr "" #: ../../source/pyodps-pack.rst:354 msgid "" -"指定目标面向的 Python 版本,可使用 3.6 或者 36 表示 Python 3.6。如果你并不在专有云使用 MaxCompute 或 " -"DataWorks,**不要指定这个参数**。" +"指定目标面向的 Python 版本,可使用 3.6 或者 36 表示 Python 3.6。如果你并" +"不在专有云使用 MaxCompute 或 DataWorks,**不要指定这个参数**。" msgstr "" "Specify Python version for the package. You may use 3.6 or 36 to stand " "for Python 3.6. If you are not running your code inside a proprietary " @@ -742,9 +786,10 @@ msgstr "" #: ../../source/pyodps-pack.rst:359 msgid "" -"指定后,将引入 .so 动态链接库,可以指定具体路径,也可以指定库名(包含或不包含 lib 前缀均可)。 ``pyodps-pack`` " -"将在/lib、/lib64、/usr/lib、/usr/lib64中查找对应库,并置入包中 packages/dynlibs " -"下。你可能需要手动调用 ``ctypes.cdll.LoadLibrary`` 在相应包路径引用这些库。" +"指定后,将引入 .so 动态链接库,可以指定具体路径,也可以指定库名(包含或不" +"包含 lib 前缀均可)。 ``pyodps-pack`` 将在/lib、/lib64、/usr/lib、/usr/" +"lib64中查找对应库,并置入包中 packages/dynlibs 下。你可能需要手动调用 ``" +"ctypes.cdll.LoadLibrary`` 在相应包路径引用这些库。" msgstr "" "Specify .so libraries to link dynamically. You may specify a path to the " "required library, or just the name of the library (with or without lib " @@ -760,26 +805,28 @@ msgstr "" #: ../../source/pyodps-pack.rst:365 msgid "" -"指定在执行 Docker 命令时需要额外附加的参数。如有多个参数需用引号包裹,例如 ``--docker-args \"--ip " -"192.168.1.10\"``。" +"指定在执行 Docker 命令时需要额外附加的参数。如有多个参数需用引号包裹," +"例如 ``--docker-args \"--ip 192.168.1.10\"``。" msgstr "" "Specify extra arguments needed for Docker command. If there are more than" " one argument, please put them within quote marks. For instance, " "``--docker-args \"--ip 192.168.1.10\"``." #: ../../source/pyodps-pack.rst:367 -msgid "``--without-docker``" +msgid "``--no-docker``" msgstr "" #: ../../source/pyodps-pack.rst:369 -msgid "使用无 Docker 模式运行 ``pyodps-pack``。当依赖中存在二进制依赖,可能报错或导致包不可用。" +msgid "" +"使用无 Docker 模式运行 ``pyodps-pack``。当依赖中存在二进制依赖,可能报错" +"或导致包不可用。" msgstr "" "Use non-Docker mode to run ``pyodps-pack``. You might receive errors or " "get malfunctioning packages with this mode when there are binary " "dependencies." #: ../../source/pyodps-pack.rst:371 -msgid "``--without-merge``" +msgid "``--no-merge``" msgstr "" #: ../../source/pyodps-pack.rst:373 @@ -793,80 +840,101 @@ msgid "``--skip-scan-pkg-resources``" msgstr "" #: ../../source/pyodps-pack.rst:377 -msgid "在打包过程中不在包中扫描和解决 ``pkg_resources`` 的依赖,当依赖项较多时可加快打包速度。" +msgid "" +"在打包过程中不在包中扫描和解决 ``pkg_resources`` 的依赖,当依赖项较多时可" +"加快打包速度。" msgstr "" "Skip scanning and resolving dependencies for ``pkg_resources`` in the " "package. Once configured, may save time when there are a large number of " "dependencies." 
#: ../../source/pyodps-pack.rst:379 -msgid "``--debug``" +msgid "``--find-vcs-root``" msgstr "" #: ../../source/pyodps-pack.rst:381 +msgid "" +"当需要打包的本地代码需要依赖 Git 等版本管理工具上的 Tag 作为版本信息(" +"例如使用 ``setuptools_scm`` 管理版本号)\\ 且 Python 包根目录与代码根目录" +"不一致时,该选项能自动向上找到版本管理工具中代码的根目录。" +msgstr "" +"When local code to pack relies on tags of some VCS like Git to provide " +"package version info, for instance, ``setuptools_scm`` is used, and the " +"root directory of Python package is not the root directory of source code" +" in the VCS, this option can be added to help ``pyodps-pack`` seek and " +"use the root directory in VCS." + +#: ../../source/pyodps-pack.rst:384 +msgid "``--debug``" +msgstr "" + +#: ../../source/pyodps-pack.rst:386 msgid "指定后,将输出命令运行的详细信息,用于排查问题。" msgstr "" "If specified, will output details when executing the command. This " "argument is for debug purpose." -#: ../../source/pyodps-pack.rst:383 +#: ../../source/pyodps-pack.rst:388 msgid "除此之外,还有若干环境变量可供配置:" msgstr "You can also specify environment variables to control the build." -#: ../../source/pyodps-pack.rst:385 +#: ../../source/pyodps-pack.rst:390 msgid "``DOCKER_PATH=\"path to docker installation\"``" msgstr "" -#: ../../source/pyodps-pack.rst:387 +#: ../../source/pyodps-pack.rst:392 msgid "指定 Docker 可执行文件路径,路径下需要包括 ``docker`` 可执行文件。" msgstr "" "Specify path to executable files of Docker, which should contain " "``docker`` executable." -#: ../../source/pyodps-pack.rst:389 +#: ../../source/pyodps-pack.rst:394 msgid "``BEFORE_BUILD=\"command before build\"``" msgstr "" -#: ../../source/pyodps-pack.rst:391 +#: ../../source/pyodps-pack.rst:396 msgid "指定打包前需要执行的命令。" msgstr "Specify commands to run before build." -#: ../../source/pyodps-pack.rst:393 +#: ../../source/pyodps-pack.rst:398 msgid "``AFTER_BUILD=\"command after build\"``" msgstr "" -#: ../../source/pyodps-pack.rst:395 +#: ../../source/pyodps-pack.rst:400 msgid "指定编译后生成 Tar 包前需要执行的命令。" msgstr "Specify commands to run after tar packages are created." -#: ../../source/pyodps-pack.rst:397 +#: ../../source/pyodps-pack.rst:402 msgid "``DOCKER_IMAGE=\"quay.io/pypa/manylinux2010_x86_64\"``" msgstr "" -#: ../../source/pyodps-pack.rst:399 -msgid "自定义需要使用的 Docker Image。建议基于 ``pypa/manylinux`` 系列镜像定制自定义打包用 Docker Image。" +#: ../../source/pyodps-pack.rst:404 +msgid "" +"自定义需要使用的 Docker Image。建议基于 ``pypa/manylinux`` 系列镜像定制" +"自定义打包用 Docker Image。" msgstr "" "Customize Docker Image to use. It is recommended to build Docker image " "based on ``pypa/manylinux`` images." -#: ../../source/pyodps-pack.rst:402 +#: ../../source/pyodps-pack.rst:407 msgid "使用三方包" msgstr "Use third-party libraries" -#: ../../source/pyodps-pack.rst:405 +#: ../../source/pyodps-pack.rst:410 msgid "上传三方包" msgstr "Upload third-party libraries" -#: ../../source/pyodps-pack.rst:406 +#: ../../source/pyodps-pack.rst:411 msgid "" -"使用三方包前,请确保你生成的包被上传到 MaxCompute Archive 资源。可以使用下面的代码上传资源。 需要注意的是,你需要将 " -"packages.tar.gz 替换成你刚生成的包所在的路径和文件名:" +"使用三方包前,请确保你生成的包被上传到 MaxCompute Archive 资源。可以使用" +"下面的代码上传资源。 需要注意的是,你需要将 packages.tar.gz 替换成你刚" +"生成的包所在的路径和文件名:" msgstr "" "Please make sure your packages are uploaded as MaxCompute resources with " "archive type. To upload resources, you may use code below. Note that you " "need to change ``packages.tar.gz`` into the path to your package." 
-#: ../../source/pyodps-pack.rst:409 +#: ../../source/pyodps-pack.rst:414 msgid "" "import os\n" "from odps import ODPS\n" @@ -901,97 +969,104 @@ msgstr "" "o.create_resource(\"test_packed.tar.gz\", \"archive\", " "fileobj=open(\"packages.tar.gz\", \"rb\"))" -#: ../../source/pyodps-pack.rst:425 +#: ../../source/pyodps-pack.rst:430 msgid "也可以使用 DataWorks 上传。具体步骤为:" msgstr "You can also try uploading packages with DataWorks following steps below." -#: ../../source/pyodps-pack.rst:427 +#: ../../source/pyodps-pack.rst:432 msgid "进入数据开发页面。" msgstr "Go to the DataStudio page." -#: ../../source/pyodps-pack.rst:429 +#: ../../source/pyodps-pack.rst:434 msgid "登录 DataWorks 控制台。" msgstr "Log on to the DataWorks console." -#: ../../source/pyodps-pack.rst:430 +#: ../../source/pyodps-pack.rst:435 msgid "在左侧导航栏,单击工作空间列表。" msgstr "In the top navigation bar, click list of regions." -#: ../../source/pyodps-pack.rst:431 +#: ../../source/pyodps-pack.rst:436 msgid "选择工作空间所在地域后,单击相应工作空间后的进入数据开发。" msgstr "" "Select the region where your workspace resides, find the workspace, and " "then click Data Analytics in the Actions column." -#: ../../source/pyodps-pack.rst:433 +#: ../../source/pyodps-pack.rst:438 msgid "鼠标悬停至新建图标,单击MaxCompute \\> 资源 \\> Archive" msgstr "" "On the Data Analytics tab, move the pointer over the Create icon and " "choose MaxCompute \\> Resource \\> Python." -#: ../../source/pyodps-pack.rst:435 -msgid "也可以展开业务流程目录下的目标业务流程,右键单击 MaxCompute,选择新建 \\> 资源 \\> Archive" +#: ../../source/pyodps-pack.rst:440 +msgid "" +"也可以展开业务流程目录下的目标业务流程,右键单击 MaxCompute,选择新建 \\>" +" 资源 \\> Archive" msgstr "" "Alternatively, you can click the required workflow in the Business Flow " "section, right-click MaxCompute, and then choose Create \\> Resource \\> " "Python." -#: ../../source/pyodps-pack.rst:437 +#: ../../source/pyodps-pack.rst:442 msgid "在新建资源对话框中,输入资源名称,并选择目标文件夹。" msgstr "" "In the Create Resource dialog box, set the Resource Name and Location " "parameters." -#: ../../source/pyodps-pack.rst:438 +#: ../../source/pyodps-pack.rst:443 msgid "单击点击上传,选择相应的文件进行上传。" msgstr "Click Upload and select the file that you want to upload." -#: ../../source/pyodps-pack.rst:439 +#: ../../source/pyodps-pack.rst:444 msgid "单击确定。" msgstr "Click Create." -#: ../../source/pyodps-pack.rst:440 +#: ../../source/pyodps-pack.rst:445 msgid "单击工具栏中的提交图标,提交资源至调度开发服务器端。" msgstr "" "Click the Submit icon icon in the top toolbar to commit the resource to " "the development environment." -#: ../../source/pyodps-pack.rst:442 -msgid "更详细的细节请参考 `这篇文章 `_ 。" +#: ../../source/pyodps-pack.rst:447 +msgid "" +"更详细的细节请参考 `这篇文章 `_ 。" msgstr "" "More details can be seen in `this article " "`_." -#: ../../source/pyodps-pack.rst:447 +#: ../../source/pyodps-pack.rst:452 msgid "在 Python UDF 中使用三方包" msgstr "Use third-party libraries in Python UDFs" -#: ../../source/pyodps-pack.rst:448 +#: ../../source/pyodps-pack.rst:453 msgid "" -"你需要对你的 UDF 进行修改以使用上传的三方包。具体地,你需要在 UDF 类的 ``__init__`` 方法中添加对三方包的引用, " -"然后再在UDF代码中(例如 evaluate / process 方法)调用三方包。" +"你需要对你的 UDF 进行修改以使用上传的三方包。具体地,你需要在 UDF 类的 ``" +"__init__`` 方法中添加对三方包的引用, 然后再在UDF代码中(例如 evaluate / " +"process 方法)调用三方包。" msgstr "" "You need to modify your UDF code to use uploaded packages. You need to " "add references to your packages in ``__init__`` method of your UDF class," " and use these packages in your UDF code, for instance, evaluate or " "process methods." 
-#: ../../source/pyodps-pack.rst:451 -msgid "我们以实现 scipy 中的 psi 函数为例展示如何在 Python UDF 中使用三方包。首先使用下面的命令打包:" +#: ../../source/pyodps-pack.rst:456 +msgid "" +"我们以实现 scipy 中的 psi 函数为例展示如何在 Python UDF 中使用三方包。" +"首先使用下面的命令打包:" msgstr "" "We take psi function in scipy for example to show how to use third-party " "libraries in Python UDF. First, pack dependencies use commands below:" -#: ../../source/pyodps-pack.rst:453 ../../source/pyodps-pack.rst:544 +#: ../../source/pyodps-pack.rst:458 ../../source/pyodps-pack.rst:549 msgid "pyodps-pack -o scipy-bundle.tar.gz scipy" msgstr "" -#: ../../source/pyodps-pack.rst:457 +#: ../../source/pyodps-pack.rst:462 msgid "随后编写下面的代码,并保存为 ``test_psi_udf.py``:" msgstr "Then write code below and store as ``test_psi_udf.py``." -#: ../../source/pyodps-pack.rst:459 +#: ../../source/pyodps-pack.rst:464 msgid "" "import sys\n" "from odps.udf import annotate\n" @@ -1029,26 +1104,28 @@ msgstr "" "\n" " return float(psi(arg0))" -#: ../../source/pyodps-pack.rst:479 +#: ../../source/pyodps-pack.rst:484 msgid "对上面的代码做一些解释。" msgstr "We give some explanations to code above." -#: ../../source/pyodps-pack.rst:481 +#: ../../source/pyodps-pack.rst:486 msgid "" -"当依赖中包含 protobuf 时,需要为 ``__init__`` 函数增加 ``sys.setdlopenflags(10)`` ( " -"``pyodps-pack`` 打包过程中会提示),该设置可以避免三方包和 MaxCompute 间相关的版本冲突。" +"当依赖中包含 protobuf 时,需要为 ``__init__`` 函数增加 ``sys." +"setdlopenflags(10)`` ( ``pyodps-pack`` 打包过程中会提示),该设置可以" +"避免三方包和 MaxCompute 间相关的版本冲突。" msgstr "" "When protobuf is a dependency, you need to add ``sys.setdlopenflags(10)``" " to ``__init__`` function. ``pyodps-pack`` will notify you when you need " "to do this. Adding this line will avoid conflict between different " "versions of binaries of your libraries and MaxCompute itself." -#: ../../source/pyodps-pack.rst:484 +#: ../../source/pyodps-pack.rst:489 msgid "" -"``__init__`` 函数中将 ``work/scipy-bundle.tar.gz/packages`` 添加到 ``sys.path``," -" 因为 MaxCompute 会将所有 UDF 引用的 Archive 资源以资源名称为目录解压到 ``work`` 目录下,而 " -"``packages`` 则是 ``pyodps-pack`` 生成包的子目录。如果你需要通过 ``LoadLibrary`` 引入 " -"``--dynlib`` 参数引入的动态链接库,也可以在此处引用。" +"``__init__`` 函数中将 ``work/scipy-bundle.tar.gz/packages`` 添加到 ``sys." +"path``, 因为 MaxCompute 会将所有 UDF 引用的 Archive 资源以资源名称为目录" +"解压到 ``work`` 目录下,而 ``packages`` 则是 ``pyodps-pack`` 生成包的" +"子目录。如果你需要通过 ``LoadLibrary`` 引入 ``--dynlib`` 参数引入的" +"动态链接库,也可以在此处引用。" msgstr "" "In ``__init__`` method, ``work/scipy-bundle.tar.gz/packages`` is inserted" " into ``sys.path``, as MaxCompute will extract all archive resources the " @@ -1057,10 +1134,11 @@ msgstr "" "If you need to load dynamically-linked libraries packed with ``--dynlib``" " with ``LoadLibrary``, code can also be added here." -#: ../../source/pyodps-pack.rst:489 +#: ../../source/pyodps-pack.rst:494 msgid "" -"将对 scipy 的 import 放在 evaluate 函数体内部的原因是三方包仅在执行时可用,当 UDF 在 MaxCompute " -"服务端被解析时,解析环境不包含三方包,函数体外的三方包 import 会导致报错。" +"将对 scipy 的 import 放在 evaluate 函数体内部的原因是三方包仅在执行时可用" +",当 UDF 在 MaxCompute 服务端被解析时,解析环境不包含三方包,函数体外的" +"三方包 import 会导致报错。" msgstr "" "The reason of putting import statement of scipy inside the method body of" " the function evaluate is that third-party libraries are only available " @@ -1068,22 +1146,22 @@ msgstr "" "MaxCompute service, there is no packages for use and import statements of" " these packages outside method bodies will cause errors." 
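As mentioned above, a shared library packed with ``--dynlib`` can be pre-loaded in the same ``__init__`` method before the modules that rely on it are imported. A sketch of that variant — ``scipy-bundle.tar.gz`` follows the example above, while ``libfoo.so`` is purely a placeholder for whatever library the bundle actually carries:

.. code-block:: python

    import sys
    from odps.udf import annotate


    @annotate("double->double")
    class MyPsiWithDynlib(object):
        def __init__(self):
            # archive resources referenced by the UDF are extracted under
            # work/<resource name>, so add its packages directory first
            sys.path.insert(0, "work/scipy-bundle.tar.gz/packages")

            # libraries packed with --dynlib end up under packages/dynlibs;
            # libfoo.so is a placeholder name
            import ctypes
            ctypes.cdll.LoadLibrary(
                "work/scipy-bundle.tar.gz/packages/dynlibs/libfoo.so"
            )

        def evaluate(self, arg0):
            # keep third-party imports inside evaluate, as explained above
            from scipy.special import psi
            return float(psi(arg0))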
-#: ../../source/pyodps-pack.rst:492 +#: ../../source/pyodps-pack.rst:497 msgid "" -"随后需要将 ``test_psi_udf.py`` 上传为 MaxCompute Python 资源,以及将 ``scipy-" -"bundle.tar.gz`` 上传为 Archive 资源。此后,创建 UDF 名为 " -"``test_psi_udf``,引用上面两个资源文件,并指定类名为 ``test_psi_udf.MyPsi``。" +"随后需要将 ``test_psi_udf.py`` 上传为 MaxCompute Python 资源,以及将 ``" +"scipy-bundle.tar.gz`` 上传为 Archive 资源。此后,创建 UDF 名为 ``test_psi" +"_udf``,引用上面两个资源文件,并指定类名为 ``test_psi_udf.MyPsi``。" msgstr "" "Then you need to upload ``test_psi_udf.py`` as MaxCompute Python resource" " and ``scipy-bundle.tar.gz`` as archive resource. After that, you need to" " create a Python UDF named as ``test_psi_udf``, reference two resource " "files and specify class name as ``test_psi_udf.MyPsi``." -#: ../../source/pyodps-pack.rst:495 +#: ../../source/pyodps-pack.rst:500 msgid "利用 PyODPS 完成上述步骤的代码为" msgstr "Code to accomplish above steps with PyODPS is shown below." -#: ../../source/pyodps-pack.rst:497 +#: ../../source/pyodps-pack.rst:502 msgid "" "import os\n" "from odps import ODPS\n" @@ -1138,13 +1216,13 @@ msgstr "" "resources=[bundle_res, udf_res]\n" ")" -#: ../../source/pyodps-pack.rst:521 +#: ../../source/pyodps-pack.rst:526 msgid "使用 MaxCompute Console 上传的方法为" msgstr "" "If you want to use MaxCompute Console to accomplish these steps, you may " "type commands below." -#: ../../source/pyodps-pack.rst:523 +#: ../../source/pyodps-pack.rst:528 msgid "" "add archive scipy-bundle.tar.gz;\n" "add py test_psi_udf.py;\n" @@ -1152,53 +1230,53 @@ msgid "" ",scipy-bundle.tar.gz;" msgstr "" -#: ../../source/pyodps-pack.rst:529 +#: ../../source/pyodps-pack.rst:534 msgid "完成上述步骤后,即可使用 UDF 执行 SQL:" msgstr "After that, you can call the UDF you just created with SQL." -#: ../../source/pyodps-pack.rst:531 +#: ../../source/pyodps-pack.rst:536 msgid "" "set odps.pypy.enabled=false;\n" "set odps.isolation.session.enable=true;\n" "select test_psi_udf(sepal_length) from iris;" msgstr "" -#: ../../source/pyodps-pack.rst:538 +#: ../../source/pyodps-pack.rst:543 msgid "在 PyODPS DataFrame 中使用三方包" msgstr "Use third-party libraries in PyODPS DataFrame" -#: ../../source/pyodps-pack.rst:539 +#: ../../source/pyodps-pack.rst:544 msgid "" -"PyODPS DataFrame 支持在 execute / persist 时使用 libraries 参数使用上面的第三方库。 下面以 map" -" 方法为例,apply / map_reduce 方法的过程类似。" +"PyODPS DataFrame 支持在 execute / persist 时使用 libraries 参数使用上面的" +"第三方库。 下面以 map 方法为例,apply / map_reduce 方法的过程类似。" msgstr "" "PyODPS DataFrame supports using third-party libraries created above by " "adding a ``libraries`` argument when calling methods like execute or " "persist. We take map method for example, the same procedure can be used " "for apply or map_reduce method." -#: ../../source/pyodps-pack.rst:542 +#: ../../source/pyodps-pack.rst:547 msgid "首先,用下面的命令打包 scipy:" msgstr "First, create a package for scipy with command below." -#: ../../source/pyodps-pack.rst:548 +#: ../../source/pyodps-pack.rst:553 msgid "假定我们的表名为 ``test_float_col`` ,内容只包含一列 float 值:" msgstr "" "Assuming that the table is named as ``test_float_col`` and it only " "contains one column with float value." -#: ../../source/pyodps-pack.rst:552 +#: ../../source/pyodps-pack.rst:557 msgid "" " col1\n" "0 3.75\n" "1 2.51" msgstr "" -#: ../../source/pyodps-pack.rst:556 +#: ../../source/pyodps-pack.rst:561 msgid "计算 psi(col1) 的值,可以编写下面的代码:" msgstr "Write code below to compute value of psi(col1)." 
-#: ../../source/pyodps-pack.rst:558 +#: ../../source/pyodps-pack.rst:563 msgid "" "import os\n" "from odps import ODPS, options\n" @@ -1256,53 +1334,58 @@ msgstr "" "df.col1.map(psi).persist(\"result_table\", libraries=[\"scipy-" "bundle.tar.gz\"])" -#: ../../source/pyodps-pack.rst:586 +#: ../../source/pyodps-pack.rst:591 msgid "如果希望在整个代码执行过程中使用相同的三方包,可以设置全局选项:" msgstr "" "If you want to use the same third-party packages, you can configure these" " packages as global:" -#: ../../source/pyodps-pack.rst:588 +#: ../../source/pyodps-pack.rst:593 msgid "" "from odps import options\n" "options.df.libraries = [\"scipy-bundle.tar.gz\"]" msgstr "" -#: ../../source/pyodps-pack.rst:593 +#: ../../source/pyodps-pack.rst:598 msgid "此后即可在 DataFrame 执行时用到相关的三方包。" msgstr "" "After that, you can use these third-party libraries when DataFrames are " "being executed." -#: ../../source/pyodps-pack.rst:596 +#: ../../source/pyodps-pack.rst:601 msgid "在 DataWorks 中使用三方包" msgstr "Use third-party libraries in DataWorks" -#: ../../source/pyodps-pack.rst:597 +#: ../../source/pyodps-pack.rst:602 msgid "" -"DataWorks PyODPS 节点预装了若干三方包,同时提供了 ``load_resource_package`` 方法用以引用其他的包, " -"具体使用方式可参考 :ref:`这里 `。" +"DataWorks PyODPS 节点预装了若干三方包,同时提供了 ``load_resource_package" +"`` 方法用以引用其他的包, 具体使用方式可参考 :ref:`这里 " +"`。" msgstr "" "PyODPS nodes in DataWorks already installed several third-party libraries" " beforehand. ``load_resource_package`` method is also provided to load " "packages not preinstalled. Details of usage can be seen :ref:`here " "`." -#: ../../source/pyodps-pack.rst:601 +#: ../../source/pyodps-pack.rst:606 msgid "手动上传和使用三方包" msgstr "Upload and use third-party libraries manually" -#: ../../source/pyodps-pack.rst:604 -msgid "以下内容仅作为维护旧项目或者旧环境的参考,新项目建议直接使用 ``pyodps-pack`` 打包。" +#: ../../source/pyodps-pack.rst:609 +msgid "" +"以下内容仅作为维护旧项目或者旧环境的参考,新项目建议直接使用 ``pyodps-" +"pack`` 打包。" msgstr "" "Documents below is only a reference for maintenance of legacy projects or" " projects in legacy environments. For new projects please use ``pyodps-" "pack`` straightforwardly." -#: ../../source/pyodps-pack.rst:606 +#: ../../source/pyodps-pack.rst:611 msgid "" -"部分旧项目可能使用了之前的方式使用三方包,即手动上传所有依赖的 Wheel 包并在代码中引用,或者使用了不支持二进制包的旧版 MaxCompute" -" 环境,本章节为这部分场景准备。下面以在 map 中使用 python_dateutil 为例说明使用三方包的步骤。" +"部分旧项目可能使用了之前的方式使用三方包,即手动上传所有依赖的 Wheel 包并" +"在代码中引用,或者使用了不支持二进制包的旧版 MaxCompute 环境,本章节为" +"这部分场景准备。下面以在 map 中使用 python_dateutil 为例说明使用三方包的" +"步骤。" msgstr "" "Some legacy projects might use old-style method to deploy and use third-" "party libraries, i.e., manually upload all dependant wheel packages and " @@ -1311,11 +1394,12 @@ msgstr "" "chapter is written for these scenarios. Take the following python-" "dateutil package as an example." -#: ../../source/pyodps-pack.rst:609 +#: ../../source/pyodps-pack.rst:614 msgid "" -"首先,我们可以在 Linux bash 中使用 ``pip download`` 命令,下载包以及其依赖到某个路径。 " -"这里下载后会出现两个包:six-1.10.0-py2.py3-none-any.whl和python_dateutil-2.5.3-py2.py3" -"-none-any.whl (这里注意需要下载支持 Linux 环境的包,建议直接在 Linux 下调用该命令。)" +"首先,我们可以在 Linux bash 中使用 ``pip download`` 命令,下载包以及其" +"依赖到某个路径。 这里下载后会出现两个包:six-1.10.0-py2.py3-none-any.whl" +"和python_dateutil-2.5.3-py2.py3-none-any.whl (这里注意需要下载支持 Linux" +" 环境的包,建议直接在 Linux 下调用该命令。)" msgstr "" "First, you can use the pip download command to download the package and " "its dependencies to a specific path. Two packages are downloaded: " @@ -1323,15 +1407,15 @@ msgstr "" "any.whl. Note that the packages must support Linux environment. It is " "recommended to call this command under Linux." 
-#: ../../source/pyodps-pack.rst:613 +#: ../../source/pyodps-pack.rst:618 msgid "pip download python-dateutil -d /to/path/" msgstr "" -#: ../../source/pyodps-pack.rst:617 +#: ../../source/pyodps-pack.rst:622 msgid "然后我们分别把两个文件上传到ODPS资源" msgstr "Then upload the files to MaxCompute as resources." -#: ../../source/pyodps-pack.rst:619 +#: ../../source/pyodps-pack.rst:624 msgid "" "# 这里要确保资源名的后缀是正确的文件类型\n" "odps.create_resource('six.whl', 'file', file_obj=open('six-1.10.0-py2.py3" @@ -1345,11 +1429,11 @@ msgstr "" ">>> odps.create_resource('python_dateutil.whl', 'file', " "file_obj=open('python_dateutil-2.5.3-py2.py3-none-any.whl', 'rb'))" -#: ../../source/pyodps-pack.rst:625 +#: ../../source/pyodps-pack.rst:630 msgid "现在我们有个DataFrame,只有一个string类型字段。" msgstr "Now you have a DataFrame object that only contains a string field." -#: ../../source/pyodps-pack.rst:627 +#: ../../source/pyodps-pack.rst:632 msgid "" ">>> df\n" " datestr\n" @@ -1357,11 +1441,11 @@ msgid "" "1 2015-08-26 14:03:29" msgstr "" -#: ../../source/pyodps-pack.rst:634 +#: ../../source/pyodps-pack.rst:639 msgid "全局配置使用到的三方库:" msgstr "Set the third-party library as global:" -#: ../../source/pyodps-pack.rst:636 +#: ../../source/pyodps-pack.rst:641 msgid "" ">>> from odps import options\n" ">>>\n" @@ -1376,11 +1460,11 @@ msgid "" "1 2015" msgstr "" -#: ../../source/pyodps-pack.rst:650 +#: ../../source/pyodps-pack.rst:655 msgid "或者,通过立即运行方法的 ``libraries`` 参数指定:" msgstr "Or use the ``libraries`` attribute of an action to specify the package:" -#: ../../source/pyodps-pack.rst:653 +#: ../../source/pyodps-pack.rst:658 msgid "" ">>> def get_year(t):\n" ">>> from dateutil.parser import parse\n" @@ -1393,10 +1477,11 @@ msgid "" "1 2015" msgstr "" -#: ../../source/pyodps-pack.rst:664 +#: ../../source/pyodps-pack.rst:669 msgid "" -"PyODPS 默认支持执行纯 Python 且不含文件操作的第三方库。在较新版本的 MaxCompute 服务下,PyODPS " -"也支持执行带有二进制代码或带有文件操作的 Python 库。这些库名必须拥有一定的后缀,可根据下表判断" +"PyODPS 默认支持执行纯 Python 且不含文件操作的第三方库。在较新版本的 " +"MaxCompute 服务下,PyODPS 也支持执行带有二进制代码或带有文件操作的 Python" +" 库。这些库名必须拥有一定的后缀,可根据下表判断" msgstr "" "By default, PyODPS supports third-party libraries that contain pure " "Python code but no file operations. In newer versions of MaxCompute, " @@ -1404,68 +1489,68 @@ msgstr "" "operations. These libraries must be suffixed with certain strings, which " "can be looked up in the table below." 
-#: ../../source/pyodps-pack.rst:668 +#: ../../source/pyodps-pack.rst:673 msgid "平台" msgstr "Platform" -#: ../../source/pyodps-pack.rst:668 +#: ../../source/pyodps-pack.rst:673 msgid "Python 版本" msgstr "Python version" -#: ../../source/pyodps-pack.rst:668 +#: ../../source/pyodps-pack.rst:673 msgid "可用的后缀" msgstr "Suffixes available" -#: ../../source/pyodps-pack.rst:670 ../../source/pyodps-pack.rst:671 +#: ../../source/pyodps-pack.rst:675 ../../source/pyodps-pack.rst:676 msgid "RHEL 5 x86\\_64" msgstr "" -#: ../../source/pyodps-pack.rst:670 ../../source/pyodps-pack.rst:672 +#: ../../source/pyodps-pack.rst:675 ../../source/pyodps-pack.rst:677 msgid "Python 2.7" msgstr "" -#: ../../source/pyodps-pack.rst:670 +#: ../../source/pyodps-pack.rst:675 msgid "cp27-cp27m-manylinux1_x86_64" msgstr "" -#: ../../source/pyodps-pack.rst:671 ../../source/pyodps-pack.rst:673 -#: ../../source/pyodps-pack.rst:674 +#: ../../source/pyodps-pack.rst:676 ../../source/pyodps-pack.rst:678 +#: ../../source/pyodps-pack.rst:679 msgid "Python 3.7" msgstr "" -#: ../../source/pyodps-pack.rst:671 +#: ../../source/pyodps-pack.rst:676 msgid "cp37-cp37m-manylinux1_x86_64" msgstr "" -#: ../../source/pyodps-pack.rst:672 ../../source/pyodps-pack.rst:673 +#: ../../source/pyodps-pack.rst:677 ../../source/pyodps-pack.rst:678 msgid "RHEL 7 x86\\_64" msgstr "" -#: ../../source/pyodps-pack.rst:672 +#: ../../source/pyodps-pack.rst:677 msgid "" "cp27-cp27m-manylinux1_x86_64, cp27-cp27m-manylinux2010_x86_64, cp27" "-cp27m-manylinux2014_x86_64" msgstr "" -#: ../../source/pyodps-pack.rst:673 +#: ../../source/pyodps-pack.rst:678 msgid "" "cp37-cp37m-manylinux1_x86_64, cp37-cp37m-manylinux2010_x86_64, cp37" "-cp37m-manylinux2014_x86_64" msgstr "" -#: ../../source/pyodps-pack.rst:674 +#: ../../source/pyodps-pack.rst:679 msgid "RHEL 7 ARM64" msgstr "" -#: ../../source/pyodps-pack.rst:674 +#: ../../source/pyodps-pack.rst:679 msgid "cp37-cp37m-manylinux2014_aarch64" msgstr "" -#: ../../source/pyodps-pack.rst:677 +#: ../../source/pyodps-pack.rst:682 msgid "" -"所有的 whl 包都需要以 archive 格式上传,whl 后缀的包需要重命名为 zip。同时,作业需要开启 " -"``odps.isolation.session.enable`` 选项,或者在 Project 级别开启 " -"Isolation。下面的例子展示了如何上传并使用 scipy 中的特殊函数:" +"所有的 whl 包都需要以 archive 格式上传,whl 后缀的包需要重命名为 zip。" +"同时,作业需要开启 ``odps.isolation.session.enable`` 选项,或者在 Project" +" 级别开启 Isolation。下面的例子展示了如何上传并使用 scipy 中的特殊函数:" msgstr "" "All .whl packages need to be uploaded in the archive format, while .whl " "packages must be renamed to .zip files. You also need to enable the " @@ -1473,9 +1558,10 @@ msgstr "" "project. 
The following example demonstrates how to upload and use the " "special functions in scipy:" -#: ../../source/pyodps-pack.rst:681 +#: ../../source/pyodps-pack.rst:686 msgid "" -"# 对于含有二进制代码的包,必须使用 Archive 方式上传资源,whl 后缀需要改为 zip\n" +"# 对于含有二进制代码的包,必须使用 Archive 方式上传资源,whl 后缀需要改为" +" zip\n" "odps.create_resource('scipy.zip', 'archive', " "file_obj=open('scipy-0.19.0-cp27-cp27m-manylinux1_x86_64.whl', 'rb'))\n" "\n" @@ -1483,7 +1569,8 @@ msgid "" "options.sql.settings = { 'odps.isolation.session.enable': True }\n" "\n" "def psi(value):\n" -" # 建议在函数内部 import 第三方库,以防止不同操作系统下二进制包结构差异造成执行错误\n" +" # 建议在函数内部 import 第三方库,以防止不同操作系统下二进制包结构" +"差异造成执行错误\n" " from scipy.special import psi\n" " return float(psi(value))\n" "\n" @@ -1509,16 +1596,16 @@ msgstr "" ">>>\n" ">>> df.float_col.map(psi).execute(libraries=['scipy.zip'])" -#: ../../source/pyodps-pack.rst:697 +#: ../../source/pyodps-pack.rst:702 msgid "" -"对于只提供源码的二进制包,可以在 Linux Shell 中打包成 Wheel 再上传,Mac 和 Windows 中生成的 Wheel 包无法在" -" MaxCompute 中使用:" +"对于只提供源码的二进制包,可以在 Linux Shell 中打包成 Wheel 再上传,Mac " +"和 Windows 中生成的 Wheel 包无法在 MaxCompute 中使用:" msgstr "" "For binary packages that only contain source code, they can be packaged " "into .whl files and uploaded through Linux shell. .whl files generated in" " Mac and Windows are not usable in MaxCompute:" -#: ../../source/pyodps-pack.rst:700 +#: ../../source/pyodps-pack.rst:705 msgid "python setup.py bdist_wheel" msgstr "" diff --git a/docs/source/norm_zh.py b/docs/source/norm_zh.py new file mode 100644 index 00000000..231959e5 --- /dev/null +++ b/docs/source/norm_zh.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 1999-2024 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This file folds Chinese po files by hacking babel.messages.pofile.normalize +using jieba text segment library instead of regex +""" + +import datetime +import os + +from babel.messages import pofile +from babel.messages.pofile import escape + + +_original_normalize = pofile.normalize + + +def _zh_len(s): + """ + Calculate text length in Chinese + """ + try: + return len(s.encode("gb2312")) + except ValueError: + return len(s) + + +def _zh_split(s): + """ + Split text length in Chinese + """ + import jieba + + try: + s.encode("ascii") + has_zh = False + except ValueError: + has_zh = True + + if has_zh: + return list(jieba.cut(s)) + else: + return pofile.WORD_SEP.split(s) + + +# code modified from babel.messages.pofile (hash 359ecffca479dfe032d0f7210d5cd8160599c816) +def _normalize(string, prefix="", width=76): + r"""Convert a string into a format that is appropriate for .po files. + >>> print(normalize('''Say: + ... "hello, world!" + ... ''', width=None)) + "" + "Say:\n" + " \"hello, world!\"\n" + >>> print(normalize('''Say: + ... "Lorem ipsum dolor sit amet, consectetur adipisicing elit, " + ... 
''', width=32)) + "" + "Say:\n" + " \"Lorem ipsum dolor sit " + "amet, consectetur adipisicing" + " elit, \"\n" + :param string: the string to normalize + :param prefix: a string that should be prepended to every line + :param width: the maximum line width; use `None`, 0, or a negative number + to completely disable line wrapping + """ + + try: + string.encode("ascii") + except ValueError: + pass + else: + return _original_normalize(string, prefix=prefix, width=width) + + if width and width > 0: + prefixlen = _zh_len(prefix) + lines = [] + for line in string.splitlines(True): + if _zh_len(escape(line)) + prefixlen > width: + chunks = _zh_split(line) + chunks.reverse() + while chunks: + buf = [] + size = 2 + while chunks: + l = _zh_len(escape(chunks[-1])) - 2 + prefixlen # noqa: E741 + if size + l < width: + buf.append(chunks.pop()) + size += l + else: + if not buf: + # handle long chunks by putting them on a + # separate line + buf.append(chunks.pop()) + break + lines.append("".join(buf)) + else: + lines.append(line) + else: + lines = string.splitlines(True) + + if len(lines) <= 1: + return escape(string) + + # Remove empty trailing line + if lines and not lines[-1]: + del lines[-1] + lines[-1] += "\n" + return '""\n' + "\n".join([(prefix + escape(line)) for line in lines]) + + +def main(): + try: + import jieba # noqa: F401 + except ImportError: + return + + pofile.normalize = _normalize + for root, _dirs, files in os.walk("."): + if "en" not in root: + continue + for f in files: + if not f.endswith(".po"): + continue + path = os.path.join(root, f) + print(path) + + # only modify recent-changed files + modify_time = datetime.datetime.fromtimestamp(os.path.getmtime(path)) + if (datetime.datetime.now() - modify_time).total_seconds() > 120: + continue + + with open(path, "rb") as inpf: + catalog = pofile.read_po(inpf) + with open(path, "wb") as outf: + pofile.write_po(outf, catalog) + + +if __name__ == "__main__": + main() diff --git a/docs/source/options.rst b/docs/source/options.rst index 77a59c65..d9f2fd54 100644 --- a/docs/source/options.rst +++ b/docs/source/options.rst @@ -30,6 +30,7 @@ PyODPS 提供了一系列的配置选项,可通过 ``odps.options`` 获得, "default_project", "默认 Project", "None" "logview_host", "LogView 主机名", "None" "logview_hours", "LogView 保持时间(小时)", "24" + "quota_name", "提交任务时使用的计算 Quota 名称", "None" "local_timezone", "使用的时区,None 表示不处理,True 表示本地时区,False 表示 UTC,也可用 pytz 的时区", "None" "lifecycle", "所有表生命周期", "None" "verify_ssl", "验证服务端 SSL 证书", "True" @@ -50,7 +51,8 @@ PyODPS 提供了一系列的配置选项,可通过 ``odps.options`` 获得, "display.notebook_widget", "使用交互式插件", "True" "sql.settings", "ODPS SQL运行全局hints", "None" "sql.use_odps2_extension", "启用 MaxCompute 2.0 语言扩展", "None" - "sql.always_enable_schema", "在任何情形下启用 MaxCompute Schema", "None" + "sql.enable_schema", "在任何情形下启用 MaxCompute Schema", "None" + "pythonpack.settings", "PythonPack运行全局hints", "None" 数据上传/下载配置 ================== @@ -65,6 +67,7 @@ PyODPS 提供了一系列的配置选项,可通过 ``odps.options`` 获得, "tunnel.string_as_binary", "在 string 类型中使用 bytes 而非 unicode", "False" "tunnel.quota_name", "配置 Tunnel Quota 的名称", "False" "tunnel.block_buffer_size", "配置缓存 Block Writer 的缓存大小", "20 * 1024 ** 2" + "tunnel.tags", "配置使用 Tunnel 所需的标签", "None" DataFrame 配置 ================== diff --git a/docs/source/platform-d2.rst b/docs/source/platform-d2.rst index 48359666..74e52864 100644 --- a/docs/source/platform-d2.rst +++ b/docs/source/platform-d2.rst @@ -32,7 +32,7 @@ DataWorks 的 PyODPS 节点中,将会包含一个全局的变量 ``odps`` 或 执行SQL ========== -可以参考 :ref:`执行SQL文档 ` 。 +可以参考\ :ref:`执行SQL文档 `\ 。 .. 
note:: Dataworks 上默认没有打开 instance tunnel,即 instance.open_reader 默认走 result 接口(最多一万条)。 @@ -67,7 +67,7 @@ DataFrame 执行 -------- -在 DataWorks 的环境里, :ref:`DataFrame ` 的执行需要显式调用 :ref:`立即执行的方法(如execute,head等) ` 。 +在 DataWorks 的环境里,\ :ref:`DataFrame ` 的执行需要显式调用\ :ref:`立即执行的方法(如execute,head等) `\ 。 .. code-block:: python @@ -147,14 +147,17 @@ lz4 2.1.4 3.1.10 zstandard 0.14.1 0.17.0 ==================== ================== ================== -如果你需要使用上面列表中不存在的包,DataWorks 节点提供了 ``load_resource_package`` 方法,支持从 -MaxCompute 资源下载三方包。使用 ``pyodps-pack`` 打包后,可以直接使用 ``load_resource_package`` -方法加载三方包,此后就可以 import 包中的内容。关于 ``pyodps-pack`` 的文档可见 :ref:`制作和使用三方包 `。 +如果你需要使用上面列表中不存在的包,0.12.0 以上版本的 DataWorks PyODPS Python 3 节点提供了 ``resource_pack`` +注释,支持从 MaxCompute 资源下载三方包。使用 ``pyodps-pack`` 打包后,可以直接使用 ``resource_pack`` +注释加载三方包,此后就可以 import 包中的内容。关于 ``pyodps-pack`` 的文档可见\ :ref:`制作和使用三方包 `。 .. note:: 如果为 Python 2 节点打包,请在打包时为 ``pyodps-pack`` 增加 ``--dwpy27`` 参数。 + 建议使用 PyODPS 包版本至少为 0.11.3,否则部分生成的包可能无法正常加载。关于 PyODPS 包及节点执行组件的升级可参考\ + :ref:`这个章节 `。 + 例如,使用下面的命令打包 .. code-block:: bash @@ -162,11 +165,11 @@ MaxCompute 资源下载三方包。使用 ``pyodps-pack`` 打包后,可以直 pyodps-pack -o ipaddress-bundle.tar.gz ipaddress 并上传 / 提交 ``ipaddress-bundle.tar.gz`` 为资源后,可以在 PyODPS 3 节点中按照下面的方法使用 -ipaddress 包: +ipaddress 包(注意注释是必须的): .. code-block:: python - load_resource_package("ipaddress-bundle.tar.gz") + # -*- resource_pack: ipaddress-bundle.tar.gz import ipaddress DataWorks 限制下载的包总大小为 100MB。如果你需要跳过预装包的打包,可以在打包时使用 ``pyodps-pack`` 提供的 @@ -176,16 +179,23 @@ DataWorks 限制下载的包总大小为 100MB。如果你需要跳过预装包 pyodps-pack -o bundle.tar.gz --exclude numpy --exclude pandas -使用其他账号 -============ +你可以在 ``resource_pack`` 中通过逗号分割的方式引入多个包。 -.. note:: +对于 0.11.3 以上版本的 DataWorks PyODPS Python 3 节点,你也可以使用 ``pyodps-pack`` 打包,并在包加载前\ +使用 ``load_resource_package`` 方法引入三方包: - ``as_account`` 方法从 PyODPS 0.11.3 开始支持。如果你的 DataWorks 未部署该版本,则无法使用该方法。 - 如果你使用的是独享资源组,可以考虑升级资源组中的 PyODPS 版本,具体可见 `该文档 `_ 。 +.. code-block:: python + + load_resource_package('ipaddress-bundle.tar.gz') + import ipaddress -在某些情形下你可能希望使用其他账号(而非平台提供的账号)访问 MaxCompute。此时,可以使用 ODPS 入口对象的 ``as_account`` -方法创建一个使用新账号的入口对象,该对象与系统默认提供的 ``o`` 实例彼此独立。例如: +需要注意的是,如果你需要使用的三方包已经在预装三方包中,使用 ``load_resource_package`` 可能无法加载所需\ +的版本,此时建议使用 ``resource_pack`` 注释的方式。 + +使用其他账号 +============ +在某些情形下你可能希望使用其他账号(而非平台提供的账号)访问 MaxCompute。从 PyODPS 0.11.3 开始,可以使用 MaxCompute +入口对象的 ``as_account`` 方法创建一个使用新账号的入口对象,该对象与系统默认提供的 ``o`` 实例彼此独立。例如: .. code-block:: python @@ -200,7 +210,8 @@ DataWorks 限制下载的包总大小为 100MB。如果你需要跳过预装包 问题诊断 ========= -如果你的代码在执行中卡死且没有任何输出,你可以在代码头部增加以下注释,DataWorks 每隔 30 秒将输出所有线程的堆栈: +如果你的代码在执行中卡死且没有任何输出,你可以在代码头部增加以下注释,0.11.3 以上版本的 DataWorks +PyODPS Python 3 节点每隔 30 秒将输出所有线程的堆栈: .. code-block:: python @@ -214,7 +225,7 @@ DataWorks 限制下载的包总大小为 100MB。如果你需要跳过预装包 - DataFrame的plot函数 DataFrame 自定义函数需要提交到 MaxCompute 执行。由于 Python 沙箱的原因,第三方库只支持所有的纯 Python 库以及 Numpy, -因此不能直接使用 Pandas,可参考 :ref:`第三方库支持 ` 上传和使用所需的库。DataWorks +因此不能直接使用 Pandas,可参考\ :ref:`第三方库支持 `\ 上传和使用所需的库。DataWorks 中执行的非自定义函数代码可以使用平台预装的 Numpy 和 Pandas。其他带有二进制代码的三方包不被支持。 由于兼容性的原因,在 DataWorks 中,`options.tunnel.use_instance_tunnel` 默认设置为 False。如果需要全局开启 Instance Tunnel, @@ -229,4 +240,24 @@ DataFrame 自定义函数需要提交到 MaxCompute 执行。由于 Python 沙 如果看到 **Got killed** ,即内存使用超限,进程被 kill。因此,尽量避免本地的数据操作。 -通过 PyODPS 起的 SQL 和 DataFrame 任务(除 to_pandas) 不受此限制。 \ No newline at end of file +通过 PyODPS 起的 SQL 和 DataFrame 任务(除 to_pandas) 不受此限制。 + +.. 
_dw_upgrade: + +升级 +====== + +共享资源组中的 DataWorks PyODPS 节点执行组件及 PyODPS 包版本由阿里云维护,并会随着 PyODPS 更新而更新。\ +独享资源组中的节点执行组件及 PyODPS 包则可能在资源组生成时即固定下来。如果你需要使用更新版本 PyODPS 包中\ +提供的功能(通常指本文档以外的 API),可以参考\ `该文档 `_\ +自行升级所需的 PyODPS 版本。需要注意的是,下列功能由 PyODPS 节点执行组件而非 PyODPS 包本身提供。无法通过\ +自行升级进行安装: + +* 调度参数 +* 通过代码注释提供的能力,例如 ``dump_traceback`` 等 +* ``load_resource_package`` +* 错误信息自动提示 + +对于 0.11.5 及后续版本的 PyODPS 节点执行组件,当版本与 PyODPS 版本不一致时,会在执行时在日志中同时显示两个\ +版本号。阿里云将会不定期更新 PyODPS 节点执行组件,更新时间点相比共享资源组存在一定的延后。如你对节点执行组件有\ +更新需求,可以通过工单联系阿里云寻求升级支持。 diff --git a/docs/source/pyodps-pack.rst b/docs/source/pyodps-pack.rst index 71392c58..bd5ffba2 100644 --- a/docs/source/pyodps-pack.rst +++ b/docs/source/pyodps-pack.rst @@ -54,7 +54,7 @@ Docker Desktop / Rancher Desktop 已经安装了跨平台打包所需的 ``binfm 我们建议在打包时,尽量使用 Docker 模式。非 Docker 模式仅用于 Docker 不可用的场景,且生成的包有可能不可用。 -如果你安装 Docker 遇到困难,你可以尝试使用非 Docker 模式。使用方式为新增一个 ``--without-docker`` 参数。该模式需要你的 Python +如果你安装 Docker 遇到困难,你可以尝试使用非 Docker 模式。使用方式为新增一个 ``--no-docker`` 参数。该模式需要你的 Python 环境中已经安装 pip。如果使用该模式出现错误,请改用 Docker 模式。Windows 用户需要安装 Git bash 以使用该模式,Git bash 包含在 `Git for Windows `_ 中。 @@ -79,7 +79,7 @@ Docker Desktop / Rancher Desktop 已经安装了跨平台打包所需的 ``binfm .. code-block:: bash - pyodps-pack --without-docker pandas + pyodps-pack --no-docker pandas 需要指定版本时,可以使用 @@ -364,11 +364,11 @@ Docker Desktop / Rancher Desktop 已经安装了跨平台打包所需的 ``binfm 指定在执行 Docker 命令时需要额外附加的参数。如有多个参数需用引号包裹,例如 ``--docker-args "--ip 192.168.1.10"``。 -- ``--without-docker`` +- ``--no-docker`` 使用无 Docker 模式运行 ``pyodps-pack``。当依赖中存在二进制依赖,可能报错或导致包不可用。 -- ``--without-merge`` +- ``--no-merge`` 下载或生成 Wheel 包后不生成 ``.tar.gz`` 包而是保留 ``.whl`` 文件。 @@ -376,6 +376,11 @@ Docker Desktop / Rancher Desktop 已经安装了跨平台打包所需的 ``binfm 在打包过程中不在包中扫描和解决 ``pkg_resources`` 的依赖,当依赖项较多时可加快打包速度。 +- ``--find-vcs-root`` + + 当需要打包的本地代码需要依赖 Git 等版本管理工具上的 Tag 作为版本信息(例如使用 ``setuptools_scm`` 管理版本号)\ + 且 Python 包根目录与代码根目录不一致时,该选项能自动向上找到版本管理工具中代码的根目录。 + - ``--debug`` 指定后,将输出命令运行的详细信息,用于排查问题。 diff --git a/notebooks/nb_init.py b/notebooks/nb_init.py index 243c4f7f..f7856bda 100644 --- a/notebooks/nb_init.py +++ b/notebooks/nb_init.py @@ -18,6 +18,7 @@ exec_lines to help create pre-defined objects. """ import sys + from six.moves.configparser import ConfigParser sys.path.append('../') diff --git a/odps/__init__.py b/odps/__init__.py index 0672a7b1..67a8b9c2 100644 --- a/odps/__init__.py +++ b/odps/__init__.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,18 +13,21 @@ # limitations under the License. 
import sys + from ._version import __version__ -__all__ = ['ODPS', 'DataFrame', 'options'] +__all__ = ["ODPS", "DataFrame", "options"] if sys.version_info[0] == 2 and sys.version_info[:2] < (2, 7): - raise Exception('pyodps supports python 2.7+ (including python 3+).') + raise Exception("pyodps supports python 2.7+ (including python 3+).") from .config import options from .core import ODPS -from .df import DataFrame, Scalar, RandomScalar, NullScalar -from .inter import setup, enter, teardown, list_rooms -from .utils import show_versions, write_log as log +from .df import DataFrame, NullScalar, RandomScalar, Scalar +from .inter import enter, list_rooms, setup, teardown +from .utils import show_versions +from .utils import write_log as log + try: from .ipython import load_ipython_extension except ImportError: @@ -32,14 +35,14 @@ try: from sqlalchemy.dialects import registry - registry.register('odps', 'odps.sqlalchemy_odps', 'ODPSDialect') + registry.register("odps", "odps.sqlalchemy_odps", "ODPSDialect") except ImportError: pass def install_plugins(): try: - from .ml import install_plugin, install_mixin + from .ml import install_mixin, install_plugin except (ImportError, SyntaxError): pass diff --git a/odps/_version.py b/odps/_version.py index 8c5a80c4..c93f47c2 100644 --- a/odps/_version.py +++ b/odps/_version.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -version_info = (0, 11, 6, 5) -_num_index = max(idx if isinstance(v, int) else 0 - for idx, v in enumerate(version_info)) -__version__ = '.'.join(map(str, version_info[:_num_index + 1])) + \ - ''.join(version_info[_num_index + 1:]) +version_info = (0, 12, 0) +_num_index = max(idx if isinstance(v, int) else 0 for idx, v in enumerate(version_info)) +__version__ = ".".join(map(str, version_info[: _num_index + 1])) + "".join( + version_info[_num_index + 1 :] +) diff --git a/odps/accounts.py b/odps/accounts.py index af9d5b3d..8d0ef28b 100644 --- a/odps/accounts.py +++ b/odps/accounts.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -39,51 +39,53 @@ class BaseAccount(object): def _build_canonical_str(self, url_components, req): # Build signing string - lines = [req.method, ] + lines = [req.method] headers_to_sign = dict() canonical_resource = url_components.path params = dict() if url_components.query: - params_list = sorted(parse_qsl(url_components.query, True), - key=lambda it: it[0]) + params_list = sorted( + parse_qsl(url_components.query, True), key=lambda it: it[0] + ) assert len(params_list) == len(set(it[0] for it in params_list)) params = dict(params_list) - convert = lambda kv: kv if kv[1] != '' else (kv[0], ) - params_str = '&'.join(['='.join(convert(kv)) for kv in params_list]) + convert = lambda kv: kv if kv[1] != "" else (kv[0],) + params_str = "&".join(["=".join(convert(kv)) for kv in params_list]) - canonical_resource = '%s?%s' % (canonical_resource, params_str) + canonical_resource = "%s?%s" % (canonical_resource, params_str) headers = req.headers - logger.debug('headers before signing: %s', headers) + logger.debug("headers before signing: %s", headers) for k, v in six.iteritems(headers): k = k.lower() - if k in ('content-type', 'content-md5') or k.startswith('x-odps'): + if k in ("content-type", "content-md5") or k.startswith("x-odps"): headers_to_sign[k] = v - for k in ('content-type', 'content-md5'): + for k in ("content-type", "content-md5"): if k not in headers_to_sign: - headers_to_sign[k] = '' - date_str = headers.get('Date') + headers_to_sign[k] = "" + date_str = headers.get("Date") if not date_str: req_date = utils.formatdate(usegmt=True) - headers['Date'] = req_date + headers["Date"] = req_date date_str = req_date - headers_to_sign['date'] = date_str + headers_to_sign["date"] = date_str for param_key, param_value in six.iteritems(params): - if param_key.startswith('x-odps-'): + if param_key.startswith("x-odps-"): headers_to_sign[param_key] = param_value - headers_to_sign = OrderedDict([(k, headers_to_sign[k]) - for k in sorted(headers_to_sign)]) - logger.debug('headers to sign: %s', headers_to_sign) + headers_to_sign = OrderedDict( + [(k, headers_to_sign[k]) for k in sorted(headers_to_sign)] + ) + logger.debug("headers to sign: %s", headers_to_sign) for k, v in six.iteritems(headers_to_sign): - if k.startswith('x-odps-'): - lines.append('%s:%s' % (k, v)) + if k.startswith("x-odps-"): + lines.append("%s:%s" % (k, v)) else: lines.append(v) lines.append(canonical_resource) - return '\n'.join(lines) + return "\n".join(lines) def sign_request(self, req, endpoint, region_name=None): raise NotImplementedError @@ -93,6 +95,7 @@ class AliyunAccount(BaseAccount): """ Account of aliyun.com """ + def __init__(self, access_id, secret_access_key): self.access_id = access_id self.secret_access_key = secret_access_key @@ -105,56 +108,77 @@ def _get_v4_signature_key(self, date_str, region_name): k_secret = utils.to_binary("aliyun_v4" + self.secret_access_key) k_date = hmac.new(k_secret, utils.to_binary(date_str), hashlib.sha256).digest() - k_region = hmac.new(k_date, utils.to_binary(region_name), hashlib.sha256).digest() + k_region = hmac.new( + k_date, utils.to_binary(region_name), hashlib.sha256 + ).digest() k_service = hmac.new(k_region, b"odps", hashlib.sha256).digest() self._last_signature_date = date_str - self._last_signature_key = hmac.new(k_service, b"aliyun_v4_request", hashlib.sha256).digest() + self._last_signature_key = hmac.new( + k_service, b"aliyun_v4_request", hashlib.sha256 + ).digest() return self._last_signature_key def calc_auth_str(self, canonical_str, region_name=None): if 
region_name is None: # use legacy v2 sign - signature = base64.b64encode(hmac.new( - utils.to_binary(self.secret_access_key), utils.to_binary(canonical_str), - hashlib.sha1).digest()) - return 'ODPS %s:%s' % (self.access_id, utils.to_str(signature)) + signature = base64.b64encode( + hmac.new( + utils.to_binary(self.secret_access_key), + utils.to_binary(canonical_str), + hashlib.sha1, + ).digest() + ) + return "ODPS %s:%s" % (self.access_id, utils.to_str(signature)) else: # use v4 sign date_str = datetime.strftime(datetime.utcnow(), "%Y%m%d") - credential = "/".join([self.access_id, date_str, region_name, "odps/aliyun_v4_request"]) + credential = "/".join( + [self.access_id, date_str, region_name, "odps/aliyun_v4_request"] + ) sign_key = self._get_v4_signature_key(date_str, region_name) - signature = base64.b64encode(hmac.new(sign_key, utils.to_binary(canonical_str), hashlib.sha1).digest()) - return 'ODPS %s:%s' % (credential, utils.to_str(signature)) + signature = base64.b64encode( + hmac.new( + sign_key, utils.to_binary(canonical_str), hashlib.sha1 + ).digest() + ) + return "ODPS %s:%s" % (credential, utils.to_str(signature)) def sign_request(self, req, endpoint, region_name=None): - url = req.url[len(endpoint):] + url = req.url[len(endpoint) :] url_components = urlparse(unquote(url), allow_fragments=False) canonical_str = self._build_canonical_str(url_components, req) - logger.debug('canonical string: %s', canonical_str) + logger.debug("canonical string: %s", canonical_str) - req.headers['Authorization'] = self.calc_auth_str(canonical_str, region_name) - logger.debug('headers after signing: %r', req.headers) + req.headers["Authorization"] = self.calc_auth_str(canonical_str, region_name) + logger.debug("headers after signing: %r", req.headers) class AppAccount(BaseAccount): """ Account for applications. 
""" + def __init__(self, access_id, secret_access_key): self.access_id = access_id self.secret_access_key = secret_access_key def sign_request(self, req, endpoint, region_name=None): - auth_str = req.headers['Authorization'] - signature = base64.b64encode(hmac.new( - utils.to_binary(self.secret_access_key), utils.to_binary(auth_str), - hashlib.sha1).digest()) - app_auth_str = "account_provider:%s,signature_method:%s,access_id:%s,signature:%s" % ( - 'aliyun', 'hmac-sha1', self.access_id, utils.to_str(signature)) - req.headers['application-authentication'] = app_auth_str - logger.debug('headers after app signing: %r', req.headers) + auth_str = req.headers["Authorization"] + signature = base64.b64encode( + hmac.new( + utils.to_binary(self.secret_access_key), + utils.to_binary(auth_str), + hashlib.sha1, + ).digest() + ) + app_auth_str = ( + "account_provider:%s,signature_method:%s,access_id:%s,signature:%s" + % ("aliyun", "hmac-sha1", self.access_id, utils.to_str(signature)) + ) + req.headers["application-authentication"] = app_auth_str + logger.debug("headers after app signing: %r", req.headers) class SignServer(object): @@ -174,12 +198,14 @@ def do_POST(self): self.end_headers() def _do_POST(self): - ctype, pdict = cgi.parse_header(self.headers.get('content-type')) - if ctype == 'multipart/form-data': + ctype, pdict = cgi.parse_header(self.headers.get("content-type")) + if ctype == "multipart/form-data": postvars = cgi.parse_multipart(self.rfile, pdict) - elif ctype == 'application/x-www-form-urlencoded': - length = int(self.headers.get('content-length')) - postvars = six.moves.urllib.parse.parse_qs(self.rfile.read(length), keep_blank_values=1) + elif ctype == "application/x-www-form-urlencoded": + length = int(self.headers.get("content-length")) + postvars = six.moves.urllib.parse.parse_qs( + self.rfile.read(length), keep_blank_values=1 + ) else: self.send_response(400) self.end_headers() @@ -189,14 +215,14 @@ def _do_POST(self): def _sign(self, postvars): if self.server._token is not None: - auth = self.headers.get('Authorization') + auth = self.headers.get("Authorization") if not auth: self.send_response(401) self.end_headers() return - method, content = auth.split(' ', 1) + method, content = auth.split(" ", 1) method = method.lower() - if method == 'token': + if method == "token": if content != self.server._token: self.send_response(401) self.end_headers() @@ -206,13 +232,13 @@ def _sign(self, postvars): self.end_headers() return - assert len(postvars[b'access_id']) == 1 and len(postvars[b'canonical']) == 1 - access_id = utils.to_str(postvars[b'access_id'][0]) - canonical = utils.to_str(postvars[b'canonical'][0]) - if b'region_name' not in postvars: + assert len(postvars[b"access_id"]) == 1 and len(postvars[b"canonical"]) == 1 + access_id = utils.to_str(postvars[b"access_id"][0]) + canonical = utils.to_str(postvars[b"canonical"][0]) + if b"region_name" not in postvars: region_name = None else: - region_name = utils.to_str(postvars[b'region_name'][0]) + region_name = utils.to_str(postvars[b"region_name"][0]) secret_access_key = self.server._accounts[access_id] account = AliyunAccount(access_id, secret_access_key) @@ -226,10 +252,12 @@ def _sign(self, postvars): def log_message(self, *args): return - class SignServerCore(six.moves.socketserver.ThreadingMixIn, six.moves.BaseHTTPServer.HTTPServer): + class SignServerCore( + six.moves.socketserver.ThreadingMixIn, six.moves.BaseHTTPServer.HTTPServer + ): def __init__(self, *args, **kwargs): - self._accounts = kwargs.pop('accounts', {}) - 
self._token = kwargs.pop('token', None) + self._accounts = kwargs.pop("accounts", {}) + self._token = kwargs.pop("token", None) self._ready = False six.moves.BaseHTTPServer.HTTPServer.__init__(self, *args, **kwargs) self._ready = True @@ -257,8 +285,12 @@ def token(self): def start(self, endpoint): def starter(): - self._server = self.SignServerCore(endpoint, self.SignServerHandler, - accounts=self.accounts, token=self.token) + self._server = self.SignServerCore( + endpoint, + self.SignServerHandler, + accounts=self.accounts, + token=self.token, + ) self._server.serve_forever() thread = threading.Thread(target=starter) @@ -281,14 +313,16 @@ def __init__(self, msg, code, content): class SignServerAccount(BaseAccount): _session_local = threading.local() - def __init__(self, access_id, sign_endpoint=None, server=None, port=None, token=None): + def __init__( + self, access_id, sign_endpoint=None, server=None, port=None, token=None + ): self.access_id = access_id self.sign_endpoint = sign_endpoint or (server, port) self.token = token @property def session(self): - if not hasattr(type(self)._session_local, '_session'): + if not hasattr(type(self)._session_local, "_session"): adapter_options = dict( pool_connections=options.pool_connections, pool_maxsize=options.pool_maxsize, @@ -296,34 +330,35 @@ def session(self): ) session = requests.Session() # mount adapters with retry times - session.mount( - 'http://', requests.adapters.HTTPAdapter(**adapter_options)) - session.mount( - 'https://', requests.adapters.HTTPAdapter(**adapter_options)) + session.mount("http://", requests.adapters.HTTPAdapter(**adapter_options)) + session.mount("https://", requests.adapters.HTTPAdapter(**adapter_options)) self._session_local._session = session return self._session_local._session def sign_request(self, req, endpoint, region_name=None): - url = req.url[len(endpoint):] + url = req.url[len(endpoint) :] url_components = urlparse(unquote(url), allow_fragments=False) canonical_str = self._build_canonical_str(url_components, req) - logger.debug('canonical string: %s', canonical_str) + logger.debug("canonical string: %s", canonical_str) headers = dict() if self.token: - headers['Authorization'] = 'token ' + self.token + headers["Authorization"] = "token " + self.token sign_content = dict(access_id=self.access_id, canonical=canonical_str) if region_name is not None: sign_content["region_name"] = region_name resp = self.session.request( - 'post', 'http://%s:%s' % self.sign_endpoint, headers=headers, data=sign_content + "post", + "http://%s:%s" % self.sign_endpoint, + headers=headers, + data=sign_content, ) if resp.status_code < 400: - req.headers['Authorization'] = resp.text - logger.debug('headers after signing: %r', req.headers) + req.headers["Authorization"] = resp.text + logger.debug("headers after signing: %r", req.headers) else: try: err_msg = resp_err = resp.text @@ -332,7 +367,7 @@ def sign_request(self, req, endpoint, region_name=None): err_msg = repr(resp_err) raise SignServerError( - 'Sign server returned error code: %d\n%s' % (resp.status_code, err_msg), + "Sign server returned error code: %d\n%s" % (resp.status_code, err_msg), resp.status_code, resp_err, ) @@ -433,7 +468,10 @@ def _reload_account(self): class BearerTokenAccount(TempAccountMixin, BaseAccount): def __init__( - self, token=None, expired_hours=DEFAULT_TEMP_ACCOUNT_HOURS, get_bearer_token_fun=None + self, + token=None, + expired_hours=DEFAULT_TEMP_ACCOUNT_HOURS, + get_bearer_token_fun=None, ): self.token = token self._custom_bearer_token_func = 
get_bearer_token_fun @@ -441,7 +479,9 @@ def __init__( @classmethod def from_environments(cls): - expired_hours = int(os.getenv('ODPS_BEARER_TOKEN_HOURS', str(DEFAULT_TEMP_ACCOUNT_HOURS))) + expired_hours = int( + os.getenv("ODPS_BEARER_TOKEN_HOURS", str(DEFAULT_TEMP_ACCOUNT_HOURS)) + ) kwargs = {"expired_hours": expired_hours} if "ODPS_BEARER_TOKEN_FILE" in os.environ: return cls(**kwargs) @@ -459,7 +499,7 @@ def _get_bearer_token(self): with open(token_file_name, "r") as token_file: return token_file.read().strip() else: # pragma: no cover - from cupid.runtime import context, RuntimeContext + from cupid.runtime import RuntimeContext, context if not RuntimeContext.is_context_ready(): return @@ -480,11 +520,11 @@ def _reload_account(self): def sign_request(self, req, endpoint, region_name=None): self.reload() - url = req.url[len(endpoint):] + url = req.url[len(endpoint) :] url_components = urlparse(unquote(url), allow_fragments=False) self._build_canonical_str(url_components, req) - req.headers['x-odps-bearer-token'] = self.token - logger.debug('headers after signing: %r', req.headers) + req.headers["x-odps-bearer-token"] = self.token + logger.debug("headers after signing: %r", req.headers) class CredentialProviderAccount(StsAccount): @@ -493,10 +533,10 @@ def __init__(self, credential_provider): super(CredentialProviderAccount, self).__init__(None, None, None) def sign_request(self, req, endpoint, region_name=None): - get_cred_method = getattr(self.provider, "get_credential", None) or getattr( - self.provider, "get_credentials" - ) - credential = get_cred_method() + try: + credential = self.provider.get_credential() + except: + credential = self.provider.get_credentials() self.access_id = credential.get_access_key_id() self.secret_access_key = credential.get_access_key_secret() diff --git a/odps/apis/__init__.py b/odps/apis/__init__.py index a6088b67..7d6a6d7f 100644 --- a/odps/apis/__init__.py +++ b/odps/apis/__init__.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -10,4 +10,4 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. \ No newline at end of file +# limitations under the License. diff --git a/odps/apis/storage_api/__init__.py b/odps/apis/storage_api/__init__.py index 4b80c798..302ad951 100644 --- a/odps/apis/storage_api/__init__.py +++ b/odps/apis/storage_api/__init__.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,4 +15,4 @@ import sys if sys.version_info[0] == 3: - from .storage_api import * \ No newline at end of file + from .storage_api import * diff --git a/odps/apis/storage_api/conftest.py b/odps/apis/storage_api/conftest.py index e8191cc4..0cff74ee 100644 --- a/odps/apis/storage_api/conftest.py +++ b/odps/apis/storage_api/conftest.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -32,7 +32,7 @@ def storage_api_client(odps): global _test_table_id - options.always_enable_schema = True + options.enable_schema = True test_table_name = tn("test_halo_common_table_" + str(_test_table_id)) _test_table_id += 1 @@ -46,4 +46,4 @@ def storage_api_client(odps): yield StorageApiArrowClient(odps, table) finally: table.drop(async_=True) - options.always_enable_schema = False + options.enable_schema = False diff --git a/odps/apis/storage_api/storage_api.py b/odps/apis/storage_api/storage_api.py index a3f1bec5..cd438a05 100644 --- a/odps/apis/storage_api/storage_api.py +++ b/odps/apis/storage_api/storage_api.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,9 +17,9 @@ import collections import json import logging -from io import IOBase, BytesIO from enum import Enum from hashlib import md5 +from io import BytesIO, IOBase try: import pyarrow as pa @@ -62,7 +62,9 @@ class SplitMode(str, Enum): ROW_OFFSET = "RowOffset" BUCKET = "Bucket" - split_mode = serializers.JSONNodeField('SplitMode', parse_callback=lambda s: SplitOptions.SplitMode(s)) + split_mode = serializers.JSONNodeField( + "SplitMode", parse_callback=lambda s: SplitOptions.SplitMode(s) + ) split_number = serializers.JSONNodeField("SplitNumber") cross_partition = serializers.JSONNodeField("CrossPartition") @@ -70,7 +72,7 @@ def __init__(self, **kwargs): super(SplitOptions, self).__init__(**kwargs) self.split_mode = self.split_mode or SplitOptions.SplitMode.SIZE - self.split_number = self.split_number or 256*1024*1024 + self.split_number = self.split_number or 256 * 1024 * 1024 self.cross_partition = self.cross_partition or True @classmethod @@ -79,7 +81,7 @@ def get_default_options(self, mode): options.cross_partition = True if mode == SplitOptions.SplitMode.SIZE: options.split_mode = SplitOptions.SplitMode.SIZE - options.split_number = 256*1024*1024 + options.split_number = 256 * 1024 * 1024 elif mode == SplitOptions.SplitMode.PARALLELISM: options.split_mode = SplitOptions.SplitMode.PARALLELISM options.split_number = 32 @@ -99,8 +101,12 @@ class TimestampUnit(str, Enum): MICRO = "micro" NANO = "nano" - timestamp_unit = serializers.JSONNodeField('TimestampUnit', parse_callback=lambda s: ArrowOptions.TimestampUnit(s)) - date_time_unit = serializers.JSONNodeField('DatetimeUnit', parse_callback=lambda s: ArrowOptions.TimestampUnit(s)) + timestamp_unit = serializers.JSONNodeField( + "TimestampUnit", parse_callback=lambda s: ArrowOptions.TimestampUnit(s) + ) + date_time_unit = serializers.JSONNodeField( + "DatetimeUnit", parse_callback=lambda s: ArrowOptions.TimestampUnit(s) + ) def __init__(self, **kwargs): super(ArrowOptions, self).__init__(**kwargs) @@ -117,8 +123,8 @@ class Column(JSONRemoteModel): class DataSchema(JSONRemoteModel): - data_columns = serializers.JSONNodesReferencesField(Column, 'DataColumns') - partition_columns = serializers.JSONNodesReferencesField(Column, 'PartitionColumns') + data_columns = serializers.JSONNodesReferencesField(Column, "DataColumns") + partition_columns = serializers.JSONNodesReferencesField(Column, "PartitionColumns") class DataFormat(JSONRemoteModel): @@ -188,16 +194,20 @@ def __init__(self, **kwargs): class 
TableBatchScanResponse(serializers.JSONSerializableModel): - __slots__ = ['status', 'request_id'] + __slots__ = ["status", "request_id"] session_id = serializers.JSONNodeField("SessionId") session_type = serializers.JSONNodeField("SessionType") - session_status = serializers.JSONNodeField('SessionStatus', parse_callback=lambda s: SessionStatus(s.upper())) + session_status = serializers.JSONNodeField( + "SessionStatus", parse_callback=lambda s: SessionStatus(s.upper()) + ) expiration_time = serializers.JSONNodeField("ExpirationTime") split_count = serializers.JSONNodeField("SplitsCount") record_count = serializers.JSONNodeField("RecordCount") - data_schema = serializers.JSONNodeReferenceField(DataSchema, 'DataSchema') - supported_data_format = serializers.JSONNodesReferencesField(DataFormat, "SupportedDataFormat") + data_schema = serializers.JSONNodeReferenceField(DataSchema, "DataSchema") + supported_data_format = serializers.JSONNodesReferencesField( + DataFormat, "SupportedDataFormat" + ) def __init__(self): super(TableBatchScanResponse, self).__init__() @@ -212,7 +222,9 @@ def __init__(self, session_id): class TableBatchWriteRequest(serializers.JSONSerializableModel): - dynamic_partition_options = serializers.JSONNodeReferenceField(DynamicPartitionOptions, "DynamicPartitionOptions") + dynamic_partition_options = serializers.JSONNodeReferenceField( + DynamicPartitionOptions, "DynamicPartitionOptions" + ) arrow_options = serializers.JSONNodeReferenceField(ArrowOptions, "ArrowOptions") overwrite = serializers.JSONNodeField("Overwrite") partition_spec = serializers.JSONNodeField("PartitionSpec") @@ -223,22 +235,30 @@ def __init__(self, **kwargs): self.partition_spec = self.partition_spec or "" self.arrow_options = self.arrow_options or ArrowOptions() - self.dynamic_partition_options = self.dynamic_partition_options or DynamicPartitionOptions() + self.dynamic_partition_options = ( + self.dynamic_partition_options or DynamicPartitionOptions() + ) self.overwrite = self.overwrite or True self.support_write_cluster = self.support_write_cluster or False class TableBatchWriteResponse(serializers.JSONSerializableModel): - __slots__ = ['status', "request_id"] + __slots__ = ["status", "request_id"] - session_status = serializers.JSONNodeField('SessionStatus', parse_callback=lambda s: SessionStatus(s.upper())) + session_status = serializers.JSONNodeField( + "SessionStatus", parse_callback=lambda s: SessionStatus(s.upper()) + ) expiration_time = serializers.JSONNodeField("ExpirationTime") session_id = serializers.JSONNodeField("SessionId") data_schema = serializers.JSONNodeReferenceField(DataSchema, "DataSchema") - supported_data_format = serializers.JSONNodesReferencesField(DataFormat, "SupportedDataFormat") + supported_data_format = serializers.JSONNodesReferencesField( + DataFormat, "SupportedDataFormat" + ) max_block_num = serializers.JSONNodeField("MaxBlockNumber") required_ordering = serializers.JSONNodesReferencesField(Order, "RequiredOrdering") - required_distribution = serializers.JSONNodeReferenceField(RequiredDistribution, "RequiredDistribution") + required_distribution = serializers.JSONNodeReferenceField( + RequiredDistribution, "RequiredDistribution" + ) def __init__(self): super(TableBatchWriteResponse, self).__init__() @@ -248,10 +268,16 @@ def __init__(self): class ReadRowsRequest(object): - def __init__(self, session_id, split_index=0, - row_index=0, row_count=0, max_batch_rows=4096, - compression=Compression.LZ4_FRAME, - data_format=DataFormat()): + def __init__( + self, + 
session_id, + split_index=0, + row_index=0, + row_count=0, + max_batch_rows=4096, + compression=Compression.LZ4_FRAME, + data_format=DataFormat(), + ): self.session_id = session_id self.split_index = split_index self.row_index = row_index @@ -268,10 +294,15 @@ def __init__(self): class WriteRowsRequest(object): - def __init__(self, session_id, - block_number=0, attempt_number=0, bucket_id=0, - compression=Compression.LZ4_FRAME, - data_format=DataFormat()): + def __init__( + self, + session_id, + block_number=0, + attempt_number=0, + bucket_id=0, + compression=Compression.LZ4_FRAME, + data_format=DataFormat(), + ): self.session_id = session_id self.block_number = block_number self.attempt_number = attempt_number @@ -333,7 +364,7 @@ def read(self, nbytes=None): Stream data. None means all the data has been read or there is error occurred. """ if self._stopped: - return b'' + return b"" total_size = 0 bufs = [] @@ -352,7 +383,7 @@ def read(self, nbytes=None): bufs.append(buf) total_size += len(buf) - return b''.join(bufs) + return b"".join(bufs) def get_status(self): """Get the status of the stream reader. @@ -375,7 +406,10 @@ def get_request_id(self): logger.error("The reader is not closed yet, please wait") return None - if self._raw_reader is not None and "x-odps-request-id" in self._raw_reader.headers: + if ( + self._raw_reader is not None + and "x-odps-request-id" in self._raw_reader.headers + ): return self._raw_reader.headers["x-odps-request-id"] else: return None @@ -424,11 +458,11 @@ def finish(self): self._stopped = True self._res = self._req_io.finish() - if self._res is not None and self._res.status_code == codes['ok']: + if self._res is not None and self._res.status_code == codes["ok"]: resp_json = self._res.json() return resp_json["CommitMessage"], True else: - return None, False + return None, False def get_status(self): """Get the status of this stream writer. 
@@ -534,7 +568,9 @@ def write(self, record_batch): self._arrow_writer = pa.ipc.new_stream( self._sink, record_batch.schema, - options=pa.ipc.IpcWriteOptions(compression=self._compression.to_string()), + options=pa.ipc.IpcWriteOptions( + compression=self._compression.to_string() + ), ) self._arrow_writer.write_batch(record_batch) @@ -577,7 +613,13 @@ def get_request_id(self): class StorageApiClient(object): """Client to bundle configuration needed for API requests.""" - def __init__(self, odps: ODPS, table: Table, rest_endpoint: str = None, quota_name: str = None): + def __init__( + self, + odps: ODPS, + table: Table, + rest_endpoint: str = None, + quota_name: str = None, + ): if isinstance(odps, ODPS) and isinstance(table, Table): self._odps = odps self._table = table @@ -598,7 +640,9 @@ def tunnel_rest(self): from ...tunnel.tabletunnel import TableTunnel - tunnel = TableTunnel(self._odps, endpoint=self._rest_endpoint, quota_name=self._quota_name) + tunnel = TableTunnel( + self._odps, endpoint=self._rest_endpoint, quota_name=self._quota_name + ) self._tunnel_rest = tunnel.tunnel_rest return self._tunnel_rest @@ -607,7 +651,19 @@ def _get_resource(self, *args) -> str: url = self._table.table_resource(endpoint=endpoint, force_schema=True) return "/".join([url] + list(args)) - def create_read_session(self, request: TableBatchScanRequest) -> TableBatchScanResponse: + @staticmethod + def _fill_common_headers(raw_headers=None, tags=None): + headers = raw_headers or {} + tags = tags or options.tunnel.tags + if tags: + if isinstance(tags, str): + tags = tags.split(",") + headers["odps-tunnel-tags"] = ",".join(tags) + return headers + + def create_read_session( + self, request: TableBatchScanRequest + ) -> TableBatchScanResponse: """Create a read session. Args: @@ -617,12 +673,14 @@ def create_read_session(self, request: TableBatchScanRequest) -> TableBatchScanR Read session response returned from the server. """ if not isinstance(request, TableBatchScanRequest): - raise ValueError("Use TableBatchScanRequest class to build request for create read session interface") + raise ValueError( + "Use TableBatchScanRequest class to build request for create read session interface" + ) json_str = request.serialize() url = self._get_resource("sessions") - headers = {"Content-Type": "application/json"} + headers = self._fill_common_headers({"Content-Type": "application/json"}) if json_str != "": headers["Content-MD5"] = md5(to_binary(json_str)).hexdigest() params = {"session_type": "batch_read"} @@ -633,7 +691,9 @@ def create_read_session(self, request: TableBatchScanRequest) -> TableBatchScanR response = TableBatchScanResponse() response.parse(res, obj=response) - response.status = Status.OK if res.status_code == codes['created'] else Status.WAIT + response.status = ( + Status.OK if res.status_code == codes["created"] else Status.WAIT + ) update_request_id(response, res) return response @@ -648,10 +708,12 @@ def get_read_session(self, request: SessionRequest) -> TableBatchScanResponse: Read session response returned from the server. 
""" if not isinstance(request, SessionRequest): - raise ValueError("Use SessionRequest class to build request for get read session interface") + raise ValueError( + "Use SessionRequest class to build request for get read session interface" + ) url = self._get_resource("sessions", request.session_id) - headers = {} + headers = self._fill_common_headers() params = {"session_type": "batch_read"} if self._quota_name: params["quotaName"] = self._quota_name @@ -675,19 +737,25 @@ def read_rows_stream(self, request: ReadRowsRequest) -> StreamReader: Stream reader. """ if not isinstance(request, ReadRowsRequest): - raise ValueError("Use ReadRowsRequest class to build request for read rows interface") + raise ValueError( + "Use ReadRowsRequest class to build request for read rows interface" + ) url = self._get_resource("data") - headers = { - "Connection": "Keep-Alive", - "Accept-Encoding": request.compression.name if request.compression != Compression.UNCOMPRESSED else "" - } + headers = self._fill_common_headers( + { + "Connection": "Keep-Alive", + "Accept-Encoding": request.compression.name + if request.compression != Compression.UNCOMPRESSED + else "", + } + ) params = { "session_id": request.session_id, "max_batch_rows": str(request.max_batch_rows), "split_index": str(request.split_index), "row_count": str(request.row_count), - "row_index": str(request.row_index) + "row_index": str(request.row_index), } if self._quota_name: params["quotaName"] = self._quota_name @@ -697,11 +765,15 @@ def read_rows_stream(self, request: ReadRowsRequest) -> StreamReader: params["data_format_version"] = request.data_format.version def download(): - return self.tunnel_rest.get(url, stream=True, params=params, headers=headers) + return self.tunnel_rest.get( + url, stream=True, params=params, headers=headers + ) return StreamReader(download) - def create_write_session(self, request: TableBatchWriteRequest) -> TableBatchWriteResponse: + def create_write_session( + self, request: TableBatchWriteRequest + ) -> TableBatchWriteResponse: """Create a write session. Args: @@ -711,12 +783,14 @@ def create_write_session(self, request: TableBatchWriteRequest) -> TableBatchWri Write session response returned from the server. """ if not isinstance(request, TableBatchWriteRequest): - raise ValueError("Use TableBatchWriteRequest class to build request for create write session interface") + raise ValueError( + "Use TableBatchWriteRequest class to build request for create write session interface" + ) json_str = request.serialize() url = self._get_resource("sessions") - headers = {"Content-Type": "application/json"} + headers = self._fill_common_headers({"Content-Type": "application/json"}) if json_str != "": headers["Content-MD5"] = md5(to_binary(json_str)).hexdigest() params = {"session_type": "batch_write"} @@ -742,10 +816,12 @@ def get_write_session(self, request: SessionRequest) -> TableBatchWriteResponse: Write session response returned from the server. """ if not isinstance(request, SessionRequest): - raise ValueError("Use SessionRequest class to build request for get write session interface") + raise ValueError( + "Use SessionRequest class to build request for get write session interface" + ) url = self._get_resource("sessions", request.session_id) - headers = {} + headers = self._fill_common_headers() params = {"session_type": "batch_write"} if self._quota_name: params["quotaName"] = self._quota_name @@ -769,17 +845,18 @@ def write_rows_stream(self, request: WriteRowsRequest) -> StreamWriter: Stream writer. 
""" if not isinstance(request, WriteRowsRequest): - raise ValueError("Use WriteRowsRequest class to build request for write rows interface") + raise ValueError( + "Use WriteRowsRequest class to build request for write rows interface" + ) url = self._get_resource("sessions", request.session_id, "data") - headers = { - "Content-Type": "application/octet-stream", - "Transfer-Encoding": "chunked" - } + headers = self._fill_common_headers( + {"Content-Type": "application/octet-stream", "Transfer-Encoding": "chunked"} + ) params = { "attempt_number": str(request.attempt_number), - "block_number": str(request.block_number) + "block_number": str(request.block_number), } if self._quota_name: params["quotaName"] = self._quota_name @@ -793,7 +870,9 @@ def upload(data): return StreamWriter(upload) - def commit_write_session(self, request: SessionRequest, commit_msg: list) -> TableBatchWriteResponse: + def commit_write_session( + self, request: SessionRequest, commit_msg: list + ) -> TableBatchWriteResponse: """Commit the write session after write the last stream data. Args: @@ -804,7 +883,9 @@ def commit_write_session(self, request: SessionRequest, commit_msg: list) -> Tab Write session response returned from the server. """ if not isinstance(request, SessionRequest): - raise ValueError("Use SessionRequest class to build request for commit write session interface") + raise ValueError( + "Use SessionRequest class to build request for commit write session interface" + ) if not isinstance(commit_msg, list): raise ValueError("Use list for commit message") @@ -812,7 +893,7 @@ def commit_write_session(self, request: SessionRequest, commit_msg: list) -> Tab json_str = json.dumps(commit_message_dict) url = self._get_resource("commit") - headers = {"Content-Type": "application/json"} + headers = self._fill_common_headers({"Content-Type": "application/json"}) params = {"session_id": request.session_id} if self._quota_name: params["quotaName"] = self._quota_name @@ -821,7 +902,9 @@ def commit_write_session(self, request: SessionRequest, commit_msg: list) -> Tab response = TableBatchWriteResponse() response.parse(res, obj=response) - response.status = Status.OK if res.status_code == codes['created'] else Status.WAIT + response.status = ( + Status.OK if res.status_code == codes["created"] else Status.WAIT + ) update_request_id(response, res) return response @@ -829,6 +912,7 @@ def commit_write_session(self, request: SessionRequest, commit_msg: list) -> Tab class StorageApiArrowClient(StorageApiClient): """Arrow batch client to bundle configuration needed for API requests.""" + def read_rows_arrow(self, request: ReadRowsRequest) -> ArrowReader: """Read one split of the read session. @@ -839,7 +923,9 @@ def read_rows_arrow(self, request: ReadRowsRequest) -> ArrowReader: Arrow batch reader. """ if not isinstance(request, ReadRowsRequest): - raise ValueError("Use ReadRowsRequest class to build request for read rows interface") + raise ValueError( + "Use ReadRowsRequest class to build request for read rows interface" + ) return ArrowReader(self.read_rows_stream(request)) @@ -853,6 +939,8 @@ def write_rows_arrow(self, request: WriteRowsRequest) -> ArrowWriter: Arrow batch writer. 
""" if not isinstance(request, WriteRowsRequest): - raise ValueError("Use WriteRowsRequest class to build request for write rows interface") + raise ValueError( + "Use WriteRowsRequest class to build request for write rows interface" + ) return ArrowWriter(self.write_rows_stream(request), request.compression) diff --git a/odps/apis/storage_api/tests/__init__.py b/odps/apis/storage_api/tests/__init__.py index 6ee90d86..f7aef0af 100644 --- a/odps/apis/storage_api/tests/__init__.py +++ b/odps/apis/storage_api/tests/__init__.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytest import sys +import pytest + try: import pyarrow as pa except ImportError: @@ -22,4 +23,4 @@ pytestmark = pytest.mark.skip("Need pyarrow to run this test") if sys.version_info[0] == 2: - pytestmark = pytest.mark.skip("Need python3.5+ to run this test") \ No newline at end of file + pytestmark = pytest.mark.skip("Need python3.5+ to run this test") diff --git a/odps/apis/storage_api/tests/data_item.conf b/odps/apis/storage_api/tests/data_item.conf index e909eb49..a686d888 100644 --- a/odps/apis/storage_api/tests/data_item.conf +++ b/odps/apis/storage_api/tests/data_item.conf @@ -10,4 +10,4 @@ "batch_size": 4096, "batch_count": 300 } -} \ No newline at end of file +} diff --git a/odps/apis/storage_api/tests/record_batch_generate.py b/odps/apis/storage_api/tests/record_batch_generate.py index 1dc96b96..2388caa6 100644 --- a/odps/apis/storage_api/tests/record_batch_generate.py +++ b/odps/apis/storage_api/tests/record_batch_generate.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -21,9 +21,11 @@ RANDOM_STRING_LENGTH = 10 logger = logging.getLogger(__name__) + def generate_bigint_value(item, column_index, row_index): return (item.val + row_index) * (column_index + 1) + def generate_bigint_list(item, column_index): pylist = [] for i in range(0, item.batch_size): @@ -31,14 +33,16 @@ def generate_bigint_list(item, column_index): return pylist + def generate_string_value(item, column_index, row_index): str = "" for idx in range(0, RANDOM_STRING_LENGTH): - c = chr(ord('a') + (item.val + row_index) * (column_index + idx) % 26) + c = chr(ord("a") + (item.val + row_index) * (column_index + idx) % 26) str += c return str + def generate_string_list(item, column_index): pylist = [] for i in range(0, item.batch_size): @@ -46,6 +50,7 @@ def generate_string_list(item, column_index): return pylist + def build_list_from_schema(item, type, column_index): if type == "bigint": return generate_bigint_list(item, column_index) @@ -54,28 +59,40 @@ def build_list_from_schema(item, type, column_index): else: raise ValueError("Type " + type + " not supported yet") + def check_array_based_on_type_info(array, column_index, row_index, item, offset): if type(array) == pa.Int64Array: - if array[row_index - offset].as_py() != generate_bigint_value(item, column_index, row_index % item.batch_size): + if array[row_index - offset].as_py() != generate_bigint_value( + item, column_index, row_index % item.batch_size + ): return False elif type(array) == pa.StringArray: - if array[row_index - offset].as_py() != generate_string_value(item, column_index, row_index % item.batch_size): + if array[row_index - offset].as_py() != generate_string_value( + item, column_index, row_index % item.batch_size + ): return False else: raise ValueError("Type " + str(type(array)) + " not supported yet") return True + def verify_data(record_batch, item, total_line): offset = total_line % item.batch_size for i in range(0, len(item.data_columns)): - for j in range(total_line % item.batch_size, total_line % item.batch_size + record_batch.num_rows): - if not check_array_based_on_type_info(record_batch.column(i), i, j, item, offset): + for j in range( + total_line % item.batch_size, + total_line % item.batch_size + record_batch.num_rows, + ): + if not check_array_based_on_type_info( + record_batch.column(i), i, j, item, offset + ): logger.info("Row value is not correct") return False return True + def generate_data_based_on_odps_schema(item): build_arrays = [] generate_data_columns_count = len(item.data_columns) @@ -101,5 +118,6 @@ def generate_data_based_on_odps_schema(item): return record_batch + def generate_base_table(item): - return generate_data_based_on_odps_schema(item) \ No newline at end of file + return generate_data_based_on_odps_schema(item) diff --git a/odps/apis/storage_api/tests/test_storage_api.py b/odps/apis/storage_api/tests/test_storage_api.py index 1ca2167f..d8da8769 100644 --- a/odps/apis/storage_api/tests/test_storage_api.py +++ b/odps/apis/storage_api/tests/test_storage_api.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,11 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import logging import sys import time -import logging import pytest + try: import pyarrow as pa except ImportError: @@ -51,7 +52,10 @@ def test_storage_api(storage_api_client): raise IOError("Get write session failed") return - if resp.session_status != SessionStatus.NORMAL and resp.session_status != SessionStatus.COMMITTED: + if ( + resp.session_status != SessionStatus.NORMAL + and resp.session_status != SessionStatus.COMMITTED + ): logger.info("Wait...") time.sleep(1) continue @@ -63,8 +67,13 @@ def test_storage_api(storage_api_client): bigint_list = list(range(4096)) record_batch = pa.RecordBatch.from_arrays( - [pa.array(bigint_list), pa.array(bigint_list), pa.array(bigint_list), pa.array(bigint_list)], - names=["a", "b", "c", "d"] + [ + pa.array(bigint_list), + pa.array(bigint_list), + pa.array(bigint_list), + pa.array(bigint_list), + ], + names=["a", "b", "c", "d"], ) try: writer = storage_api_client.write_rows_stream(req) @@ -84,7 +93,7 @@ def test_storage_api(storage_api_client): raise IOError("write arrow record batch failed") # write EOS given https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format - suc = writer.write(b'\xff\xff\xff\xff\x00\x00\x00\x00') + suc = writer.write(b"\xff\xff\xff\xff\x00\x00\x00\x00") if not suc: raise IOError("write EOS failed") commit_message, suc = writer.finish() @@ -115,7 +124,10 @@ def test_storage_api(storage_api_client): if resp.status != Status.OK: raise IOError("Get write session failed") - if resp.session_status != SessionStatus.NORMAL and resp.session_status != SessionStatus.COMMITTED: + if ( + resp.session_status != SessionStatus.NORMAL + and resp.session_status != SessionStatus.COMMITTED + ): logger.info("Wait...") time.sleep(1) continue @@ -150,7 +162,7 @@ def test_storage_api(storage_api_client): req = ReadRowsRequest(session_id=resp.session_id, max_batch_rows=4096) read_size = 65536 - buf = b'' + buf = b"" for i in range(0, split_count): req.split_index = i start = time.time() diff --git a/odps/apis/storage_api/tests/test_storage_api_arrow.py b/odps/apis/storage_api/tests/test_storage_api_arrow.py index 47e8bc4c..f9b31f75 100644 --- a/odps/apis/storage_api/tests/test_storage_api_arrow.py +++ b/odps/apis/storage_api/tests/test_storage_api_arrow.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -17,6 +17,7 @@ import threading import pytest + try: import pyarrow as pa except ImportError: @@ -43,7 +44,9 @@ def test_split_limit(storage_api_client): assert resp is None - req.split_options = SplitOptions.get_default_options(SplitOptions.SplitMode.ROW_OFFSET) + req.split_options = SplitOptions.get_default_options( + SplitOptions.SplitMode.ROW_OFFSET + ) resp = storage_api_client.create_read_session(req) if resp.status != Status.OK and resp.status != Status.WAIT: logger.info("Create read session by row offset split option failed") @@ -77,7 +80,15 @@ def test_write_rows_with_partition_by_diff_schema_neg1(storage_api_client): bigint_list = list(range(item.batch_size)) string_list = ["test_write_1"] * item.batch_size - record_batch = pa.RecordBatch.from_arrays([pa.array(bigint_list), pa.array(bigint_list), pa.array(bigint_list), pa.array(string_list)], names=["a", "b", "c", "d"]) + record_batch = pa.RecordBatch.from_arrays( + [ + pa.array(bigint_list), + pa.array(bigint_list), + pa.array(bigint_list), + pa.array(string_list), + ], + names=["a", "b", "c", "d"], + ) writer = storage_api_client.write_rows_arrow(req) write_rows_exception = None @@ -97,7 +108,10 @@ def test_write_rows_with_partition_by_diff_schema_neg1(storage_api_client): logger.info("RecordBatch's schema is not right") logger.info(write_rows_exception) - record_batch = pa.RecordBatch.from_arrays([pa.array(bigint_list), pa.array(bigint_list), pa.array(bigint_list)], names=["a", "b", "c"]) + record_batch = pa.RecordBatch.from_arrays( + [pa.array(bigint_list), pa.array(bigint_list), pa.array(bigint_list)], + names=["a", "b", "c"], + ) writer = storage_api_client.write_rows_arrow(req) write_rows_exception = None @@ -118,8 +132,14 @@ def test_write_rows_with_partition_by_diff_schema_neg1(storage_api_client): logger.info(write_rows_exception) record_batch = pa.RecordBatch.from_arrays( - [pa.array(bigint_list), pa.array(bigint_list), pa.array(bigint_list), pa.array(bigint_list), pa.array(bigint_list)], - names=["a", "b", "c", "d", "e"] + [ + pa.array(bigint_list), + pa.array(bigint_list), + pa.array(bigint_list), + pa.array(bigint_list), + pa.array(bigint_list), + ], + names=["a", "b", "c", "d", "e"], ) writer = storage_api_client.write_rows_arrow(req) @@ -141,8 +161,14 @@ def test_write_rows_with_partition_by_diff_schema_neg1(storage_api_client): logger.info(write_rows_exception) record_batch = pa.RecordBatch.from_arrays( - [pa.array(bigint_list), pa.array(bigint_list), pa.array(bigint_list), pa.array(bigint_list), pa.array(string_list)], - names=["a", "b", "c", "d", "e"] + [ + pa.array(bigint_list), + pa.array(bigint_list), + pa.array(bigint_list), + pa.array(bigint_list), + pa.array(string_list), + ], + names=["a", "b", "c", "d", "e"], ) writer = storage_api_client.write_rows_arrow(req) @@ -182,7 +208,12 @@ def test_write_rows_without_partition_by_diff_schema_neg1(storage_api_client): string_list = ["test_write_1"] * item.batch_size record_batch = pa.RecordBatch.from_arrays( - [pa.array(bigint_list), pa.array(bigint_list), pa.array(bigint_list), pa.array(bigint_list)], + [ + pa.array(bigint_list), + pa.array(bigint_list), + pa.array(bigint_list), + pa.array(bigint_list), + ], names=["a", "b", "c", "d"], ) writer = storage_api_client.write_rows_arrow(req) @@ -223,8 +254,16 @@ def test_write_rows_without_partition_by_diff_schema_neg1(storage_api_client): end = time.time() logger.info("Write rows cost: " + str(end - start) + "s") - record_batch = pa.RecordBatch.from_arrays([pa.array(bigint_list), pa.array(bigint_list), 
pa.array(bigint_list), pa.array(bigint_list), pa.array(string_list)], - names=["a", "b", "c", "d", "e"]) + record_batch = pa.RecordBatch.from_arrays( + [ + pa.array(bigint_list), + pa.array(bigint_list), + pa.array(bigint_list), + pa.array(bigint_list), + pa.array(string_list), + ], + names=["a", "b", "c", "d", "e"], + ) start = time.time() writer = storage_api_client.write_rows_arrow(req) @@ -245,8 +284,16 @@ def test_write_rows_without_partition_by_diff_schema_neg1(storage_api_client): unique_bigint_list = [10] * item.batch_size - record_batch = pa.RecordBatch.from_arrays([pa.array(bigint_list), pa.array(bigint_list), pa.array(bigint_list), pa.array(bigint_list), pa.array(unique_bigint_list)], - names=["a", "b", "c", "d", "e"]) + record_batch = pa.RecordBatch.from_arrays( + [ + pa.array(bigint_list), + pa.array(bigint_list), + pa.array(bigint_list), + pa.array(bigint_list), + pa.array(unique_bigint_list), + ], + names=["a", "b", "c", "d", "e"], + ) start = time.time() writer = storage_api_client.write_rows_arrow(req) @@ -371,7 +418,9 @@ def test_split_row(storage_api_client): assert write_rows(item, storage_api_client) is True assert commit_write_session(item, storage_api_client) is True - item.split_options = SplitOptions.get_default_options(SplitOptions.SplitMode.ROW_OFFSET) + item.split_options = SplitOptions.get_default_options( + SplitOptions.SplitMode.ROW_OFFSET + ) assert create_read_session(item, storage_api_client) is True assert get_read_session(item, storage_api_client) is True diff --git a/odps/apis/storage_api/tests/util.py b/odps/apis/storage_api/tests/util.py index 33c3fb59..42381948 100644 --- a/odps/apis/storage_api/tests/util.py +++ b/odps/apis/storage_api/tests/util.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -37,7 +37,9 @@ def __init__(self): self.data_columns = None self.test_data_format = False self.data_format = DataFormat() - self.split_options = SplitOptions.get_default_options(SplitOptions.SplitMode.SIZE) + self.split_options = SplitOptions.get_default_options( + SplitOptions.SplitMode.SIZE + ) self.has_partition = False @@ -45,7 +47,9 @@ def push_commit_message(self, commit_message): self.commit_messages.append(commit_message) def load_conf(self, tag): - with open(os.path.dirname(os.path.abspath(__file__)) + '/data_item.conf', 'r') as conf_file: + with open( + os.path.dirname(os.path.abspath(__file__)) + "/data_item.conf", "r" + ) as conf_file: conf = json.load(conf_file) tag_conf = conf[tag] self.val = tag_conf["val"] @@ -97,7 +101,10 @@ def get_write_session(item, storage_api_client): logger.info("get write session failed") return False - if resp.session_status != SessionStatus.NORMAL and resp.session_status != SessionStatus.COMMITTED: + if ( + resp.session_status != SessionStatus.NORMAL + and resp.session_status != SessionStatus.COMMITTED + ): logger.info("Wait...") time.sleep(1) continue @@ -222,7 +229,9 @@ def get_read_session(item, storage_api_client): def read_rows(item, storage_api_client, compression=Compression.UNCOMPRESSED): - req = ReadRowsRequest(session_id=item.read_session_id, max_batch_rows=4096, compression=compression) + req = ReadRowsRequest( + session_id=item.read_session_id, max_batch_rows=4096, compression=compression + ) if item.test_data_format: req.data_format = item.data_format @@ -263,7 +272,12 @@ def read_rows(item, storage_api_client, compression=Compression.UNCOMPRESSED): logger.info("Read rows cost (index " + str(i) + "): " + str(end - start) + "s") if total_line != item.total_count: - logger.info("read rows number incorrect:" + str(total_line) + " != " + str(item.total_count)) + logger.info( + "read rows number incorrect:" + + str(total_line) + + " != " + + str(item.total_count) + ) return False return True diff --git a/odps/compat.py b/odps/compat.py index 4dbd1c20..c10c10ed 100644 --- a/odps/compat.py +++ b/odps/compat.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,11 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys -import logging.config import itertools +import logging.config +import os import platform +import sys import warnings + try: if sys.version_info[:2] < (3, 3): import xml.etree.cElementTree as ElementTree @@ -26,13 +28,14 @@ except ImportError: import xml.etree.ElementTree as ElementTree try: - ElementTreeParseError = getattr(ElementTree, 'ParseError') + ElementTreeParseError = getattr(ElementTree, "ParseError") except AttributeError: - ElementTreeParseError = getattr(ElementTree, 'XMLParserError') + ElementTreeParseError = getattr(ElementTree, "XMLParserError") try: from collections.abc import Iterable except ImportError: from collections import Iterable + from unicodedata import east_asian_width from .lib import six @@ -42,7 +45,7 @@ LESS_PY33 = sys.version_info[:2] < (3, 3) LESS_PY34 = sys.version_info[:2] < (3, 4) LESS_PY35 = sys.version_info[:2] < (3, 5) -PYPY = platform.python_implementation().lower() == 'pypy' +PYPY = platform.python_implementation().lower() == "pypy" SEEK_SET = 0 SEEK_CUR = 1 @@ -51,17 +54,20 @@ # Definition of East Asian Width # http://unicode.org/reports/tr11/ # Ambiguous width can be changed by option -_EAW_MAP = {'Na': 1, 'N': 1, 'W': 2, 'F': 2, 'H': 1} +_EAW_MAP = {"Na": 1, "N": 1, "W": 2, "F": 2, "H": 1} import decimal -DECIMAL_TYPES = [decimal.Decimal, ] + +DECIMAL_TYPES = [ + decimal.Decimal, +] import json # don't remove try: TimeoutError = TimeoutError except NameError: - TimeoutError = type('TimeoutError', (RuntimeError,), {}) + TimeoutError = type("TimeoutError", (RuntimeError,), {}) if six.PY3: @@ -77,6 +83,7 @@ long_type = int import io + StringIO = io.StringIO BytesIO = io.BytesIO @@ -107,7 +114,9 @@ def east_asian_len(data, encoding=None, ambiguous_width=1): Calculate display width considering unicode East Asian Width """ if isinstance(data, six.text_type): - return sum([_EAW_MAP.get(east_asian_width(c), ambiguous_width) for c in data]) + return sum( + [_EAW_MAP.get(east_asian_width(c), ambiguous_width) for c in data] + ) else: return len(data) @@ -142,6 +151,7 @@ def east_asian_len(data, encoding=None, ambiguous_width=1): try: import cdecimal as decimal + DECIMAL_TYPES.append(decimal.Decimal) except ImportError: import decimal @@ -171,23 +181,26 @@ def east_asian_len(data, encoding=None, ambiguous_width=1): data = data.decode(encoding) except UnicodeError: pass - return sum([_EAW_MAP.get(east_asian_width(c), ambiguous_width) for c in data]) + return sum( + [_EAW_MAP.get(east_asian_width(c), ambiguous_width) for c in data] + ) else: return len(data) dictconfig = lambda config: logging.config.dictConfig(config) + import cgi + import __builtin__ as builtins # don't remove + from .lib import futures # don't remove - import cgi UnsupportedOperation = type("UnsupportedOperation", (OSError, ValueError), {}) from distutils.version import LooseVersion as Version - from threading import _Semaphore as _PySemaphore - from .lib.monotonic import monotonic + from .lib.monotonic import monotonic class Semaphore(_PySemaphore): def acquire(self, blocking=True, timeout=None): @@ -212,9 +225,11 @@ def acquire(self, blocking=True, timeout=None): rc = True return rc + if LESS_PY32: try: from .tests.dictconfig import dictConfig + dictconfig = lambda config: dictConfig(config) except ImportError: pass @@ -231,20 +246,23 @@ def suppress(*exceptions): except exceptions: pass + Enum = enum.Enum DECIMAL_TYPES = tuple(DECIMAL_TYPES) Decimal = decimal.Decimal try: import pandas as pd - if not hasattr(pd.DataFrame, 'sort_values'): + + if not hasattr(pd.DataFrame, 
"sort_values"): pd.DataFrame.sort_values = pd.DataFrame.sort from pandas.core.internals import blocks as pd_blocks + if not hasattr(pd_blocks, "new_block"): pd_blocks.new_block = pd_blocks.make_block - if not hasattr(pd.RangeIndex, 'start'): + if not hasattr(pd.RangeIndex, "start"): pd.RangeIndex.start = property(fget=lambda x: x._start) pd.RangeIndex.stop = property(fget=lambda x: x._stop) pd.RangeIndex.step = property(fget=lambda x: x._step) @@ -262,21 +280,36 @@ def suppress(*exceptions): except ImportError: pass -from .lib.lib_utils import isvalidattr, dir2, getargspec, getfullargspec - -from .lib.six.moves import reduce -from .lib.six.moves import reload_module -from .lib.six.moves.queue import Queue, Empty -from .lib.six.moves.urllib.request import urlretrieve -from .lib.six.moves import cPickle as pickle -from .lib.six.moves.urllib.parse import urlencode, urlparse, unquote, quote, quote_plus, parse_qsl -from .lib.six.moves import configparser as ConfigParser - -from .lib.ext_types import Monthdelta +if sys.version_info[0] > 2: + # workaround for polluted sys.path due to some packages + try: + import http.client + except ImportError: + sys.modules.pop("http", None) + old_path = list(sys.path) + sys.path = [os.path.dirname(os.__file__)] + sys.path + import http.client + sys.path = old_path import datetime +from .lib.ext_types import Monthdelta +from .lib.lib_utils import dir2, getargspec, getfullargspec, isvalidattr +from .lib.six.moves import configparser as ConfigParser +from .lib.six.moves import cPickle as pickle +from .lib.six.moves import reduce, reload_module +from .lib.six.moves.queue import Empty, Queue +from .lib.six.moves.urllib.parse import ( + parse_qsl, + quote, + quote_plus, + unquote, + urlencode, + urlparse, +) +from .lib.six.moves.urllib.request import urlretrieve + class _FixedOffset(datetime.tzinfo): """ @@ -284,6 +317,7 @@ class _FixedOffset(datetime.tzinfo): Note that FixedOffset(0, "UTC") is a different way to build a UTC tzinfo object. 
""" + def __init__(self, offset, name=None): self.__offset = datetime.timedelta(minutes=offset) self.__name = name @@ -300,17 +334,19 @@ def dst(self, dt): try: import zoneinfo + utc = zoneinfo.ZoneInfo("UTC") FixedOffset = _FixedOffset except ImportError: try: import pytz + utc = pytz.utc FixedOffset = pytz._FixedOffset except ImportError: _ZERO_TIMEDELTA = datetime.timedelta(0) FixedOffset = _FixedOffset - utc = FixedOffset(0, 'UTC') + utc = FixedOffset(0, "UTC") try: @@ -328,8 +364,38 @@ def parsedate_to_datetime(data): return datetime.datetime(*dtuple[:6], tzinfo=FixedOffset(tz / 60.0)) -__all__ = ['sys', 'builtins', 'logging.config', 'dictconfig', 'suppress', - 'reduce', 'reload_module', 'Queue', 'Empty', 'ElementTree', 'ElementTreeParseError', - 'urlretrieve', 'pickle', 'urlencode', 'urlparse', 'unquote', 'quote', 'quote_plus', 'parse_qsl', - 'Enum', 'ConfigParser', 'decimal', 'Decimal', 'DECIMAL_TYPES', 'FixedOffset', 'utc', 'Monthdelta', - 'Iterable', 'TimeoutError', 'cgi', 'parsedate_to_datetime', 'Version', 'Semaphore'] +__all__ = [ + "sys", + "builtins", + "logging.config", + "dictconfig", + "suppress", + "reduce", + "reload_module", + "Queue", + "Empty", + "ElementTree", + "ElementTreeParseError", + "urlretrieve", + "pickle", + "urlencode", + "urlparse", + "unquote", + "quote", + "quote_plus", + "parse_qsl", + "Enum", + "ConfigParser", + "decimal", + "Decimal", + "DECIMAL_TYPES", + "FixedOffset", + "utc", + "Monthdelta", + "Iterable", + "TimeoutError", + "cgi", + "parsedate_to_datetime", + "Version", + "Semaphore", +] diff --git a/odps/config.py b/odps/config.py index 74bbab58..0c10aaea 100644 --- a/odps/config.py +++ b/odps/config.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -30,7 +30,7 @@ contextvars = None -DEFAULT_BLOCK_BUFFER_SIZE = 20 * 1024 ** 2 +DEFAULT_BLOCK_BUFFER_SIZE = 20 * 1024**2 DEFAULT_CHUNK_SIZE = 65536 DEFAULT_CONNECT_RETRY_TIMES = 4 DEFAULT_CONNECT_TIMEOUT = 120 @@ -38,16 +38,16 @@ DEFAULT_POOL_CONNECTIONS = 10 DEFAULT_POOL_MAXSIZE = 10 DEFAULT_RETRY_DELAY = 0.1 -_DEFAULT_REDIRECT_WARN = 'Option {source} has been replaced by {target} and might be removed in a future release.' +_DEFAULT_REDIRECT_WARN = "Option {source} has been replaced by {target} and might be removed in a future release." 
-class OptionError(Exception): +class OptionError(AttributeError): pass class Redirection(object): def __init__(self, item, warn=None): - self._items = item.split('.') + self._items = item.split(".") self._warn = warn self._warned = True self._parent = None @@ -59,8 +59,9 @@ def bind(self, attr_dict): def getvalue(self, silent=False): if not silent and self._warn and not self._warned: - in_completer = any(1 for st in traceback.extract_stack() - if 'completer' in st[0].lower()) + in_completer = any( + 1 for st in traceback.extract_stack() if "completer" in st[0].lower() + ) if not in_completer: self._warned = True warnings.warn(self._warn) @@ -93,8 +94,9 @@ def __init__(self, *args, **kw): def getvalue(self, silent=False): if self._use_pd: import pandas as pd + try: - return pd.get_option('.'.join(self._items)) + return pd.get_option(".".join(self._items)) except (KeyError, LookupError, AttributeError): self._use_pd = False else: @@ -103,7 +105,8 @@ def getvalue(self, silent=False): def setvalue(self, value, silent=False): if self._use_pd: import pandas as pd - key = '.'.join(self._items) + + key = ".".join(self._items) if value != pd.get_option(key): pd.set_option(key, value) else: @@ -113,7 +116,7 @@ def setvalue(self, value, silent=False): class AttributeDict(dict): def __init__(self, *args, **kwargs): self._inited = False - self._parent = kwargs.pop('_parent', None) + self._parent = kwargs.pop("_parent", None) self._root = None super(AttributeDict, self).__init__(*args, **kwargs) self._inited = True @@ -153,7 +156,9 @@ def unregister(self, key): def add_validator(self, key, validator): value, old_validator = self[key] validators = getattr( - old_validator, "validators", [old_validator] if callable(old_validator) else [] + old_validator, + "validators", + [old_validator] if callable(old_validator) else [], ) validators.append(validator) self[key] = (value, all_validator(*validators)) @@ -169,7 +174,7 @@ def _setattr(self, key, value, silent=False): validate = self[key][1] if validate is not None: if not validate(value): - raise ValueError('Cannot set value %s' % value) + raise ValueError("Cannot set value %s" % value) if isinstance(val[0], Redirection): val[0].setvalue(value) else: @@ -180,7 +185,7 @@ def _setattr(self, key, value, silent=False): self[key] = value def __setattr__(self, key, value): - if key == '_inited': + if key == "_inited": super(AttributeDict, self).__setattr__(key, value) return try: @@ -198,8 +203,8 @@ def __setattr__(self, key, value): def loads(self, d): dispatches = collections.defaultdict(dict) for k, v in six.iteritems(d): - if '.' in k: - sk, rk = k.split('.', 1) + if "." in k: + sk, rk = k.split(".", 1) dispatches[sk][rk] = v elif isinstance(self[k][0], Redirection): self[k][0].setvalue(v, silent=True) @@ -214,7 +219,9 @@ def dumps(self): result_dict = dict() for k, v in six.iteritems(self): if isinstance(v, AttributeDict): - result_dict.update((k + '.' + sk, sv) for sk, sv in six.iteritems(v.dumps())) + result_dict.update( + (k + "." 
+ sk, sv) for sk, sv in six.iteritems(v.dumps()) + ) elif isinstance(v[0], BaseAccount) or callable(v[0]): # ignore accounts in config dumps result_dict[k] = None @@ -236,14 +243,14 @@ def __getattr__(self, item): return getattr(self._config, item) def __setattr__(self, key, value): - if key == '_config': + if key == "_config": object.__setattr__(self, key, value) return setattr(self._config, key, value) def register_option(self, option, value, validator=None): assert validator is None or callable(validator) - splits = option.split('.') + splits = option.split(".") conf = self._config for name in splits[:-1]: @@ -254,14 +261,14 @@ def register_option(self, option, value, validator=None): conf = val elif not isinstance(config, dict): raise AttributeError( - 'Fail to set option: %s, conflict has encountered' % option) + "Fail to set option: %s, conflict has encountered" % option + ) else: conf = config key = splits[-1] if conf.get(key) is not None: - raise AttributeError( - 'Fail to set option: %s, option has been set' % option) + raise AttributeError("Fail to set option: %s, option has been set" % option) conf.register(key, value, validator) @@ -276,19 +283,22 @@ def redirect_option(self, option, target, warn=_DEFAULT_REDIRECT_WARN): self.register_option(option, redir) def unregister_option(self, option): - splits = option.split('.') + splits = option.split(".") conf = self._config for name in splits[:-1]: config = conf.get(name) if not isinstance(config, dict): raise AttributeError( - 'Fail to unregister option: %s, conflict has encountered' % option) + "Fail to unregister option: %s, conflict has encountered" % option + ) else: conf = config key = splits[-1] if key not in conf: - raise AttributeError('Option %s not configured, thus failed to unregister.' % option) + raise AttributeError( + "Option %s not configured, thus failed to unregister." % option + ) conf.unregister(key) def update(self, new_config): @@ -301,19 +311,22 @@ def update(self, new_config): setattr(self, option, value) def add_validator(self, option, validator): - splits = option.split('.') + splits = option.split(".") conf = self._config for name in splits[:-1]: config = conf.get(name) if not isinstance(config, dict): raise AttributeError( - 'Fail to add validator: %s, conflict has encountered' % option) + "Fail to add validator: %s, conflict has encountered" % option + ) else: conf = config key = splits[-1] if key not in conf: - raise AttributeError('Option %s not configured, thus failed to set validator.' % option) + raise AttributeError( + "Option %s not configured, thus failed to set validator." 
% option + ) conf.add_validator(key, validator) def loads(self, d): @@ -325,13 +338,15 @@ def dumps(self): def is_interactive(): import __main__ as main - return not hasattr(main, '__file__') + + return not hasattr(main, "__file__") # validators def any_validator(*validators): def validate(x): return any(validator(x) for validator in validators) + return validate @@ -349,11 +364,13 @@ def validate(x): is_integer = lambda x: isinstance(x, six.integer_types) is_string = lambda x: isinstance(x, six.string_types) is_dict = lambda x: isinstance(x, dict) +is_list = lambda x: isinstance(x, list) def is_in(vals): def validate(x): return x in vals + return validate @@ -414,173 +431,262 @@ def emit(self, record): default_options = Config() -default_options.register_option('is_global_account_overwritable', True, validator=is_bool) -default_options.register_option('account', None) -default_options.register_option('endpoint', None) -default_options.redirect_option('end_point', 'endpoint') -default_options.register_option('default_project', None) -default_options.register_option('default_schema', None) -default_options.register_option('app_account', None) -default_options.register_option('region_name', None) -default_options.register_option('local_timezone', None) -default_options.register_option('use_legacy_parsedate', False) -default_options.register_option('allow_antique_date', False) -default_options.register_option( - 'user_agent_pattern', '$pyodps_version $python_version $os_version $maxframe_version' -) -default_options.register_option('logview_host', None) -default_options.register_option('logview_hours', 24 * 30, validator=is_integer) -default_options.redirect_option('log_view_host', 'logview_host') -default_options.redirect_option('log_view_hours', 'logview_hours') -default_options.register_option('api_proxy', None) -default_options.register_option('data_proxy', None) -default_options.redirect_option('tunnel_proxy', 'data_proxy') -default_options.register_option('seahawks_url', None) -default_options.register_option('biz_id', None) -default_options.register_option('priority', None, validator=any_validator(is_null, is_integer)) -default_options.register_option('get_priority', None) -default_options.register_option('temp_lifecycle', 1, validator=is_integer) -default_options.register_option('lifecycle', None, validator=any_validator(is_null, is_integer)) -default_options.register_option('table_read_limit', None, validator=any_validator(is_null, is_integer)) -default_options.register_option('completion_size', 10, validator=is_integer) -default_options.register_option('default_task_settings', None, validator=any_validator(is_null, is_dict)) -default_options.register_option('resource_chunk_size', 64 << 20, validator=is_integer) -default_options.register_option('upload_resource_in_chunks', True, validator=is_bool) -default_options.register_option('verify_ssl', True) -default_options.register_option('always_enable_schema', False, validator=is_bool) -default_options.register_option('table_auto_flush_time', 150, validator=is_integer) -default_options.register_option('struct_as_dict', False, validator=is_bool) -default_options.register_option('progress_time_interval', 5 * 60, validator=any_validator(is_float, is_integer)) -default_options.register_option('progress_percentage_gap', 5, validator=is_integer) -default_options.register_option('enable_v4_sign', False, validator=is_bool) +default_options.register_option( + "is_global_account_overwritable", True, validator=is_bool +) 
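The long run of register_option calls that starts here is the whole configuration surface, so it is worth recalling how the registry above behaves. A rough sketch, using an invented option name rather than anything PyODPS actually ships; the ValueError text comes from AttributeDict._setattr and the replacement warning from _DEFAULT_REDIRECT_WARN:

from odps.config import default_options, is_bool

# "my_feature.enabled" and its legacy alias are hypothetical, for illustration only.
default_options.register_option("my_feature.enabled", False, validator=is_bool)
default_options.redirect_option("my_feature_enabled", "my_feature.enabled")

default_options.my_feature.enabled = True       # accepted: passes the is_bool validator
print(default_options.my_feature_enabled)       # True, resolved through the redirection
                                                # (reading it may emit the replacement warning)
try:
    default_options.my_feature.enabled = "yes"  # rejected by the validator
except ValueError as exc:
    print(exc)                                  # "Cannot set value yes"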
+default_options.register_option("account", None) +default_options.register_option("endpoint", None) +default_options.redirect_option("end_point", "endpoint") +default_options.register_option("default_project", None) +default_options.register_option("default_schema", None) +default_options.register_option("app_account", None) +default_options.register_option("region_name", None) +default_options.register_option("quota_name", None) +default_options.register_option("local_timezone", None) +default_options.register_option("use_legacy_parsedate", False) +default_options.register_option("allow_antique_date", False) +default_options.register_option( + "user_agent_pattern", + "$pyodps_version $python_version $os_version $maxframe_version", +) +default_options.register_option("logview_host", None) +default_options.register_option("logview_hours", 24 * 30, validator=is_integer) +default_options.redirect_option("log_view_host", "logview_host") +default_options.redirect_option("log_view_hours", "logview_hours") +default_options.register_option("api_proxy", None) +default_options.register_option("data_proxy", None) +default_options.redirect_option("tunnel_proxy", "data_proxy") +default_options.register_option("seahawks_url", None) +default_options.register_option("biz_id", None) +default_options.register_option( + "priority", None, validator=any_validator(is_null, is_integer) +) +default_options.register_option("get_priority", None) +default_options.register_option("temp_lifecycle", 1, validator=is_integer) +default_options.register_option( + "lifecycle", None, validator=any_validator(is_null, is_integer) +) +default_options.register_option( + "table_read_limit", None, validator=any_validator(is_null, is_integer) +) +default_options.register_option("completion_size", 10, validator=is_integer) +default_options.register_option( + "default_task_settings", None, validator=any_validator(is_null, is_dict) +) +default_options.register_option("resource_chunk_size", 64 << 20, validator=is_integer) +default_options.register_option("upload_resource_in_chunks", True, validator=is_bool) +default_options.register_option("verify_ssl", True) +default_options.register_option("enable_schema", False, validator=is_bool) +default_options.redirect_option("always_enable_schema", "enable_schema") +default_options.register_option("table_auto_flush_time", 150, validator=is_integer) +default_options.register_option("struct_as_dict", False, validator=is_bool) +default_options.register_option( + "progress_time_interval", 5 * 60, validator=any_validator(is_float, is_integer) +) +default_options.register_option("progress_percentage_gap", 5, validator=is_integer) +default_options.register_option("enable_v4_sign", False, validator=is_bool) +default_options.register_option("align_supported_python_tag", True, validator=is_bool) # c or python mode, use for UT, in other cases, please do not modify the value -default_options.register_option('force_c', False, validator=is_integer) -default_options.register_option('force_py', False, validator=is_integer) +default_options.register_option("force_c", False, validator=is_integer) +default_options.register_option("force_py", False, validator=is_integer) # callbacks for wrappers -default_options.register_option('instance_create_callback', None) +default_options.register_option("instance_create_callback", None) default_options.register_option("tunnel_session_create_callback", None) default_options.register_option("tunnel_session_create_timeout_callback", None) 
default_options.register_option("result_reader_create_callback", None) -default_options.register_option('tunnel_read_timeout_callback', None) +default_options.register_option("tunnel_read_timeout_callback", None) default_options.register_option("skipped_survey_regexes", []) # network connections -default_options.register_option('chunk_size', DEFAULT_CHUNK_SIZE, validator=is_integer) -default_options.register_option('retry_times', DEFAULT_CONNECT_RETRY_TIMES, validator=is_integer) -default_options.register_option('retry_delay', DEFAULT_RETRY_DELAY, validator=any_validator(is_integer, is_float)) -default_options.register_option('connect_timeout', DEFAULT_CONNECT_TIMEOUT, validator=is_integer) -default_options.register_option('read_timeout', DEFAULT_READ_TIMEOUT, validator=is_integer) -default_options.register_option('pool_connections', DEFAULT_POOL_CONNECTIONS, validator=is_integer) -default_options.register_option('pool_maxsize', DEFAULT_POOL_MAXSIZE, validator=is_integer) +default_options.register_option("chunk_size", DEFAULT_CHUNK_SIZE, validator=is_integer) +default_options.register_option( + "retry_times", DEFAULT_CONNECT_RETRY_TIMES, validator=is_integer +) +default_options.register_option( + "retry_delay", DEFAULT_RETRY_DELAY, validator=any_validator(is_integer, is_float) +) +default_options.register_option( + "connect_timeout", DEFAULT_CONNECT_TIMEOUT, validator=is_integer +) +default_options.register_option( + "read_timeout", DEFAULT_READ_TIMEOUT, validator=is_integer +) +default_options.register_option( + "pool_connections", DEFAULT_POOL_CONNECTIONS, validator=is_integer +) +default_options.register_option( + "pool_maxsize", DEFAULT_POOL_MAXSIZE, validator=is_integer +) # Tunnel -default_options.register_option('tunnel.endpoint', None) -default_options.register_option('tunnel.string_as_binary', False, validator=is_bool) -default_options.register_option('tunnel.use_instance_tunnel', True, validator=is_bool) +default_options.register_option("tunnel.endpoint", None) +default_options.register_option("tunnel.string_as_binary", False, validator=is_bool) +default_options.register_option("tunnel.use_instance_tunnel", True, validator=is_bool) +default_options.register_option( + "tunnel.limit_instance_tunnel", None, validator=any_validator(is_null, is_bool) +) +default_options.register_option( + "tunnel.legacy_fallback_timeout", None, validator=any_validator(is_null, is_integer) +) +default_options.register_option( + "tunnel.pd_mem_cache_size", 1024 * 4, validator=is_integer +) +default_options.register_option( + "tunnel.pd_row_cache_size", 1024 * 16, validator=is_integer +) +default_options.register_option( + "tunnel.read_row_batch_size", 1024, validator=is_integer +) +default_options.register_option( + "tunnel.write_row_batch_size", 1024, validator=is_integer +) +default_options.register_option( + "tunnel.batch_merge_threshold", 128, validator=is_integer +) +default_options.register_option( + "tunnel.overflow_date_as_none", False, validator=is_bool +) +default_options.register_option( + "tunnel.quota_name", None, validator=any_validator(is_null, is_string) +) default_options.register_option( - 'tunnel.limit_instance_tunnel', None, validator=any_validator(is_null, is_bool) + "tunnel.block_buffer_size", DEFAULT_BLOCK_BUFFER_SIZE, validator=is_integer ) default_options.register_option( - 'tunnel.legacy_fallback_timeout', None, validator=any_validator(is_null, is_integer) + "tunnel.use_block_writer_by_default", False, validator=is_bool ) 
-default_options.register_option('tunnel.pd_mem_cache_size', 1024 * 4, validator=is_integer) -default_options.register_option('tunnel.pd_row_cache_size', 1024 * 16, validator=is_integer) -default_options.register_option('tunnel.read_row_batch_size', 1024, validator=is_integer) -default_options.register_option('tunnel.write_row_batch_size', 1024, validator=is_integer) -default_options.register_option('tunnel.batch_merge_threshold', 128, validator=is_integer) -default_options.register_option('tunnel.overflow_date_as_none', False, validator=is_bool) default_options.register_option( - 'tunnel.quota_name', None, validator=any_validator(is_null, is_string) + "tunnel.tags", None, validator=any_validator(is_null, is_string, is_list) ) -default_options.register_option('tunnel.block_buffer_size', DEFAULT_BLOCK_BUFFER_SIZE, validator=is_integer) -default_options.register_option('tunnel.use_block_writer_by_default', False, validator=is_bool) -default_options.redirect_option('tunnel_endpoint', 'tunnel.endpoint') -default_options.redirect_option('use_instance_tunnel', 'tunnel.use_instance_tunnel') -default_options.redirect_option('limited_instance_tunnel', 'tunnel.limit_instance_tunnel') -default_options.redirect_option('tunnel.limited_instance_tunnel', 'tunnel.limit_instance_tunnel') +default_options.redirect_option("tunnel_endpoint", "tunnel.endpoint") +default_options.redirect_option("use_instance_tunnel", "tunnel.use_instance_tunnel") +default_options.redirect_option( + "limited_instance_tunnel", "tunnel.limit_instance_tunnel" +) +default_options.redirect_option( + "tunnel.limited_instance_tunnel", "tunnel.limit_instance_tunnel" +) # terminal -default_options.register_option('console.max_lines', None) -default_options.register_option('console.max_width', None) -default_options.register_option('console.use_color', False, validator=is_bool) +default_options.register_option("console.max_lines", None) +default_options.register_option("console.max_width", None) +default_options.register_option("console.use_color", False, validator=is_bool) # SQL -default_options.register_option('sql.settings', None, validator=any_validator(is_null, is_dict)) -default_options.register_option('sql.ignore_fields_not_null', False, validator=is_bool) -default_options.register_option('sql.use_odps2_extension', None, validator=any_validator(is_null, is_bool)) +default_options.register_option( + "sql.settings", None, validator=any_validator(is_null, is_dict) +) +default_options.register_option("sql.ignore_fields_not_null", False, validator=is_bool) +default_options.register_option( + "sql.use_odps2_extension", None, validator=any_validator(is_null, is_bool) +) # sqlalchemy -default_options.register_option('sqlalchemy.project_as_schema', False, validator=is_bool) +default_options.register_option( + "sqlalchemy.project_as_schema", None, validator=any_validator(is_null, is_bool) +) # DataFrame -default_options.register_option('interactive', is_interactive(), validator=is_bool) -default_options.register_option('verbose', False, validator=all_validator(is_bool, verbose_log_validator)) -default_options.register_option('verbose_log', None) -default_options.register_option('df.optimize', True, validator=is_bool) -default_options.register_option('df.optimizes.cp', True, validator=is_bool) -default_options.register_option('df.optimizes.pp', True, validator=is_bool) -default_options.register_option('df.optimizes.tunnel', True, validator=is_bool) -default_options.register_option('df.analyze', True, validator=is_bool) 
-default_options.register_option('df.use_cache', True, validator=is_bool) -default_options.register_option('df.quote', True, validator=is_bool) -default_options.register_option('df.dump_udf', False, validator=is_bool) -default_options.register_option('df.supersede_libraries', True, validator=is_bool) -default_options.register_option('df.libraries', None) -default_options.register_option('df.image', None) -default_options.register_option('df.odps.sort.limit', 10000) -default_options.register_option('df.odps.nan_handler', 'py') # None for not handled, builtin for built-in ISNAN function -default_options.register_option('df.sqlalchemy.execution_options', None, validator=any_validator(is_null, is_dict)) -default_options.register_option('df.seahawks.max_size', 10 * 1024 * 1024 * 1024) # 10G -default_options.register_option('df.delete_udfs', True, validator=is_bool) -default_options.register_option('df.use_xflow_sample', False, validator=is_bool) -default_options.register_option('df.writer_count_limit', 50, validator=is_integer) +default_options.register_option("interactive", is_interactive(), validator=is_bool) +default_options.register_option( + "verbose", False, validator=all_validator(is_bool, verbose_log_validator) +) +default_options.register_option("verbose_log", None) +default_options.register_option("df.optimize", True, validator=is_bool) +default_options.register_option("df.optimizes.cp", True, validator=is_bool) +default_options.register_option("df.optimizes.pp", True, validator=is_bool) +default_options.register_option("df.optimizes.tunnel", True, validator=is_bool) +default_options.register_option("df.analyze", True, validator=is_bool) +default_options.register_option("df.use_cache", True, validator=is_bool) +default_options.register_option("df.quote", True, validator=is_bool) +default_options.register_option("df.dump_udf", False, validator=is_bool) +default_options.register_option("df.supersede_libraries", True, validator=is_bool) +default_options.register_option("df.libraries", None) +default_options.register_option("df.image", None) +default_options.register_option("df.odps.sort.limit", 10000) +default_options.register_option( + "df.odps.nan_handler", "py" +) # None for not handled, builtin for built-in ISNAN function +default_options.register_option( + "df.sqlalchemy.execution_options", None, validator=any_validator(is_null, is_dict) +) +default_options.register_option("df.seahawks.max_size", 10 * 1024 * 1024 * 1024) # 10G +default_options.register_option("df.delete_udfs", True, validator=is_bool) +default_options.register_option("df.use_xflow_sample", False, validator=is_bool) +default_options.register_option("df.writer_count_limit", 50, validator=is_integer) # PyODPS ML -default_options.register_option('ml.xflow_project', 'algo_public', validator=is_string) -default_options.register_option('ml.xflow_settings', None, validator=any_validator(is_null, is_dict)) -default_options.register_option('ml.dry_run', False, validator=is_bool) -default_options.register_option('ml.use_model_transfer', False, validator=is_bool) -default_options.register_option('ml.use_old_metrics', True, validator=is_bool) -default_options.register_option('ml.model_volume', 'pyodps_volume', validator=is_string) +default_options.register_option("ml.xflow_project", "algo_public", validator=is_string) +default_options.register_option( + "ml.xflow_settings", None, validator=any_validator(is_null, is_dict) +) +default_options.register_option("ml.dry_run", False, validator=is_bool) 
+default_options.register_option("ml.use_model_transfer", False, validator=is_bool) +default_options.register_option("ml.use_old_metrics", True, validator=is_bool) +default_options.register_option("ml.model_volume", "pyodps_volume", validator=is_string) # Runner -default_options.redirect_option('runner.dry_run', 'ml.dry_run') +default_options.redirect_option("runner.dry_run", "ml.dry_run") # display from .console import detect_console_encoding -default_options.register_pandas('display.encoding', detect_console_encoding(), validator=is_string) -default_options.register_pandas('display.max_rows', 60, validator=any_validator(is_null, is_integer)) -default_options.register_pandas('display.max_columns', 20, validator=any_validator(is_null, is_integer)) -default_options.register_pandas('display.large_repr', 'truncate', validator=is_in(['truncate', 'info'])) -default_options.register_pandas('display.notebook_repr_html', True, validator=is_bool) -default_options.register_pandas('display.precision', 6, validator=is_integer) -default_options.register_pandas('display.float_format', None) -default_options.register_pandas('display.chop_threshold', None) -default_options.register_pandas('display.column_space', 12, validator=is_integer) -default_options.register_pandas('display.pprint_nest_depth', 3, validator=is_integer) -default_options.register_pandas('display.max_seq_items', 100, validator=is_integer) -default_options.register_pandas('display.max_colwidth', 50, validator=is_integer) -default_options.register_pandas('display.multi_sparse', True, validator=is_bool) -default_options.register_pandas('display.colheader_justify', 'right', validator=is_string) -default_options.register_pandas('display.unicode.ambiguous_as_wide', False, validator=is_bool) -default_options.register_pandas('display.unicode.east_asian_width', False, validator=is_bool) -default_options.redirect_option('display.height', 'display.max_rows') -default_options.register_pandas('display.width', 80, validator=any_validator(is_null, is_integer)) -default_options.register_pandas('display.expand_frame_repr', True) -default_options.register_pandas('display.show_dimensions', 'truncate', validator=is_in([True, False, 'truncate'])) - -default_options.register_option('display.notebook_widget', True, validator=is_bool) -default_options.redirect_option('display.notebook_repr_widget', 'display.notebook_widget') +default_options.register_pandas( + "display.encoding", detect_console_encoding(), validator=is_string +) +default_options.register_pandas( + "display.max_rows", 60, validator=any_validator(is_null, is_integer) +) +default_options.register_pandas( + "display.max_columns", 20, validator=any_validator(is_null, is_integer) +) +default_options.register_pandas( + "display.large_repr", "truncate", validator=is_in(["truncate", "info"]) +) +default_options.register_pandas("display.notebook_repr_html", True, validator=is_bool) +default_options.register_pandas("display.precision", 6, validator=is_integer) +default_options.register_pandas("display.float_format", None) +default_options.register_pandas("display.chop_threshold", None) +default_options.register_pandas("display.column_space", 12, validator=is_integer) +default_options.register_pandas("display.pprint_nest_depth", 3, validator=is_integer) +default_options.register_pandas("display.max_seq_items", 100, validator=is_integer) +default_options.register_pandas("display.max_colwidth", 50, validator=is_integer) +default_options.register_pandas("display.multi_sparse", True, validator=is_bool) 
+default_options.register_pandas( + "display.colheader_justify", "right", validator=is_string +) +default_options.register_pandas( + "display.unicode.ambiguous_as_wide", False, validator=is_bool +) +default_options.register_pandas( + "display.unicode.east_asian_width", False, validator=is_bool +) +default_options.redirect_option("display.height", "display.max_rows") +default_options.register_pandas( + "display.width", 80, validator=any_validator(is_null, is_integer) +) +default_options.register_pandas("display.expand_frame_repr", True) +default_options.register_pandas( + "display.show_dimensions", "truncate", validator=is_in([True, False, "truncate"]) +) + +default_options.register_option("display.notebook_widget", True, validator=is_bool) +default_options.redirect_option( + "display.notebook_repr_widget", "display.notebook_widget" +) # Mars -default_options.register_option('mars.use_common_proxy', True, validator=is_bool) -default_options.register_option('mars.launch_notebook', False, validator=is_bool) -default_options.register_option('mars.to_dataframe_memory_scale', None, validator=any_validator(is_null, is_integer)) -default_options.register_option('mars.container_status_timeout', 120, validator=is_integer) +default_options.register_option("mars.use_common_proxy", True, validator=is_bool) +default_options.register_option("mars.launch_notebook", False, validator=is_bool) +default_options.register_option( + "mars.to_dataframe_memory_scale", None, validator=any_validator(is_null, is_integer) +) +default_options.register_option( + "mars.container_status_timeout", 120, validator=is_integer +) _options_local = threading.local() diff --git a/odps/conftest.py b/odps/conftest.py index c10962a5..b8aecbbe 100644 --- a/odps/conftest.py +++ b/odps/conftest.py @@ -1,6 +1,6 @@ import pytest -from odps.tests.core import get_config, drop_test_tables +from odps.tests.core import drop_test_tables, get_config @pytest.fixture(scope="session") @@ -41,6 +41,14 @@ def odps_with_tunnel_quota(): pytest.skip("ODPS project with quota not defined") +@pytest.fixture(scope="session") +def odps_with_long_string(): + try: + return get_config().odps_with_long_string + except AttributeError: + pytest.skip("ODPS project with quota not defined") + + @pytest.fixture(scope="session") def config(): return get_config() @@ -71,6 +79,4 @@ def pytest_html_results_table_html(report, data): if report.passed: del data[:] - data.append( - html.div("Logs disabled for passed tests.", **{"class": "log"}) - ) + data.append(html.div("Logs disabled for passed tests.", **{"class": "log"})) diff --git a/odps/console.py b/odps/console.py index d3f2914f..cd666b1f 100644 --- a/odps/console.py +++ b/odps/console.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,15 +14,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys +import codecs +import locale +import math import multiprocessing +import os +import struct +import sys import threading import time -import struct -import os -import math -import locale -import codecs import warnings try: @@ -32,15 +32,16 @@ try: import fcntl - import termios import signal + import termios + _CAN_RESIZE_TERMINAL = True except ImportError: _CAN_RESIZE_TERMINAL = False try: from IPython import get_ipython -except ImportError: +except (ImportError, AttributeError): pass try: get_ipython() @@ -50,6 +51,7 @@ widgets = None else: from IPython import version_info + ipython_major_version = version_info[0] try: @@ -68,6 +70,7 @@ if OutStream is not None: from IPython.utils import io as ipyio + # On Windows in particular this is necessary, as the io.stdout stream # in IPython gets hooked up to some pyreadline magic to handle colors try: @@ -85,7 +88,8 @@ from pyreadyline.console import Console as PyreadlineConsole except ImportError: # Just define a dummy class - class PyreadlineConsole(object): pass + class PyreadlineConsole(object): + pass # import widgets and display try: @@ -99,15 +103,16 @@ class PyreadlineConsole(object): pass # ignore widgets deprecated message def _ignore_deprecated_warnings(): - warnings.filterwarnings('ignore', category=DeprecationWarning, module=r'.*widget.*') + warnings.filterwarnings( + "ignore", category=DeprecationWarning, module=r".*widget.*" + ) if get_ipython and get_ipython(): - get_ipython().events.register('pre_execute', _ignore_deprecated_warnings) + get_ipython().events.register("pre_execute", _ignore_deprecated_warnings) from .compat import six - -_DEFAULT_ENCODING = 'utf-8' +_DEFAULT_ENCODING = "utf-8" _initial_defencoding = None @@ -117,6 +122,7 @@ def detect_console_encoding(): slighly modified from the way IPython handles the same issue. """ import locale + global _initial_defencoding encoding = None @@ -126,14 +132,14 @@ def detect_console_encoding(): pass # try again for something better - if not encoding or 'ascii' in encoding.lower(): + if not encoding or "ascii" in encoding.lower(): try: encoding = locale.getpreferredencoding() except Exception: pass # when all else fails. this will usually be "ascii" - if not encoding or 'ascii' in encoding.lower(): + if not encoding or "ascii" in encoding.lower(): encoding = sys.getdefaultencoding() # GH3360, save the reported defencoding at import time @@ -152,30 +158,35 @@ def get_terminal_size(): IPython zmq frontends, or IDLE do not run in a terminal, """ import platform + current_os = platform.system() tuple_xy = None - if current_os == 'Windows': + if current_os == "Windows": tuple_xy = _get_terminal_size_windows() if tuple_xy is None: tuple_xy = _get_terminal_size_tput() # needed for window's python in cygwin's xterm! 
- if current_os == 'Linux' or \ - current_os == 'Darwin' or \ - current_os.startswith('CYGWIN'): + if ( + current_os == "Linux" + or current_os == "Darwin" + or current_os.startswith("CYGWIN") + ): tuple_xy = _get_terminal_size_linux() if tuple_xy is None: - tuple_xy = (80, 25) # default value + tuple_xy = (80, 25) # default value return tuple_xy def in_interactive_session(): - """ check if we're running in an interactive shell + """check if we're running in an interactive shell returns True if running under python/ipython interactive shell """ + def check_main(): import __main__ as main - return not hasattr(main, '__file__') + + return not hasattr(main, "__file__") try: return __IPYTHON__ or check_main() # noqa: F821 @@ -189,7 +200,7 @@ def in_ipython_frontend(): """ try: ip = get_ipython() - return 'zmq' in str(type(ip)).lower() + return "zmq" in str(type(ip)).lower() except: pass @@ -204,26 +215,28 @@ def get_notebook_backend(): if _backend_name is not None: return _backend_name - if 'VSCODE_PID' in os.environ: - _backend_name = 'VSCode' + if "VSCODE_PID" in os.environ: + _backend_name = "VSCode" return _backend_name # DSW always use jupyter lab for env_name in os.environ.keys(): - if env_name.lower().startswith('dsw_'): - _backend_name = 'DSW' + if env_name.lower().startswith("dsw_"): + _backend_name = "DSW" return _backend_name for arg in sys.argv: - if arg.endswith('.json') and os.path.exists(arg): + if arg.endswith(".json") and os.path.exists(arg): ipy_json_path = os.path.dirname(arg) # jupyter lab will generate a jpserver-{pid}.json, while # jupyter notebook will generate a nbserver-{pid}.json instead - lab_cfg_path = os.path.join(ipy_json_path, 'jpserver-%d.json' % os.getppid()) + lab_cfg_path = os.path.join( + ipy_json_path, "jpserver-%d.json" % os.getppid() + ) if os.path.exists(lab_cfg_path): - _backend_name = 'JupyterLab' + _backend_name = "JupyterLab" return _backend_name - _backend_name = 'JupyterNotebook' + _backend_name = "JupyterNotebook" return _backend_name @@ -234,11 +247,11 @@ def is_widgets_available(): return False # todo when widget for lab or vscode ready, change this - if get_notebook_backend() in ('DSW', 'JupyterLab', 'VSCode'): + if get_notebook_backend() in ("DSW", "JupyterLab", "VSCode"): return False - if hasattr(widgets.Widget, '_version_validated'): - return bool(getattr(widgets.Widget, '_version_validated', None)) + if hasattr(widgets.Widget, "_version_validated"): + return bool(getattr(widgets.Widget, "_version_validated", None)) else: return True @@ -251,11 +264,10 @@ def in_qtconsole(): """ try: ip = get_ipython() - front_end = ( - ip.config.get('KernelApp', {}).get('parent_appname', "") or - ip.config.get('IPKernelApp', {}).get('parent_appname', "") - ) - if 'qtconsole' in front_end.lower(): + front_end = ip.config.get("KernelApp", {}).get( + "parent_appname", "" + ) or ip.config.get("IPKernelApp", {}).get("parent_appname", "") + if "qtconsole" in front_end.lower(): return True except: return False @@ -289,8 +301,9 @@ def get_console_size(): # match default for width,height in config_init try: from pandas.core.config import get_default_val - terminal_width = get_default_val('display.width') - terminal_height = get_default_val('display.max_rows') + + terminal_width = get_default_val("display.width") + terminal_height = get_default_val("display.max_rows") except ImportError: terminal_width, terminal_height = None, None else: @@ -308,7 +321,7 @@ def get_console_size(): def _get_terminal_size_windows(): res = None try: - from ctypes import windll, 
create_string_buffer + from ctypes import create_string_buffer, windll # stdin handle is -10 # stdout handle is -11 @@ -321,8 +334,20 @@ def _get_terminal_size_windows(): return None if res: import struct - (bufx, bufy, curx, cury, wattr, left, top, right, bottom, maxx, - maxy) = struct.unpack("hhhhHhhhhhh", csbi.raw) + + ( + bufx, + bufy, + curx, + cury, + wattr, + left, + top, + right, + bottom, + maxx, + maxy, + ) = struct.unpack("hhhhHhhhhhh", csbi.raw) sizex = right - left + 1 sizey = bottom - top + 1 return sizex, sizey @@ -336,14 +361,15 @@ def _get_terminal_size_tput(): # -height-of-a-terminal-window try: import subprocess - proc = subprocess.Popen(["tput", "cols"], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE) + + proc = subprocess.Popen( + ["tput", "cols"], stdin=subprocess.PIPE, stdout=subprocess.PIPE + ) output = proc.communicate(input=None) cols = int(output[0]) - proc = subprocess.Popen(["tput", "lines"], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE) + proc = subprocess.Popen( + ["tput", "lines"], stdin=subprocess.PIPE, stdout=subprocess.PIPE + ) output = proc.communicate(input=None) rows = int(output[0]) return (cols, rows) @@ -355,13 +381,14 @@ def _get_terminal_size_linux(): def ioctl_GWINSZ(fd): try: import fcntl - import termios import struct + import termios - cr = struct.unpack('hh', fcntl.ioctl(fd, termios.TIOCGWINSZ, '1234')) + cr = struct.unpack("hh", fcntl.ioctl(fd, termios.TIOCGWINSZ, "1234")) except: return None return cr + cr = ioctl_GWINSZ(0) or ioctl_GWINSZ(1) or ioctl_GWINSZ(2) if not cr: try: @@ -373,7 +400,8 @@ def ioctl_GWINSZ(fd): if not cr or cr == (0, 0): try: from os import environ as env - cr = (env['LINES'], env['COLUMNS']) + + cr = (env["LINES"], env["COLUMNS"]) except: return None return int(cr[1]), int(cr[0]) @@ -390,9 +418,9 @@ def _get_stdout(stderr=False): """ if stderr: - stream = 'stderr' + stream = "stderr" else: - stream = 'stdout' + stream = "stdout" sys_stream = getattr(sys, stream) @@ -424,16 +452,19 @@ def isatty(file): except AttributeError: # pragma: no cover thread_name = threading.current_thread().getName() - if proc_name != 'MainProcess' or thread_name != 'MainThread': + if proc_name != "MainProcess" or thread_name != "MainThread": return False - if hasattr(file, 'isatty'): + if hasattr(file, "isatty"): return file.isatty() - elif (OutStream is not None and - isinstance(file, (OutStream, IPythonIOStream)) and - ((hasattr(file, 'name') and file.name == 'stdout') or - (hasattr(file, 'stream') and - isinstance(file.stream, PyreadlineConsole)))): + elif ( + OutStream is not None + and isinstance(file, (OutStream, IPythonIOStream)) + and ( + (hasattr(file, "name") and file.name == "stdout") + or (hasattr(file, "stream") and isinstance(file.stream, PyreadlineConsole)) + ) + ): # File is an IPython OutStream or IOStream and # File name is 'stdout' or # File wraps a Console @@ -467,8 +498,7 @@ def _terminal_size(file=None): except: try: # see if POSIX standard variables will work - return (int(os.environ.get('LINES')), - int(os.environ.get('COLUMNS'))) + return (int(os.environ.get("LINES")), int(os.environ.get("COLUMNS"))) except TypeError: # fall back on configuration variables, or if not # set, (25, 80) @@ -502,30 +532,31 @@ def _color_text(text, color): lightmagenta, lightcyan, white, or '' (the empty string). 
""" color_mapping = { - 'black': '0;30', - 'red': '0;31', - 'green': '0;32', - 'brown': '0;33', - 'blue': '0;34', - 'magenta': '0;35', - 'cyan': '0;36', - 'lightgrey': '0;37', - 'default': '0;39', - 'darkgrey': '1;30', - 'lightred': '1;31', - 'lightgreen': '1;32', - 'yellow': '1;33', - 'lightblue': '1;34', - 'lightmagenta': '1;35', - 'lightcyan': '1;36', - 'white': '1;37'} - - if sys.platform == 'win32' and OutStream is None: + "black": "0;30", + "red": "0;31", + "green": "0;32", + "brown": "0;33", + "blue": "0;34", + "magenta": "0;35", + "cyan": "0;36", + "lightgrey": "0;37", + "default": "0;39", + "darkgrey": "1;30", + "lightred": "1;31", + "lightgreen": "1;32", + "yellow": "1;33", + "lightblue": "1;34", + "lightmagenta": "1;35", + "lightcyan": "1;36", + "white": "1;37", + } + + if sys.platform == "win32" and OutStream is None: # On Windows do not colorize text unless in IPython return text - color_code = color_mapping.get(color, '0;39') - return '\033[{0}m{1}\033[0m'.format(color_code, text) + color_code = color_mapping.get(color, "0;39") + return "\033[{0}m{1}\033[0m".format(color_code, text) def _decode_preferred_encoding(s): @@ -543,7 +574,7 @@ def _decode_preferred_encoding(s): enc = _DEFAULT_ENCODING return s.decode(enc) except UnicodeDecodeError: - return s.decode('latin-1') + return s.decode("latin-1") def _write_with_fallback(s, write, fileobj): @@ -581,7 +612,7 @@ def _write_with_fallback(s, write, fileobj): write(s) return write except UnicodeEncodeError: - Writer = codecs.getwriter('latin-1') + Writer = codecs.getwriter("latin-1") f = Writer(fileobj) write = f.write @@ -622,16 +653,16 @@ def color_print(*args, **kwargs): """ from .config import options - file = kwargs.get('file', _get_stdout()) + file = kwargs.get("file", _get_stdout()) - end = kwargs.get('end', '\n') + end = kwargs.get("end", "\n") write = file.write if isatty(file) and options.console.use_color: for i in range(0, len(args), 2): msg = args[i] if i + 1 == len(args): - color = '' + color = "" else: color = args[i + 1] @@ -686,26 +717,26 @@ def human_time(seconds): that is always exactly 6 characters. """ units = [ - ('y', 60 * 60 * 24 * 7 * 52), - ('w', 60 * 60 * 24 * 7), - ('d', 60 * 60 * 24), - ('h', 60 * 60), - ('m', 60), - ('s', 1), + ("y", 60 * 60 * 24 * 7 * 52), + ("w", 60 * 60 * 24 * 7), + ("d", 60 * 60 * 24), + ("h", 60 * 60), + ("m", 60), + ("s", 1), ] seconds = int(seconds) if seconds < 60: - return ' {0:2d}s'.format(seconds) + return " {0:2d}s".format(seconds) for i in range(len(units) - 1): unit1, limit1 = units[i] unit2, limit2 = units[i + 1] if seconds >= limit1: - return '{0:2d}{1}{2:2d}{3}'.format( - seconds // limit1, unit1, - (seconds % limit1) // limit2, unit2) - return ' ~inf' + return "{0:2d}{1}{2:2d}{3}".format( + seconds // limit1, unit1, (seconds % limit1) // limit2, unit2 + ) + return " ~inf" def human_file_size(size): @@ -730,23 +761,23 @@ def human_file_size(size): size : str A human-friendly representation of the size of the file """ - suffixes = ' kMGTPEZY' + suffixes = " kMGTPEZY" if size == 0: num_scale = 0 else: num_scale = int(math.floor(math.log(size) / math.log(1000))) num_scale = max(num_scale, 0) if num_scale >= len(suffixes): - suffix = '?' + suffix = "?" else: suffix = suffixes[num_scale] num_scale = int(math.pow(1000, num_scale)) value = float(size) / num_scale str_value = str(value) - if suffix == ' ': - if '.' in str_value: - str_value = str_value[:str_value.index('.')] - elif str_value[2] == '.': + if suffix == " ": + if "." 
in str_value: + str_value = str_value[: str_value.index(".")] + elif str_value[2] == ".": str_value = str_value[:2] else: str_value = str_value[:3] @@ -765,8 +796,8 @@ def create_progress_widget(): from .ui.common import build_trait class TransientProgressBar(widget_cls): - _view_name = build_trait(Unicode, 'TransientProgressView', sync=True) - _view_module = build_trait(Unicode, 'pyodps/progress', sync=True) + _view_name = build_trait(Unicode, "TransientProgressView", sync=True) + _view_module = build_trait(Unicode, "pyodps/progress", sync=True) return TransientProgressBar() @@ -786,6 +817,7 @@ class ProgressBar(six.Iterator): for item in ProgressBar(items): item.process() """ + def __init__(self, total_or_items, ipython_widget=False, file=None): """ Parameters @@ -844,11 +876,9 @@ def __init__(self, total_or_items, ipython_widget=False, file=None): self._human_total = human_file_size(self._total) self._ipython_widget = ipython_widget - self._signal_set = False if not ipython_widget: - self._should_handle_resize = ( - _CAN_RESIZE_TERMINAL and self._file.isatty()) + self._should_handle_resize = _CAN_RESIZE_TERMINAL and self._file.isatty() self._handle_resize() if self._should_handle_resize: signal.signal(signal.SIGWINCH, self._handle_resize) @@ -867,7 +897,7 @@ def __exit__(self, exc_type, exc_value, traceback): if not self._silent: if exc_type is None: self.update(self._total) - self._file.write('\n') + self._file.write("\n") self._file.flush() if self._signal_set: signal.signal(signal.SIGWINCH, signal.SIG_DFL) @@ -928,26 +958,24 @@ def _update_console(self, value=None): bar_fill = int(self._bar_length) else: bar_fill = int(float(self._bar_length) * frac) - write('\r|') - color_print('=' * bar_fill, 'blue', file=file, end='') + write("\r|") + color_print("=" * bar_fill, "blue", file=file, end="") if bar_fill < self._bar_length: - color_print('>', 'green', file=file, end='') - write('-' * (self._bar_length - bar_fill - 1)) - write('|') + color_print(">", "green", file=file, end="") + write("-" * (self._bar_length - bar_fill - 1)) + write("|") if value >= self._total: t = time.time() - self._start_time - prefix = ' ' + prefix = " " elif value <= 0: t = None - prefix = '' + prefix = "" else: t = ((time.time() - self._start_time) * (1.0 - frac)) / frac - prefix = ' ETA ' - write(' {0:>4s}/{1:>4s}'.format( - human_file_size(value), - self._human_total)) - write(' ({0:>6s}%)'.format('{0:.2f}'.format(frac * 100.0))) + prefix = " ETA " + write(" {0:>4s}/{1:>4s}".format(human_file_size(value), self._human_total)) + write(" ({0:>6s}%)".format("{0:.2f}".format(frac * 100.0))) write(prefix) if t is not None: write(human_time(t)) @@ -963,16 +991,16 @@ def _update_ipython_widget(self, value=None): # Create and display an empty progress bar widget, # if none exists. 
- if not hasattr(self, '_widget'): + if not hasattr(self, "_widget"): self._widget = create_progress_widget() if in_ipython_frontend() and is_widgets_available(): display(self._widget) self._widget.value = 0 # Calculate percent completion, and update progress bar - percent = (float(value)/self._total) * 100.0 + percent = (float(value) / self._total) * 100.0 self._widget.value = percent - self._widget.description =' ({0:>6s}%)'.format('{0:.2f}'.format(percent)) + self._widget.description = " ({0:>6s}%)".format("{0:.2f}".format(percent)) def _silent_update(self, value=None): pass @@ -1025,8 +1053,7 @@ def work(i): bar.update(i) else: p = multiprocessing.Pool() - for i, result in enumerate( - p.imap_unordered(function, items, steps)): + for i, result in enumerate(p.imap_unordered(function, items, steps)): bar.update(i) results.append(result) p.close() diff --git a/odps/core.py b/odps/core.py index ba8116ef..ea9dea20 100644 --- a/odps/core.py +++ b/odps/core.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,45 +12,45 @@ # See the License for the specific language governing permissions and # limitations under the License. -import copy import functools -import hashlib +import inspect import json # noqa: F401 import os -import random import re -import time import warnings import weakref -from . import models, accounts, errors, utils -from .compat import six, Iterable, urlparse +from . import accounts, errors, models, utils +from .compat import six, urlparse from .config import options -from .errors import NoSuchObject, ODPSError from .rest import RestClient from .tempobj import clean_stored_objects - -DEFAULT_ENDPOINT = 'http://service.odps.aliyun.com/api' -DEFAULT_REGION_NAME = 'cn' -LOGVIEW_HOST_DEFAULT = 'http://logview.aliyun.com' +DEFAULT_ENDPOINT = "http://service.odps.aliyun.com/api" +DEFAULT_REGION_NAME = "cn" +LOGVIEW_HOST_DEFAULT = "http://logview.aliyun.com" _ALTER_TABLE_REGEX = re.compile( - r'^\s*(drop|alter)\s+table\s*(|if\s+exists)\s+(?P[^\s;]+)', re.I -) -_MERGE_SMALL_FILES_REGEX = re.compile( - r'^alter\s+table\s+(?P[^\s;]+)\s+' - r'(|partition\s*\((?P[^)]+)\s*\))\s*' - r'(merge\s+smallfiles|compact\s+(?P[^\s;]+))[\s;]*$', - re.I, + r"^\s*(drop|alter)\s+table\s*(|if\s+exists)\s+(?P[^\s;]+)", re.I ) _ENDPOINT_HOST_WITH_REGION_REGEX = re.compile( - r'service\.([^\.]+)\.(odps|maxcompute)\.aliyun(|-inc)\.com', re.I + r"service\.([^\.]+)\.(odps|maxcompute)\.aliyun(|-inc)\.com", re.I ) _logview_host_cache = dict() +def _wrap_model_func(func): + @six.wraps(func) + def wrapped(self, *args, **kw): + return func(self, *args, **kw) + + # keep method signature to avoid doc issue + if hasattr(inspect, "signature"): + wrapped.__signature__ = inspect.signature(func) + return wrapped + + @utils.attach_internal class ODPS(object): """ @@ -91,10 +91,20 @@ class ODPS(object): >>> >>> odps.delete_table('test_table') """ + def __init__( - self, access_id=None, secret_access_key=None, project=None, - endpoint=None, schema=None, app_account=None, logview_host=None, - tunnel_endpoint=None, region_name=None, **kw + self, + access_id=None, + secret_access_key=None, + project=None, + endpoint=None, + schema=None, + app_account=None, + logview_host=None, + tunnel_endpoint=None, + region_name=None, + quota_name=None, + **kw ): # avoid polluted copy sources :( access_id = utils.strip_if_str(access_id) 
@@ -105,27 +115,43 @@ def __init__( logview_host = utils.strip_if_str(logview_host) tunnel_endpoint = utils.strip_if_str(tunnel_endpoint) region_name = utils.strip_if_str(region_name) + quota_name = utils.strip_if_str(quota_name) if isinstance(access_id, accounts.BaseAccount): - assert secret_access_key is None, "Cannot supply secret_access_key with an account" + assert ( + secret_access_key is None + ), "Cannot supply secret_access_key with an account" kw["account"], access_id = access_id, None self._init( - access_id=access_id, secret_access_key=secret_access_key, - project=project, endpoint=endpoint, schema=schema, - app_account=app_account, logview_host=logview_host, - tunnel_endpoint=tunnel_endpoint, region_name=region_name, **kw + access_id=access_id, + secret_access_key=secret_access_key, + project=project, + endpoint=endpoint, + schema=schema, + app_account=app_account, + logview_host=logview_host, + tunnel_endpoint=tunnel_endpoint, + region_name=region_name, + quota_name=quota_name, + **kw ) clean_stored_objects(self) def _init( - self, access_id=None, secret_access_key=None, project=None, - endpoint=None, schema=None, region_name=None, **kw + self, + access_id=None, + secret_access_key=None, + project=None, + endpoint=None, + schema=None, + region_name=None, + **kw ): self._property_update_callbacks = set() - account = kw.pop('account', None) - self.app_account = kw.pop('app_account', None) + account = kw.pop("account", None) + self.app_account = kw.pop("app_account", None) if account is None: if access_id is not None: @@ -135,7 +161,7 @@ def _init( else: self.account = accounts.from_environments() if self.account is None: - raise TypeError( + raise TypeError( "`access_id` and `secret_access_key` should be provided." ) else: @@ -143,30 +169,36 @@ def _init( self.endpoint = ( endpoint or options.endpoint - or os.getenv('ODPS_ENDPOINT') + or os.getenv("ODPS_ENDPOINT") or DEFAULT_ENDPOINT ) self.project = ( - project or options.default_project or os.getenv('ODPS_PROJECT_NAME') + project or options.default_project or os.getenv("ODPS_PROJECT_NAME") ) self.region_name = region_name or self._get_region_from_endpoint(self.endpoint) + self._quota_name = kw.pop("quota_name", None) self._schema = schema self.rest = RestClient( - self.account, self.endpoint, project, schema, - app_account=self.app_account, proxy=options.api_proxy, - region_name=self.region_name, tag="ODPS", + self.account, + self.endpoint, + project, + schema, + app_account=self.app_account, + proxy=options.api_proxy, + region_name=self.region_name, + tag="ODPS", ) self._tunnel_endpoint = ( - kw.pop('tunnel_endpoint', None) + kw.pop("tunnel_endpoint", None) or options.tunnel.endpoint - or os.getenv('ODPS_TUNNEL_ENDPOINT') + or os.getenv("ODPS_TUNNEL_ENDPOINT") ) self._logview_host = ( kw.pop("logview_host", None) or options.logview_host - or os.getenv('ODPS_LOGVIEW_HOST') + or os.getenv("ODPS_LOGVIEW_HOST") or self.get_logview_host() ) @@ -176,6 +208,10 @@ def _init( if project: self._project = self.get_project() + self._quotas = models.Quotas(client=self.rest) + if self._quota_name: + self._quota = self.get_quota() + self._seahawks_url = kw.pop("seahawks_url", None) if self._seahawks_url: options.seahawks_url = self._seahawks_url @@ -209,31 +245,39 @@ def __getstate__(self): tunnel_endpoint=self._tunnel_endpoint, logview_host=self._logview_host, schema=self.schema, - seahawks_url=self._seahawks_url + seahawks_url=self._seahawks_url, ) if isinstance(self.account, accounts.AliyunAccount): - 
params.update(dict(access_id=self.account.access_id, - secret_access_key=self.account.secret_access_key)) + params.update( + dict( + access_id=self.account.access_id, + secret_access_key=self.account.secret_access_key, + ) + ) return params def __setstate__(self, state): - if 'secret_access_key' in state: - if os.environ.get('ODPS_ENDPOINT', None) is not None: - state['endpoint'] = os.environ['ODPS_ENDPOINT'] + if "secret_access_key" in state: + if os.environ.get("ODPS_ENDPOINT", None) is not None: + state["endpoint"] = os.environ["ODPS_ENDPOINT"] self._init(**state) return bearer_token_account = accounts.BearerTokenAccount.from_environments() if bearer_token_account is not None: - state['project'] = os.environ.get('ODPS_PROJECT_NAME') - state['endpoint'] = os.environ.get('ODPS_RUNTIME_ENDPOINT') or os.environ['ODPS_ENDPOINT'] - state.pop('access_id', None) - state.pop('secret_access_key', None) + state["project"] = os.environ.get("ODPS_PROJECT_NAME") + state["endpoint"] = ( + os.environ.get("ODPS_RUNTIME_ENDPOINT") or os.environ["ODPS_ENDPOINT"] + ) + state.pop("access_id", None) + state.pop("secret_access_key", None) self._init(None, None, account=bearer_token_account, **state) else: self._init(**state) - def as_account(self, access_id=None, secret_access_key=None, account=None, app_account=None): + def as_account( + self, access_id=None, secret_access_key=None, account=None, app_account=None + ): """ Creates a new ODPS entry object with a new account information @@ -265,26 +309,34 @@ def __mars_tokenize__(self): return self.__getstate__() @classmethod - def _from_account(cls, account, project, endpoint=DEFAULT_ENDPOINT, - tunnel_endpoint=None, **kwargs): - return cls(None, None, project, endpoint=endpoint, - tunnel_endpoint=tunnel_endpoint, account=account, **kwargs) - - @property - def default_tenant(self): - return self._default_tenant + def _from_account( + cls, account, project, endpoint=DEFAULT_ENDPOINT, tunnel_endpoint=None, **kwargs + ): + return cls( + None, + None, + project, + endpoint=endpoint, + tunnel_endpoint=tunnel_endpoint, + account=account, + **kwargs + ) def is_schema_namespace_enabled(self, settings=None): settings = settings or {} setting = str( settings.get("odps.namespace.schema") or (options.sql.settings or {}).get("odps.namespace.schema") - or ("true" if options.always_enable_schema else None) + or ("true" if options.enable_schema else None) or self.default_tenant.get_parameter("odps.namespace.schema") or "false" ) return setting.lower() == "true" + @property + def default_tenant(self): + return self._default_tenant + @property def projects(self): return self._projects @@ -303,6 +355,20 @@ def schema(self, value): for cb in self._property_update_callbacks: cb(self) + @property + def quota_name(self): + return self._quota_name or options.quota_name or os.getenv("QUOTA_NAME") + + @quota_name.setter + def quota_name(self, value): + self._quota_name = value + for cb in self._property_update_callbacks: + cb(self) + + @property + def quotas(self): + return self._quotas + @property def tunnel_endpoint(self): """ @@ -317,8 +383,14 @@ def tunnel_endpoint(self, value): cb(self) def list_projects( - self, owner=None, user=None, group=None, prefix=None, max_items=None, - region_id=None, tenant_id=None + self, + owner=None, + user=None, + group=None, + prefix=None, + max_items=None, + region_id=None, + tenant_id=None, ): """ List projects. 
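# Illustration of the quota_name resolution order implemented by the property above:
# a value set on the entry wins, then options.quota_name, then the QUOTA_NAME
# environment variable. Credentials and quota names below are placeholders.
import os

from odps import ODPS, options

os.environ["QUOTA_NAME"] = "quota_from_env"
o = ODPS("<access_id>", "<secret_access_key>", project="my_project")
print(o.quota_name)          # quota_from_env - falls back to the environment variable

options.quota_name = "quota_from_options"
print(o.quota_name)          # quota_from_options - global option beats the env var

o.quota_name = "quota_on_entry"
print(o.quota_name)          # quota_on_entry - explicit value on the entry wins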
@@ -332,20 +404,56 @@ def list_projects( :rtype: generator """ return self.projects.iterate( - owner=owner, user=user, group=group, max_items=max_items, name=prefix, - region_id=region_id, tenant_id=tenant_id + owner=owner, + user=user, + group=group, + max_items=max_items, + name=prefix, + region_id=region_id, + tenant_id=tenant_id, ) @property def logview_host(self): return self._logview_host + def get_quota(self, name=None): + """ + Get quota by name + + :param str name: quota name, if not provided, will be the name in ODPS entry + """ + if name is None: + name = name or self.quota_name + if name is None: + raise TypeError("Need to provide quota name") + return self._quotas[name] + + def exist_quota(self, name): + """ + If quota name which provided exists or not. + + :param name: quota name + :return: True if exists or False + :rtype: bool + """ + return name in self._quotas + + def list_quotas(self, region_id=None): + """ + List quotas by region id + + :param str region_id: Region ID + :return: quotas + """ + return self._quotas.iterate(region_id=region_id) + def get_project(self, name=None, default_schema=None): """ Get project by given name. - :param name: project name, if not provided, will be the default project - :param default_schema: default schema name, if not provided, will be + :param str name: project name, if not provided, will be the default project + :param str default_schema: default schema name, if not provided, will be the schema specified in ODPS object :return: the right project :rtype: :class:`odps.models.Project` @@ -363,6 +471,7 @@ def get_project(self, name=None, default_schema=None): proj._logview_host = self._logview_host # use _schema to avoid requesting for tenant options proj._default_schema = default_schema or self._schema + proj._quota_name = self._quota_name proj_ref = weakref.ref(proj) @@ -371,13 +480,18 @@ def project_update_callback(odps, update_schema=True): if proj_obj: if update_schema: proj_obj._default_schema = odps.schema + proj_obj._quota_name = odps._quota_name proj_obj._tunnel_endpoint = odps.tunnel_endpoint else: - self._property_update_callbacks.difference_update([project_update_callback]) + self._property_update_callbacks.difference_update( + [project_update_callback] + ) # we need to update default schema value on the project self._property_update_callbacks.add( - functools.partial(project_update_callback, update_schema=default_schema is None) + functools.partial( + project_update_callback, update_schema=default_schema is None + ) ) return proj @@ -462,7 +576,7 @@ def _get_project_or_schema(self, project=None, schema=None): return self.get_project(project) def _split_object_dots(self, name): - parts = [x.strip() for x in name.split('.')] + parts = [x.strip() for x in name.split(".")] if len(parts) == 1: project, schema, name = None, None, parts[0] elif len(parts) == 2: @@ -479,7 +593,13 @@ def _split_object_dots(self, name): return project, schema, name def list_tables( - self, project=None, prefix=None, owner=None, schema=None, type=None, extended=False + self, + project=None, + prefix=None, + owner=None, + schema=None, + type=None, + extended=False, ): """ List all tables of a project. 
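# Hypothetical usage of the quota helpers added above; the quota name and region id
# are placeholders.
from odps import ODPS

o = ODPS(
    "<access_id>", "<secret_access_key>", project="my_project", quota_name="my_quota"
)

if o.exist_quota("my_quota"):
    quota = o.get_quota()    # name omitted: falls back to the entry's quota_name
    print(quota)

for quota in o.list_quotas(region_id="<region_id>"):
    print(quota)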
@@ -496,7 +616,9 @@ def list_tables( :rtype: generator """ parent = self._get_project_or_schema(project, schema) - return parent.tables.iterate(name=prefix, owner=owner, type=type, extended=extended) + return parent.tables.iterate( + name=prefix, owner=owner, type=type, extended=extended + ) def get_table(self, name, project=None, schema=None): """ @@ -512,7 +634,7 @@ def get_table(self, name, project=None, schema=None): .. seealso:: :class:`odps.models.Table` """ - if isinstance(name, six.string_types) and '.' in name: + if isinstance(name, six.string_types) and "." in name: project, schema, name = self._split_object_dots(name) parent = self._get_project_or_schema(project, schema) @@ -530,7 +652,7 @@ def exist_table(self, name, project=None, schema=None): :rtype: bool """ - if isinstance(name, six.string_types) and '.' in name: + if isinstance(name, six.string_types) and "." in name: project, schema, name = self._split_object_dots(name) parent = self._get_project_or_schema(project, schema) @@ -538,10 +660,22 @@ def exist_table(self, name, project=None, schema=None): @utils.with_wait_argument def create_table( - self, name, table_schema=None, project=None, schema=None, comment=None, - if_not_exists=False, lifecycle=None, shard_num=None, hub_lifecycle=None, - hints=None, transactional=False, primary_key=None, storage_tier=None, - async_=False, **kw + self, + name, + table_schema=None, + project=None, + schema=None, + comment=None, + if_not_exists=False, + lifecycle=None, + shard_num=None, + hub_lifecycle=None, + hints=None, + transactional=False, + primary_key=None, + storage_tier=None, + async_=False, + **kw ): """ Create a table by given schema and other optional parameters. @@ -572,7 +706,7 @@ def create_table( if ( isinstance(schema, OdpsSchema) or isinstance(schema, tuple) - or (isinstance(schema, six.string_types) and ' ' in schema) + or (isinstance(schema, six.string_types) and " " in schema) ): table_schema, schema = schema, None warnings.warn( @@ -580,14 +714,14 @@ def create_table( "the original parameter now represents schema name. Please " "change your code.", DeprecationWarning, - stacklevel=2 + stacklevel=2, ) utils.add_survey_call("ODPS.create_table(schema='schema_name')") if table_schema is None: raise TypeError("`table_schema` argument not filled") - if isinstance(name, six.string_types) and '.' in name: + if isinstance(name, six.string_types) and "." in name: project, schema, name = self._split_object_dots(name) if lifecycle is None and options.lifecycle is not None: @@ -595,14 +729,43 @@ def create_table( parent = self._get_project_or_schema(project, schema) return parent.tables.create( - name, table_schema, comment=comment, if_not_exists=if_not_exists, - lifecycle=lifecycle, shard_num=shard_num, hub_lifecycle=hub_lifecycle, - hints=hints, transactional=transactional, primary_key=primary_key, - storage_tier=storage_tier, async_=async_, **kw + name, + table_schema, + comment=comment, + if_not_exists=if_not_exists, + lifecycle=lifecycle, + shard_num=shard_num, + hub_lifecycle=hub_lifecycle, + hints=hints, + transactional=transactional, + primary_key=primary_key, + storage_tier=storage_tier, + async_=async_, + **kw + ) + + def _delete_table( + self, + name, + project=None, + if_exists=False, + schema=None, + hints=None, + async_=False, + table_type=None, + ): + if isinstance(name, six.string_types) and "." 
in name: + project, schema, name = self._split_object_dots(name) + + parent = self._get_project_or_schema(project, schema) + return parent.tables.delete( + name, if_exists=if_exists, hints=hints, async_=async_, table_type=table_type ) @utils.with_wait_argument - def delete_table(self, name, project=None, if_exists=False, schema=None, hints=None, async_=False): + def delete_table( + self, name, project=None, if_exists=False, schema=None, hints=None, async_=False + ): """ Delete the table with given name @@ -614,131 +777,69 @@ def delete_table(self, name, project=None, if_exists=False, schema=None, hints=N :param bool async_: if True, will run asynchronously :return: None if not async else odps instance """ + return self._delete_table( + name, + project=project, + if_exists=if_exists, + schema=schema, + hints=hints, + async_=async_, + table_type="managed_table", + ) - if isinstance(name, six.string_types) and '.' in name: - project, schema, name = self._split_object_dots(name) - - parent = self._get_project_or_schema(project, schema) - return parent.tables.delete(name, if_exists=if_exists, hints=hints, async_=async_) - - def read_table(self, name, limit=None, start=0, step=None, - project=None, schema=None, partition=None, **kw): + @utils.with_wait_argument + def delete_view( + self, name, project=None, if_exists=False, schema=None, hints=None, async_=False + ): """ - Read table's records. + Delete the view with given name - :param name: table or table name - :type name: :class:`odps.models.table.Table` or str - :param limit: the records' size, if None will read all records from the table - :param start: the record where read starts with - :param step: default as 1 + :param name: view name :param project: project name, if not provided, will be the default project - :param schema: schema name, if not provided, will be the default schema - :type schema: str - :param partition: the partition of this table to read - :param columns: the columns' names which are the parts of table's columns - :type columns: list - :param compress: if True, the data will be compressed during downloading - :type compress: bool - :param compress_option: the compression algorithm, level and strategy - :type compress_option: :class:`odps.tunnel.CompressOption` - :param endpoint: tunnel service URL - :param reopen: reading the table will reuse the session which opened last time, - if set to True will open a new download session, default as False - :return: records - :rtype: generator - - :Example: - - >>> for record in odps.read_table('test_table', 100): - >>> # deal with such 100 records - >>> for record in odps.read_table('test_table', partition='pt=test', start=100, limit=100): - >>> # read the `pt=test` partition, skip 100 records and read 100 records - - .. seealso:: :class:`odps.models.Record` + :param bool if_exists: will not raise errors when the view does not exist, default False + :param str schema: schema name, if not provided, will be the default schema + :param dict hints: hints for the task + :param bool async_: if True, will run asynchronously + :return: None if not async else odps instance """ + return self._delete_table( + name, + project=project, + if_exists=if_exists, + schema=schema, + hints=hints, + async_=async_, + table_type="virtual_view", + ) - if isinstance(name, six.string_types) and '.' 
in name: - project, schema, name = self._split_object_dots(name) - - if not isinstance(name, six.string_types): - if name.get_schema(): - schema = name.get_schema().name - project, name = name.project.name, name.name - - parent = self._get_project_or_schema(project, schema) - table = parent.tables[name] - - compress = kw.pop('compress', False) - columns = kw.pop('columns', None) - - with table.open_reader(partition=partition, **kw) as reader: - for record in reader.read(start, limit, step=step, - compress=compress, columns=columns): - yield record - - def write_table(self, name, *block_records, **kw): + @utils.with_wait_argument + def delete_materialized_view( + self, name, project=None, if_exists=False, schema=None, hints=None, async_=False + ): """ - Write records into given table. + Delete the materialized view with given name - :param name: table or table name - :type name: :class:`.models.table.Table` or str - :param block_records: if given records only, the block id will be 0 as default. + :param name: materialized view name :param project: project name, if not provided, will be the default project - :param schema: schema name, if not provided, will be the default schema - :type schema: str - :param partition: the partition of this table to write - :param bool overwrite: if True, will overwrite existing data - :param compress: if True, the data will be compressed during uploading - :type compress: bool - :param compress_option: the compression algorithm, level and strategy - :type compress_option: :class:`odps.tunnel.CompressOption` - :param endpoint: tunnel service URL - :param reopen: writing the table will reuse the session which opened last time, - if set to True will open a new upload session, default as False - :return: None - - :Example: - - >>> odps.write_table('test_table', records) # write to block 0 as default - >>> - >>> odps.write_table('test_table', 0, records) # write to block 0 explicitly - >>> - >>> odps.write_table('test_table', 0, records1, 1, records2) # write to multi-blocks - >>> - >>> odps.write_table('test_table', records, partition='pt=test') # write to certain partition - - .. seealso:: :class:`odps.models.Record` + :param bool if_exists: will not raise errors when the materialized view + does not exist, default False + :param str schema: schema name, if not provided, will be the default schema + :param dict hints: hints for the task + :param bool async_: if True, will run asynchronously + :return: None if not async else odps instance """ + return self._delete_table( + name, + project=project, + if_exists=if_exists, + schema=schema, + hints=hints, + async_=async_, + table_type="materialized_view", + ) - project = kw.pop('project', None) - schema = kw.pop('schema', None) - - if isinstance(name, six.string_types) and '.' 
in name: - project, schema, name = self._split_object_dots(name) - - if not isinstance(name, six.string_types): - if name.get_schema(): - schema = name.get_schema().name - project, name = name.project.name, name.name - - parent = self._get_project_or_schema(project, schema) - table = parent.tables[name] - partition = kw.pop('partition', None) - - if len(block_records) == 1 and isinstance(block_records[0], Iterable): - blocks = [0, ] - records_iterators = block_records - else: - blocks = block_records[::2] - records_iterators = block_records[1::2] - - if len(blocks) != len(records_iterators): - raise ValueError('Should invoke like ' - 'odps.write_table(block_id, records, block_id2, records2, ..., **kw)') - - with table.open_writer(partition=partition, blocks=blocks, **kw) as writer: - for block, records in zip(blocks, records_iterators): - writer.write(block, records) + read_table = _wrap_model_func(models.TableIOMethods.read_table) + write_table = _wrap_model_func(models.TableIOMethods.write_table) def list_resources(self, project=None, prefix=None, owner=None, schema=None): """ @@ -789,8 +890,16 @@ def exist_resource(self, name, project=None, schema=None): return name in parent.resources def open_resource( - self, name, project=None, mode='r+', encoding='utf-8', schema=None, - type="file", stream=False, comment=None, temp=False + self, + name, + project=None, + mode="r+", + encoding="utf-8", + schema=None, + type="file", + stream=False, + comment=None, + temp=False, ): """ Open a file resource as file-like object. @@ -846,9 +955,9 @@ def open_resource( return name.open(mode=mode) parent = self._get_project_or_schema(project, schema) - return parent.resources.get_typed(name, type=type, comment=comment, temp=temp).open( - mode=mode, encoding=encoding, stream=stream - ) + return parent.resources.get_typed( + name, type=type, comment=comment, temp=temp + ).open(mode=mode, encoding=encoding, stream=stream) def create_resource(self, name, type=None, project=None, schema=None, **kwargs): """ @@ -893,7 +1002,7 @@ def create_resource(self, name, type=None, project=None, schema=None, **kwargs): :class:`odps.models.TableResource` """ - type_ = kwargs.get('typo') or type + type_ = kwargs.get("typo") or type parent = self._get_project_or_schema(project, schema) return parent.resources.create(name=name, type=type_, **kwargs) @@ -994,8 +1103,16 @@ def delete_function(self, name, project=None, schema=None): parent = self._get_project_or_schema(project, schema) return parent.functions.delete(name) - def list_instances(self, project=None, start_time=None, end_time=None, - status=None, only_owner=None, quota_index=None, **kw): + def list_instances( + self, + project=None, + start_time=None, + end_time=None, + status=None, + only_owner=None, + quota_index=None, + **kw + ): """ List instances of a project by given optional conditions including start time, end time, status and if only the owner. 
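# read_table / write_table above are now thin wrappers around models.TableIOMethods,
# so the entry-level call style from earlier releases keeps working. A sketch with a
# placeholder table, assuming a (name string, id bigint) schema and that the pt=test
# partition may be created on the fly via create_partition.
from odps import ODPS

o = ODPS("<access_id>", "<secret_access_key>", project="my_project")

records = [["name1", 1], ["name2", 2]]
o.write_table("test_table", records, partition="pt=test", create_partition=True)

for record in o.read_table("test_table", partition="pt=test", limit=10):
    print(record[0], record[1])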
@@ -1013,20 +1130,25 @@ def list_instances(self, project=None, start_time=None, end_time=None, :return: instances :rtype: list """ - if 'from_time' in kw: - start_time = kw['from_time'] + if "from_time" in kw: + start_time = kw["from_time"] warnings.warn( - 'The keyword argument `from_time` has been replaced by `start_time`.', + "The keyword argument `from_time` has been replaced by `start_time`.", DeprecationWarning, ) project = self.get_project(name=project) return project.instances.iterate( - start_time=start_time, end_time=end_time, - status=status, only_owner=only_owner, quota_index=quota_index) + start_time=start_time, + end_time=end_time, + status=status, + only_owner=only_owner, + quota_index=quota_index, + ) - def list_instance_queueing_infos(self, project=None, status=None, only_owner=None, - quota_index=None): + def list_instance_queueing_infos( + self, project=None, status=None, only_owner=None, quota_index=None + ): """ List instance queueing information. @@ -1042,7 +1164,8 @@ def list_instance_queueing_infos(self, project=None, status=None, only_owner=Non project = self.get_project(name=project) return project.instance_queueing_infos.iterate( - status=status, only_owner=only_owner, quota_index=quota_index) + status=status, only_owner=only_owner, quota_index=quota_index + ) def get_instance(self, id_, project=None): """ @@ -1087,19 +1210,25 @@ def stop_instance(self, id_, project=None): stop_job = stop_instance # to keep compatible - def execute_sql(self, sql, project=None, priority=None, running_cluster=None, - hints=None, **kwargs): + def execute_sql( + self, + sql, + project=None, + priority=None, + running_cluster=None, + hints=None, + quota_name=None, + **kwargs + ): """ Run a given SQL statement and block until the SQL executed successfully. - :param sql: SQL statement - :type sql: str + :param str sql: SQL statement :param project: project name, if not provided, will be the default project - :param priority: instance priority, 9 as default - :type priority: int - :param running_cluster: cluster to run this instance - :param hints: settings for SQL, e.g. `odps.mapred.map.split.size` - :type hints: dict + :param int priority: instance priority, 9 as default + :param str running_cluster: cluster to run this instance + :param dict hints: settings for SQL, e.g. `odps.mapred.map.split.size` + :param str quota_name: name of quota to use for SQL job :return: instance :rtype: :class:`odps.models.Instance` @@ -1117,72 +1246,94 @@ def execute_sql(self, sql, project=None, priority=None, running_cluster=None, .. 
seealso:: :class:`odps.models.Instance` """ - async_ = kwargs.pop('async_', kwargs.pop('async', False)) + async_ = kwargs.pop("async_", kwargs.pop("async", False)) inst = self.run_sql( - sql, project=project, priority=priority, running_cluster=running_cluster, - hints=hints, **kwargs) + sql, + project=project, + priority=priority, + running_cluster=running_cluster, + hints=hints, + quota_name=quota_name, + **kwargs + ) if not async_: inst.wait_for_success() return inst - def run_sql(self, sql, project=None, priority=None, running_cluster=None, - hints=None, aliases=None, default_schema=None, **kwargs): + def run_sql( + self, + sql, + project=None, + priority=None, + running_cluster=None, + hints=None, + aliases=None, + default_schema=None, + quota_name=None, + **kwargs + ): """ Run a given SQL statement asynchronously - :param sql: SQL statement - :type sql: str - :param project: project name, if not provided, will be the default project - :param priority: instance priority, 9 as default - :type priority: int - :param running_cluster: cluster to run this instance - :param hints: settings for SQL, e.g. `odps.mapred.map.split.size` - :type hints: dict - :param aliases: - :type aliases: dict + :param str sql: SQL statement + :param str project: project name, if not provided, will be the default project + :param int priority: instance priority, 9 as default + :param str running_cluster: cluster to run this instance + :param dict hints: settings for SQL, e.g. `odps.mapred.map.split.size` + :param dict aliases: + :param str quota_name: name of quota to use for SQL job :return: instance :rtype: :class:`odps.models.Instance` .. seealso:: :class:`odps.models.Instance` """ + on_instance_create = kwargs.pop("on_instance_create", None) sql = utils.to_text(sql) - merge_small_files_match = _MERGE_SMALL_FILES_REGEX.match(sql) - if merge_small_files_match: - kwargs = merge_small_files_match.groupdict().copy() - kwargs.update({ - "project": project, - "schema": default_schema, - "hints": hints, - "running_cluster": running_cluster, - "priority": priority, - }) - return self.run_merge_files(**kwargs) - - priority = priority if priority is not None else options.priority - if priority is None and options.get_priority is not None: - priority = options.get_priority(self) - on_instance_create = kwargs.pop('on_instance_create', None) - alter_table_match = _ALTER_TABLE_REGEX.match(sql) if alter_table_match: - drop_table_name = alter_table_match.group('table_name') + drop_table_name = alter_table_match.group("table_name") sql_project, sql_schema, sql_name = self._split_object_dots(drop_table_name) sql_project = sql_project or project sql_schema = sql_schema or default_schema del self._get_project_or_schema(sql_project, sql_schema).tables[sql_name] + merge_instance = models.MergeTask.submit_alter_table_instance( + self, + sql, + project=project, + schema=default_schema, + priority=priority, + running_cluster=running_cluster, + hints=hints, + quota_name=quota_name, + create_callback=on_instance_create, + ) + if merge_instance is not None: + return merge_instance + + priority = priority if priority is not None else options.priority + if priority is None and options.get_priority is not None: + priority = options.get_priority(self) + task = models.SQLTask(query=sql, **kwargs) task.update_sql_settings(hints) + schema_hints = {} default_schema = default_schema or self.schema - if default_schema is not None: - task.update_sql_settings({ + if self.is_schema_namespace_enabled(hints) or default_schema is not None: + 
schema_hints = { "odps.sql.allow.namespace.schema": "true", "odps.namespace.schema": "true", - "odps.default.schema": default_schema, - }) + } + if default_schema is not None: + schema_hints["odps.default.schema"] = default_schema + task.update_sql_settings(schema_hints) + + if quota_name or self.quota_name: + quota_hints = {"odps.task.wlm.quota": quota_name or self.quota_name} + task.update_sql_settings(quota_hints) if aliases: task.update_aliases(aliases) @@ -1190,8 +1341,10 @@ def run_sql(self, sql, project=None, priority=None, running_cluster=None, project = self.get_project(name=project) try: return project.instances.create( - task=task, priority=priority, running_cluster=running_cluster, - create_callback=on_instance_create + task=task, + priority=priority, + running_cluster=running_cluster, + create_callback=on_instance_create, ) except errors.ParseError as ex: ex.statement = sql @@ -1226,152 +1379,17 @@ def execute_sql_cost(self, sql, project=None, hints=None, **kwargs): inst.wait_for_success() return inst.get_sql_task_cost() - def _parse_partition_string(self, partition): + @staticmethod + def _parse_partition_string(partition): parts = [] - for p in utils.split_quoted(partition, ','): - kv = [pp.strip() for pp in utils.split_quoted(p, '=')] + for p in utils.split_quoted(partition, ","): + kv = [pp.strip() for pp in utils.split_quoted(p, "=")] if len(kv) != 2: - raise ValueError('Partition representation malformed.') + raise ValueError("Partition representation malformed.") if not kv[1].startswith('"') and not kv[1].startswith("'"): kv[1] = repr(kv[1]) - parts.append('%s=%s' % tuple(kv)) - return ','.join(parts) - - def execute_merge_files(self, table, partition=None, project=None, schema=None, - hints=None, priority=None, running_cluster=None, compact_type=None): - """ - Execute a task to merge multiple files in tables and wait for termination. - - :param table: name of the table to optimize - :param partition: partition to optimize - :param project: project name, if not provided, will be the default project - :param str schema: schema name, if not provided, will be the default schema - :param hints: settings for merge task. - :param priority: instance priority, 9 as default - :param running_cluster: cluster to run this instance - :param compact_type: compact option for transactional table, can be major or minor. - :return: instance - :rtype: :class:`odps.models.Instance` - """ - inst = self.run_merge_files( - table, partition=partition, project=project, hints=hints, - schema=schema, priority=priority, running_cluster=running_cluster, - compact_type=compact_type, - ) - inst.wait_for_success() - return inst - - def run_merge_files(self, table, partition=None, project=None, schema=None, hints=None, - priority=None, running_cluster=None, compact_type=None): - """ - Start running a task to merge multiple files in tables. - - :param table: name of the table to optimize - :param partition: partition to optimize - :param project: project name, if not provided, will be the default project - :param str schema: schema name, if not provided, will be the default schema - :param hints: settings for merge task. - :param priority: instance priority, 9 as default - :param running_cluster: cluster to run this instance - :param compact_type: compact option for transactional table, can be major or minor. 
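# With the change above, a quota name given per call or on the entry is attached to
# the SQL task through the "odps.task.wlm.quota" setting. Table and quota names below
# are placeholders.
from odps import ODPS

o = ODPS("<access_id>", "<secret_access_key>", project="my_project")

inst = o.execute_sql(
    "select count(*) from my_table",
    quota_name="my_quota",                      # overrides o.quota_name for this job
    hints={"odps.sql.allow.fullscan": "true"},  # other hints are merged as before
)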
- :return: instance - :rtype: :class:`odps.models.Instance` - """ - from .models.tasks import MergeTask - - _COMPACT_TYPES = { - "major": "major_compact", - "minor": "minor_compact", - } - - schema = schema or self.schema - if not isinstance(table, six.string_types): - if table.get_schema(): - schema = table.get_schema().name - table_name = table.full_table_name - else: - table_name = table - _, schema, _ = self._split_object_dots(table) - - if partition: - table_name += " partition(%s)" % (self._parse_partition_string(partition)) - - priority = priority if priority is not None else options.priority - if priority is None and options.get_priority is not None: - priority = options.get_priority(self) - - hints = hints or dict() - if options.default_task_settings: - hints.update(options.default_task_settings) - - compact_type = _COMPACT_TYPES.get(compact_type) - if compact_type: - hints.update({ - "odps.merge.txn.table.compact": compact_type, - "odps.merge.restructure.action": "hardlink", - }) - if schema is not None: - hints.update({ - "odps.sql.allow.namespace.schema": "true", - "odps.namespace.schema": "true", - "odps.default.schema": schema, - }) - - task = MergeTask(table=table_name.replace("`", "")) - task.update_settings(hints) - - project = self.get_project(name=project) - return project.instances.create( - task=task, running_cluster=running_cluster, priority=priority - ) - - def execute_archive_table(self, table, partition=None, project=None, schema=None, - hints=None, priority=None): - """ - Execute a task to archive tables and wait for termination. - - :param table: name of the table to archive - :param partition: partition to archive - :param project: project name, if not provided, will be the default project - :param hints: settings for table archive task. - :param priority: instance priority, 9 as default - :return: instance - :rtype: :class:`odps.models.Instance` - """ - inst = self.run_archive_table(table, partition=partition, project=project, hints=hints, - priority=priority) - inst.wait_for_success() - return inst - - def run_archive_table(self, table, partition=None, project=None, schema=None, - hints=None, priority=None): - """ - Start running a task to archive tables. - - :param table: name of the table to archive - :param partition: partition to archive - :param project: project name, if not provided, will be the default project - :param hints: settings for table archive task. 
- :param priority: instance priority, 9 as default - :return: instance - :rtype: :class:`odps.models.Instance` - """ - from .models.tasks import MergeTask - - if partition: - table += " partition(%s)" % (self._parse_partition_string(partition)) - - priority = priority if priority is not None else options.priority - if priority is None and options.get_priority is not None: - priority = options.get_priority(self) - - name = 'archive_task_{0}_{1}'.format(int(time.time()), random.randint(100000, 999999)) - task = MergeTask(name=name, table=table) - task.update_settings(hints) - task._update_property_json('archiveSettings', {'odps.merge.archive.flag': True}) - - project = self.get_project(name=project) - return project.instances.create(task=task, priority=priority) + parts.append("%s=%s" % tuple(kv)) + return ",".join(parts) def list_volumes(self, project=None, schema=None, owner=None): """ @@ -1386,7 +1404,9 @@ def list_volumes(self, project=None, schema=None, owner=None): parent = self._get_project_or_schema(project, schema) return parent.volumes.iterate(owner=owner) - @utils.deprecated('`create_volume` is deprecated. Use `created_parted_volume` instead.') + @utils.deprecated( + "`create_volume` is deprecated. Use `created_parted_volume` instead." + ) def create_volume(self, name, project=None, **kwargs): self.create_parted_volume(name, project=project, **kwargs) @@ -1421,7 +1441,14 @@ def create_fs_volume(self, name, project=None, schema=None, **kwargs): return parent.volumes.create_fs(name=name, **kwargs) def create_external_volume( - self, name, project=None, schema=None, location=None, rolearn=None, **kwargs + self, + name, + project=None, + schema=None, + location=None, + rolearn=None, + auto_create_dir=False, + **kwargs ): """ Create a file system volume based on external storage (for instance, OSS) in a project. @@ -1429,6 +1456,9 @@ def create_external_volume( :param str name: volume name :param str project: project name, if not provided, will be the default project :param str schema: schema name, if not provided, will be the default schema + :param str location: location of OSS dir, should be oss://endpoint/bucket/path + :param str rolearn: role arn of the account hosting the OSS bucket + :param bool auto_create_dir: if True, will create directory automatically :return: volume :rtype: :class:`odps.models.FSVolume` @@ -1436,7 +1466,11 @@ def create_external_volume( """ parent = self._get_project_or_schema(project, schema) return parent.volumes.create_external( - name=name, location=location, rolearn=rolearn, **kwargs + name=name, + location=location, + rolearn=rolearn, + auto_create_dir=auto_create_dir, + **kwargs ) def exist_volume(self, name, schema=None, project=None): @@ -1465,17 +1499,23 @@ def get_volume(self, name, project=None, schema=None): parent = self._get_project_or_schema(project, schema) return parent.volumes[name] - def delete_volume(self, name, project=None, schema=None): + def delete_volume( + self, name, project=None, schema=None, auto_remove_dir=False, recursive=False + ): """ Delete volume by given name. 
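# Sketch of the new external-volume options documented above: auto_create_dir when
# creating an OSS-backed volume, auto_remove_dir / recursive when deleting it.
# The OSS location and role ARN are placeholders.
from odps import ODPS

o = ODPS("<access_id>", "<secret_access_key>", project="my_project")

vol = o.create_external_volume(
    "ext_volume",
    location="oss://<oss_endpoint>/<bucket>/<path>",
    rolearn="<role_arn>",
    auto_create_dir=True,   # create the backing OSS directory if it is missing
)

o.delete_volume("ext_volume", auto_remove_dir=True, recursive=True)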
:param name: volume name - :param project: project name, if not provided, will be the default project + :param str project: project name, if not provided, will be the default project :param str schema: schema name, if not provided, will be the default schema + :param bool auto_remove_dir: if True, directory created by external volume will be deleted + :param bool recursive: if True, directory deletion should be recursive :return: None """ parent = self._get_project_or_schema(project, schema) - return parent.volumes.delete(name) + return parent.volumes.delete( + name, auto_remove_dir=auto_remove_dir, recursive=recursive + ) def list_volume_partitions(self, volume, project=None, schema=None): """ @@ -1502,9 +1542,11 @@ def get_volume_partition(self, volume, partition=None, project=None, schema=None :rtype: :class:`odps.models.VolumePartition` """ if partition is None: - if not volume.startswith('/') or '/' not in volume.lstrip('/'): - raise ValueError('You should provide a partition name or use partition path instead.') - volume, partition = volume.lstrip('/').split('/', 1) + if not volume.startswith("/") or "/" not in volume.lstrip("/"): + raise ValueError( + "You should provide a partition name or use partition path instead." + ) + volume, partition = volume.lstrip("/").split("/", 1) volume = self.get_volume(volume, project, schema=schema) return volume.partitions[partition] @@ -1518,16 +1560,20 @@ def exist_volume_partition(self, volume, partition=None, project=None, schema=No :param str schema: schema name, if not provided, will be the default schema """ if partition is None: - if not volume.startswith('/') or '/' not in volume.lstrip('/'): - raise ValueError('You should provide a partition name or use partition path instead.') - volume, partition = volume.lstrip('/').split('/', 1) + if not volume.startswith("/") or "/" not in volume.lstrip("/"): + raise ValueError( + "You should provide a partition name or use partition path instead." + ) + volume, partition = volume.lstrip("/").split("/", 1) try: volume = self.get_volume(volume, project, schema=schema) except errors.NoSuchObject: return False return partition in volume.partitions - def delete_volume_partition(self, volume, partition=None, project=None, schema=None): + def delete_volume_partition( + self, volume, partition=None, project=None, schema=None + ): """ Delete partition in a volume by given name @@ -1537,9 +1583,11 @@ def delete_volume_partition(self, volume, partition=None, project=None, schema=N :param str schema: schema name, if not provided, will be the default schema """ if partition is None: - if not volume.startswith('/') or '/' not in volume.lstrip('/'): - raise ValueError('You should provide a partition name or use partition path instead.') - volume, partition = volume.lstrip('/').split('/', 1) + if not volume.startswith("/") or "/" not in volume.lstrip("/"): + raise ValueError( + "You should provide a partition name or use partition path instead." 
+ ) + volume, partition = volume.lstrip("/").split("/", 1) volume = self.get_volume(volume, project, schema=schema) return volume.delete_partition(partition) @@ -1563,17 +1611,18 @@ def list_volume_files(self, volume, partition=None, project=None, schema=None): >>> odps.list_volume_files('fs_volume', 'dir1/dir2') >>> odps.list_volume_files('/fs_volume/dir1/dir2') """ - from .models import PartedVolume if partition is None: - if not volume.startswith('/'): - raise ValueError('You should provide a partition name or use partition / path instead.') - volume = volume.lstrip('/') - if '/' in volume: - volume, partition = volume.split('/', 1) + if not volume.startswith("/"): + raise ValueError( + "You should provide a partition name or use partition / path instead." + ) + volume = volume.lstrip("/") + if "/" in volume: + volume, partition = volume.split("/", 1) volume = self.get_volume(volume, project, schema=schema) - if isinstance(volume, PartedVolume): + if isinstance(volume, models.PartedVolume): if not partition: - raise ValueError('Malformed partition url.') + raise ValueError("Malformed partition url.") return volume.partitions[partition].files.iterate() else: return volume[partition].objects.iterate() @@ -1588,16 +1637,15 @@ def create_volume_directory(self, volume, path=None, project=None, schema=None): :param str schema: schema name, if not provided, will be the default schema :return: directory object. """ - from .models import PartedVolume if path is None: - if not volume.startswith('/'): - raise ValueError('You should provide a valid path.') - volume = volume.lstrip('/') - if '/' in volume: - volume, path = volume.split('/', 1) + if not volume.startswith("/"): + raise ValueError("You should provide a valid path.") + volume = volume.lstrip("/") + if "/" in volume: + volume, path = volume.split("/", 1) volume = self.get_volume(volume, project, schema=schema) - if isinstance(volume, PartedVolume): - raise ValueError('Only supported under file system volumes.') + if isinstance(volume, models.PartedVolume): + raise ValueError("Only supported under file system volumes.") else: return volume.create_dir(path) @@ -1611,23 +1659,24 @@ def get_volume_file(self, volume, path=None, project=None, schema=None): :param str schema: schema name, if not provided, will be the default schema :return: directory object. """ - from .models import PartedVolume if path is None: - if not volume.startswith('/'): - raise ValueError('You should provide a valid path.') - volume = volume.lstrip('/') - if '/' in volume: - volume, path = volume.split('/', 1) + if not volume.startswith("/"): + raise ValueError("You should provide a valid path.") + volume = volume.lstrip("/") + if "/" in volume: + volume, path = volume.split("/", 1) volume = self.get_volume(volume, project, schema=schema) - if isinstance(volume, PartedVolume): - if '/' not in path: - raise ValueError('Partition/File format malformed.') - part, file_name = path.split('/', 1) + if isinstance(volume, models.PartedVolume): + if "/" not in path: + raise ValueError("Partition/File format malformed.") + part, file_name = path.split("/", 1) return volume.get_partition(part).files[file_name] else: return volume[path] - def move_volume_file(self, old_path, new_path, replication=None, project=None, schema=None): + def move_volume_file( + self, old_path, new_path, replication=None, project=None, schema=None + ): """ Move a file / directory object under a file system volume to another location in the same volume. 
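# The volume helpers above accept either an explicit volume plus partition / path
# pair or a single "/volume/path" string; a sketch against a hypothetical
# file-system volume.
from odps import ODPS

o = ODPS("<access_id>", "<secret_access_key>", project="my_project")

o.create_volume_directory("/fs_volume/dir1")       # path form
o.create_volume_directory("fs_volume", "dir2")     # volume + path form

for f in o.list_volume_files("/fs_volume/dir1"):
    print(f)

# relative target: the file stays inside /fs_volume/dir1
o.move_volume_file("/fs_volume/dir1/old_name", "new_name")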
@@ -1638,29 +1687,29 @@ def move_volume_file(self, old_path, new_path, replication=None, project=None, s :param str schema: schema name, if not provided, will be the default schema :return: directory object. """ - from .models import PartedVolume - - if not new_path.startswith('/'): + if not new_path.startswith("/"): # make relative path absolute - old_root, _ = old_path.rsplit('/', 1) - new_path = old_root + '/' + new_path + old_root, _ = old_path.rsplit("/", 1) + new_path = old_root + "/" + new_path - if not old_path.startswith('/'): - raise ValueError('You should provide a valid path.') - old_volume, old_path = old_path.lstrip('/').split('/', 1) + if not old_path.startswith("/"): + raise ValueError("You should provide a valid path.") + old_volume, old_path = old_path.lstrip("/").split("/", 1) - new_volume, _ = new_path.lstrip('/').split('/', 1) + new_volume, _ = new_path.lstrip("/").split("/", 1) if old_volume != new_volume: - raise ValueError('Moving between different volumes is not supported.') + raise ValueError("Moving between different volumes is not supported.") volume = self.get_volume(old_volume, project, schema=schema) - if isinstance(volume, PartedVolume): - raise ValueError('Only supported under file system volumes.') + if isinstance(volume, models.PartedVolume): + raise ValueError("Only supported under file system volumes.") else: volume[old_path].move(new_path, replication=replication) - def delete_volume_file(self, volume, path=None, recursive=False, project=None, schema=None): + def delete_volume_file( + self, volume, path=None, recursive=False, project=None, schema=None + ): """ Delete a file / directory object under a file system volume. @@ -1671,21 +1720,29 @@ def delete_volume_file(self, volume, path=None, recursive=False, project=None, s :param str schema: schema name, if not provided, will be the default schema :return: directory object. """ - from .models import PartedVolume if path is None: - if not volume.startswith('/'): - raise ValueError('You should provide a valid path.') - volume = volume.lstrip('/') - if '/' in volume: - volume, path = volume.split('/', 1) + if not volume.startswith("/"): + raise ValueError("You should provide a valid path.") + volume = volume.lstrip("/") + if "/" in volume: + volume, path = volume.split("/", 1) volume = self.get_volume(volume, project, schema=schema) - if isinstance(volume, PartedVolume): - raise ValueError('Only supported under file system volumes.') + if isinstance(volume, models.PartedVolume): + raise ValueError("Only supported under file system volumes.") else: volume[path].delete(recursive=recursive) - def open_volume_reader(self, volume, partition=None, file_name=None, project=None, - schema=None, start=None, length=None, **kwargs): + def open_volume_reader( + self, + volume, + partition=None, + file_name=None, + project=None, + schema=None, + start=None, + length=None, + **kwargs + ): """ Open a volume file for read. A file-like object will be returned which can be used to read contents from volume files. 
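# Reading from volume files through open_volume_reader above; start / length allow
# reading a slice of a file. Volume and file names are placeholders.
from odps import ODPS

o = ODPS("<access_id>", "<secret_access_key>", project="my_project")

# parted volume: volume + partition + file name
with o.open_volume_reader("parted_volume", "partition", "file_name") as reader:
    for line in reader:
        print(line)

# file-system volume, path form, reading only the first kilobyte
with o.open_volume_reader("/fs_volume/dir1/file_name", start=0, length=1024) as reader:
    print(reader.read())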
@@ -1704,25 +1761,32 @@ def open_volume_reader(self, volume, partition=None, file_name=None, project=Non >>> with odps.open_volume_reader('parted_volume', 'partition', 'file') as reader: >>> [print(line) for line in reader] """ - from .models import PartedVolume if partition is None: - if not volume.startswith('/'): - raise ValueError('You should provide a partition name or use partition / path instead.') - volume = volume.lstrip('/') - volume, partition = volume.split('/', 1) - if '/' in partition: - partition, file_name = partition.rsplit('/', 1) + if not volume.startswith("/"): + raise ValueError( + "You should provide a partition name or use partition / path instead." + ) + volume = volume.lstrip("/") + volume, partition = volume.split("/", 1) + if "/" in partition: + partition, file_name = partition.rsplit("/", 1) else: partition, file_name = None, partition volume = self.get_volume(volume, project, schema=schema) - if isinstance(volume, PartedVolume): + if isinstance(volume, models.PartedVolume): if not partition: - raise ValueError('Malformed partition url.') - return volume.partitions[partition].open_reader(file_name, start=start, length=length, **kwargs) + raise ValueError("Malformed partition url.") + return volume.partitions[partition].open_reader( + file_name, start=start, length=length, **kwargs + ) else: - return volume[partition].open_reader(file_name, start=start, length=length, **kwargs) + return volume[partition].open_reader( + file_name, start=start, length=length, **kwargs + ) - def open_volume_writer(self, volume, partition=None, project=None, schema=None, **kwargs): + def open_volume_writer( + self, volume, partition=None, project=None, schema=None, **kwargs + ): """ Write data into a volume. This function behaves differently under different types of volumes. @@ -1749,18 +1813,19 @@ def open_volume_writer(self, volume, partition=None, project=None, schema=None, >>> with odps.open_volume_writer('/fs_volume/dir1/file_name') as writer: >>> writer.write('some content') """ - from .models import PartedVolume if partition is None: - if not volume.startswith('/'): - raise ValueError('You should provide a partition name or use partition / path instead.') - volume = volume.lstrip('/') - volume, partition = volume.split('/', 1) + if not volume.startswith("/"): + raise ValueError( + "You should provide a partition name or use partition / path instead." + ) + volume = volume.lstrip("/") + volume, partition = volume.split("/", 1) volume = self.get_volume(volume, project, schema=schema) - if isinstance(volume, PartedVolume): + if isinstance(volume, models.PartedVolume): return volume.partitions[partition].open_writer(**kwargs) else: - if '/' in partition: - partition, file_name = partition.rsplit('/', 1) + if "/" in partition: + partition, file_name = partition.rsplit("/", 1) else: partition, file_name = None, partition return volume[partition].open_writer(file_name, **kwargs) @@ -1807,8 +1872,15 @@ def exist_xflow(self, name, project=None): project = self.get_project(name=project) return name in project.xflows - def run_xflow(self, xflow_name, xflow_project=None, parameters=None, project=None, - hints=None, priority=None): + def run_xflow( + self, + xflow_name, + xflow_project=None, + parameters=None, + project=None, + hints=None, + priority=None, + ): """ Run xflow by given name, xflow project, paremeters asynchronously. 
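# run_xflow submits an XFlow asynchronously while execute_xflow blocks until it
# terminates. The algorithm name and its parameter dict below are placeholders and
# depend on the algorithms available to the project.
from odps import ODPS

o = ODPS("<access_id>", "<secret_access_key>", project="my_project")

inst = o.execute_xflow(
    "<xflow_name>",
    xflow_project="algo_public",
    parameters={"inputTableName": "input_table", "outputTableName": "output_table"},
)
print(o.get_xflow_results(inst))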
@@ -1833,11 +1905,23 @@ def run_xflow(self, xflow_name, xflow_project=None, parameters=None, project=Non if isinstance(xflow_project, models.Project): xflow_project = xflow_project.name return project.xflows.run_xflow( - xflow_name=xflow_name, xflow_project=xflow_project, project=project, parameters=parameters, - hints=hints, priority=priority) + xflow_name=xflow_name, + xflow_project=xflow_project, + project=project, + parameters=parameters, + hints=hints, + priority=priority, + ) - def execute_xflow(self, xflow_name, xflow_project=None, parameters=None, project=None, - hints=None, priority=None): + def execute_xflow( + self, + xflow_name, + xflow_project=None, + parameters=None, + project=None, + hints=None, + priority=None, + ): """ Run xflow by given name, xflow project, paremeters, block until xflow executed successfully. @@ -1859,8 +1943,13 @@ def execute_xflow(self, xflow_name, xflow_project=None, parameters=None, project """ inst = self.run_xflow( - xflow_name, xflow_project=xflow_project, parameters=parameters, project=project, - hints=hints, priority=priority) + xflow_name, + xflow_project=xflow_project, + parameters=parameters, + project=project, + hints=hints, + priority=priority, + ) inst.wait_for_success() return inst @@ -1878,6 +1967,7 @@ def get_xflow_results(self, instance, project=None): project = self.get_project(name=project) from .models import Instance + if not isinstance(instance, Instance): instance = project.instances[instance] @@ -1966,7 +2056,9 @@ def exist_offline_model(self, name, project=None): return name in project.offline_models @utils.with_wait_argument - def copy_offline_model(self, name, new_name, project=None, new_project=None, async_=False): + def copy_offline_model( + self, name, new_name, project=None, new_project=None, async_=False + ): """ Copy current model into a new location. @@ -1974,8 +2066,9 @@ def copy_offline_model(self, name, new_name, project=None, new_project=None, asy :param new_project: new project name. if absent, original project name will be used :param async_: if True, return the copy instance. 
otherwise return the newly-copied model """ - return self.get_offline_model(name, project=project) \ - .copy(new_name, new_project=new_project, async_=async_) + return self.get_offline_model(name, project=project).copy( + new_name, new_project=new_project, async_=async_ + ) def delete_offline_model(self, name, project=None, if_exists=False): """ @@ -1990,7 +2083,7 @@ def delete_offline_model(self, name, project=None, if_exists=False): project = self.get_project(name=project) try: return project.offline_models.delete(name) - except NoSuchObject: + except errors.NoSuchObject: if not if_exists: raise @@ -2004,12 +2097,14 @@ def get_logview_host(self): try: logview_host = utils.to_str( - self.rest.get(self.endpoint + '/logview/host').content + self.rest.get(self.endpoint + "/logview/host").content ) except: logview_host = None if not logview_host: - logview_host = utils.get_default_logview_endpoint(LOGVIEW_HOST_DEFAULT, self.endpoint) + logview_host = utils.get_default_logview_endpoint( + LOGVIEW_HOST_DEFAULT, self.endpoint + ) _logview_host_cache[self.endpoint] = logview_host return logview_host @@ -2195,7 +2290,7 @@ def get_security_option(self, option_name, project=None): option_name = utils.camel_to_underline(option_name) sec_options = self.get_security_options(project=project) if not hasattr(sec_options, option_name): - raise ValueError('Option does not exists.') + raise ValueError("Option does not exists.") return getattr(sec_options, option_name) def set_security_option(self, option_name, value, project=None): @@ -2209,7 +2304,7 @@ def set_security_option(self, option_name, value, project=None): option_name = utils.camel_to_underline(option_name) sec_options = self.get_security_options(project=project) if not hasattr(sec_options, option_name): - raise ValueError('Option does not exists.') + raise ValueError("Option does not exists.") setattr(sec_options, option_name, value) sec_options.update() @@ -2249,268 +2344,17 @@ def execute_security_query( from .models import Project instance_or_result = self.run_security_query( - query, project=project, schema=schema, token=token, hints=hints, output_json=output_json + query, + project=project, + schema=schema, + token=token, + hints=hints, + output_json=output_json, ) if not isinstance(instance_or_result, Project.AuthQueryInstance): return instance_or_result return instance_or_result.wait_for_success() - @utils.deprecated( - 'You no longer have to manipulate session instances to use MaxCompute QueryAcceleration. ' - 'Try `run_sql_interactive`.' - ) - def attach_session(self, session_name, taskname=None, hints=None): - """ - Attach to an existing session. - - :param session_name: The session name. - :param taskname: The created sqlrt task name. If not provided, the default value is used. - Mostly doesn't matter, default works. - :return: A SessionInstance you may execute select tasks within. 
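# get_security_option / set_security_option above convert camel-case names with
# camel_to_underline before looking them up, so both spellings should work. The
# option names below are placeholders; available options depend on the project's
# security configuration.
from odps import ODPS

o = ODPS("<access_id>", "<secret_access_key>", project="my_project")

print(o.get_security_option("<OptionName>"))       # camel-case form
o.set_security_option("<option_name>", True)       # underscore form of the same option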
- """ - return self._attach_mcqa_session(session_name, task_name=taskname, hints=hints) - - def _attach_mcqa_session(self, session_name=None, task_name=None, hints=None): - session_name = session_name or models.session.PUBLIC_SESSION_NAME - task_name = task_name or models.session.DEFAULT_TASK_NAME - - task = models.tasks.SQLRTTask(name=task_name) - task.update_sql_rt_settings(hints) - task.update_sql_rt_settings( - {"odps.sql.session.share.id": session_name, "odps.sql.submit.mode": "script"} - ) - project = self.get_project() - return project.instances.create( - task=task, session_project=project, session_name=session_name - ) - - @utils.deprecated( - 'You no longer have to manipulate session instances to use MaxCompute QueryAcceleration. ' - 'Try `run_sql_interactive`.' - ) - def default_session(self): - """ - Attach to the default session of your project. - - :return: A SessionInstance you may execute select tasks within. - """ - return self._get_default_mcqa_session(wait=False) - - def _get_default_mcqa_session( - self, session_name=None, hints=None, wait=True, service_startup_timeout=60 - ): - session_name = session_name or models.session.PUBLIC_SESSION_NAME - if self._default_session is None: - self._default_session = self._attach_mcqa_session(session_name, hints=hints) - self._default_session_name = session_name - if wait: - self._default_session.wait_for_startup( - 0.1, service_startup_timeout, max_interval=1 - ) - return self._default_session - - @utils.deprecated( - 'You no longer have to manipulate session instances to use MaxCompute QueryAcceleration. ' - 'Try `run_sql_interactive`.' - ) - def create_session( - self, session_worker_count, session_worker_memory, session_name=None, - worker_spare_span=None, taskname=None, hints=None - ): - """ - Create session. - - :param session_worker_count: How much workers assigned to the session. - :param session_worker_memory: How much memory each worker consumes. - :param session_name: The session name. Not specifying to use its ID as name. - :param worker_spare_span: format "00-24", allocated workers will be reduced during this time. - Not specifying to disable this. - :param taskname: The created sqlrt task name. If not provided, the default value is used. - Mostly doesn't matter, default works. - :param hints: Extra hints provided to the session. Parameters of this method will override - certain hints. - :return: A SessionInstance you may execute select tasks within. 
- """ - return self._create_mcqa_session( - session_worker_count, session_worker_memory, session_name, - worker_spare_span, taskname, hints - ) - - def _create_mcqa_session( - self, session_worker_count, session_worker_memory, session_name=None, - worker_spare_span=None, task_name=None, hints=None - ): - if not task_name: - task_name = models.session.DEFAULT_TASK_NAME - - session_hints = { - "odps.sql.session.worker.count": str(session_worker_count), - "odps.sql.session.worker.memory": str(session_worker_memory), - "odps.sql.submit.mode": "script", - } - if session_name: - session_hints["odps.sql.session.name"] = session_name - if worker_spare_span: - session_hints["odps.sql.session.worker.sparespan"] = worker_spare_span - task = models.tasks.SQLRTTask(name=task_name) - task.update_sql_rt_settings(hints) - task.update_sql_rt_settings(session_hints) - project = self.get_project() - return project.instances.create( - task=task, session_project=project, session_name=session_name - ) - - def _get_mcqa_session_file(self): - try: - dir_name = utils.build_pyodps_dir() - if not os.path.exists(dir_name): - os.makedirs(dir_name) - access_id_digest = hashlib.md5(utils.to_binary(self.account.access_id)).hexdigest() - return os.path.join( - dir_name, "mcqa-session-" + access_id_digest - ) - except: - return None - - def run_sql_interactive(self, sql, hints=None, **kwargs): - """ - Run SQL query in interactive mode (a.k.a MaxCompute QueryAcceleration). - Won't fallback to offline mode automatically if query not supported or fails - :param sql: the sql query. - :param hints: settings for sql query. - :return: instance. - """ - cached_is_running = False - service_name = kwargs.pop('service_name', models.session.PUBLIC_SESSION_NAME) - task_name = kwargs.pop('task_name', None) - service_startup_timeout = kwargs.pop('service_startup_timeout', 60) - force_reattach = kwargs.pop('force_reattach', False) - - session_file_name = self._get_mcqa_session_file() - if ( - self._default_session is None - and session_file_name - and os.path.exists(session_file_name) - ): - try: - with open(session_file_name, "r") as session_file: - session_info = json.loads(session_file.read()) - instance_obj = self.get_instance(session_info.pop("id")) - session_project = self.get_project(session_info.pop("session_project_name")) - self._default_session_name = session_info["session_name"] - self._default_session = models.SessionInstance.from_instance( - instance_obj, session_project=session_project, **session_info - ) - except: - pass - - if self._default_session is not None: - try: - cached_is_running = self._default_session.is_running() - except: - pass - if ( - force_reattach - or not cached_is_running - or self._default_session_name != service_name - ): - # should reattach, for whatever reason (timed out, terminated, never created, - # forced using another session) - self._default_session = self._attach_mcqa_session(service_name, task_name=task_name) - self._default_session.wait_for_startup(0.1, service_startup_timeout, max_interval=1) - self._default_session_name = service_name - - if session_file_name: - try: - with open(session_file_name, "w") as session_file: - session_file.write( - json.dumps(self._default_session._extract_json_info()) - ) - except: - pass - return self._default_session.run_sql(sql, hints, **kwargs) - - @utils.deprecated( - 'The method `run_sql_interactive_with_fallback` is deprecated. ' - 'Try `execute_sql_interactive` with fallback=True argument instead.' 
- ) - def run_sql_interactive_with_fallback(self, sql, hints=None, **kwargs): - return self.execute_sql_interactive( - sql, hints=hints, fallback='all', wait_fallback=False, **kwargs - ) - - def execute_sql_interactive( - self, sql, hints=None, fallback=True, wait_fallback=True, **kwargs - ): - """ - Run SQL query in interactive mode (a.k.a MaxCompute QueryAcceleration). - If query is not supported or fails, and fallback is True, - will fallback to offline mode automatically - - :param sql: the sql query. - :param hints: settings for sql query. - :param fallback: fallback query to non-interactive mode, True by default. - Both boolean type and policy names separated by commas are acceptable. - :param bool wait_fallback: wait fallback instance to finish, True by default. - :return: instance. - """ - from .models.session import FallbackMode, FallbackPolicy - - if isinstance(fallback, (six.string_types, set, list, tuple)): - fallback_policy = FallbackPolicy(fallback) - elif fallback is False: - fallback_policy = None - elif fallback is True: - fallback_policy = FallbackPolicy("all") - else: - assert isinstance(fallback, FallbackPolicy) - fallback_policy = fallback - - inst = None - use_tunnel = kwargs.pop("tunnel", True) - fallback_callback = kwargs.pop("fallback_callback", None) - offline_hints = kwargs.pop("offline_hints", None) or {} - try: - inst = self.run_sql_interactive(sql, hints=hints, **kwargs) - inst.wait_for_success(interval=0.1, max_interval=1) - try: - rd = inst.open_reader(tunnel=use_tunnel, limit=True) - if not rd: - raise ODPSError('Get sql result fail') - except errors.InstanceTypeNotSupported: - # sql is not a select, just skip creating reader - pass - return inst - except BaseException as ex: - if fallback_policy is None: - raise - fallback_mode = fallback_policy.get_mode_from_exception(ex) - if fallback_mode is None: - raise - elif fallback_mode == FallbackMode.INTERACTIVE: - kwargs["force_reattach"] = True - return self.execute_sql_interactive( - sql, hints=hints, fallback=fallback, wait_fallback=wait_fallback, **kwargs - ) - else: - kwargs.pop("service_name", None) - kwargs.pop("force_reattach", None) - kwargs.pop("service_startup_timeout", None) - hints = copy.copy(offline_hints or hints or {}) - hints["odps.task.sql.sqa.enable"] = "false" - - if fallback_callback is not None: - fallback_callback(inst, ex) - - if inst is not None: - hints["odps.sql.session.fallback.instance"] = "%s_%s" % (inst.id, inst.subquery_id) - else: - hints["odps.sql.session.fallback.instance"] = "fallback4AttachFailed" - inst = self.execute_sql(sql, hints=hints, **kwargs) - if wait_fallback: - inst.wait_for_success() - return inst - @classmethod def _build_account(cls, access_id, secret_access_key): return accounts.AliyunAccount(access_id, secret_access_key) @@ -2560,6 +2404,29 @@ def from_environments(cls): except KeyError: return None + _attach_mcqa_session = _wrap_model_func(models.SessionMethods._attach_mcqa_session) + attach_session = _wrap_model_func(models.SessionMethods.attach_session) + _create_mcqa_session = _wrap_model_func(models.SessionMethods._create_mcqa_session) + create_session = _wrap_model_func(models.SessionMethods.create_session) + default_session = _wrap_model_func(models.SessionMethods.default_session) + _get_default_mcqa_session = _wrap_model_func( + models.SessionMethods._get_default_mcqa_session + ) + run_sql_interactive = _wrap_model_func(models.SessionMethods.run_sql_interactive) + run_sql_interactive_with_fallback = _wrap_model_func( + 
models.SessionMethods.run_sql_interactive_with_fallback + ) + execute_sql_interactive = _wrap_model_func( + models.SessionMethods.execute_sql_interactive + ) + + run_merge_files = _wrap_model_func(models.MergeTask.run_merge_files) + execute_merge_files = _wrap_model_func(models.MergeTask.execute_merge_files) + run_archive_table = _wrap_model_func(models.MergeTask.run_archive_table) + execute_archive_table = _wrap_model_func(models.MergeTask.execute_archive_table) + run_freeze_command = _wrap_model_func(models.MergeTask.run_freeze_command) + execute_freeze_command = _wrap_model_func(models.MergeTask.execute_freeze_command) + def _get_odps_from_model(self): cur = self @@ -2573,44 +2440,27 @@ def _get_odps_from_model(self): try: from .internal.core import * # noqa: F401 -except ImportError: +except ImportError: # pragma: no cover pass try: - from .mars_extension import create_mars_cluster - setattr(ODPS, 'create_mars_cluster', create_mars_cluster) -except ImportError: - pass -try: - from .mars_extension import persist_mars_dataframe - setattr(ODPS, 'persist_mars_dataframe', persist_mars_dataframe) -except ImportError: - pass -try: - from .mars_extension import to_mars_dataframe - setattr(ODPS, 'to_mars_dataframe', to_mars_dataframe) -except ImportError: - pass -try: - from .mars_extension import run_script_in_mars - setattr(ODPS, 'run_script_in_mars', run_script_in_mars) -except ImportError: - pass -try: - from .mars_extension import run_mars_job - setattr(ODPS, 'run_mars_job', run_mars_job) -except ImportError: - pass -try: - from .mars_extension import list_mars_instances - setattr(ODPS, 'list_mars_instances', list_mars_instances) -except ImportError: - pass -try: - from .mars_extension import sql_to_mars_dataframe - setattr(ODPS, 'sql_to_mars_dataframe', sql_to_mars_dataframe) + from . import mars_extension + + for _mars_attr in ( + "create_mars_cluster", + "persist_mars_dataframe", + "to_mars_dataframe", + "run_script_in_mars", + "run_mars_job", + "list_mars_instances", + "sql_to_mars_dataframe", + ): + setattr(ODPS, _mars_attr, getattr(mars_extension, _mars_attr)) except ImportError: pass -DEFAULT_ENDPOINT = os.getenv('ODPS_ENDPOINT', os.getenv('PYODPS_ENDPOINT', DEFAULT_ENDPOINT)) +DEFAULT_ENDPOINT = os.getenv( + "ODPS_ENDPOINT", os.getenv("PYODPS_ENDPOINT", DEFAULT_ENDPOINT) +) +del _wrap_model_func diff --git a/odps/counters.py b/odps/counters.py index 4d384da1..921bdbaa 100644 --- a/odps/counters.py +++ b/odps/counters.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,14 +13,14 @@ # limitations under the License. 
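# --- Editor's illustrative sketch, not part of the patch ---------------------
# execute_sql_interactive, re-exposed on ODPS via the _wrap_model_func bindings
# above, runs a query through MCQA and, when fallback is enabled, re-submits it
# as an ordinary offline SQL task on failure. Per its docstring, fallback takes
# a boolean or comma-separated policy names such as "all". Assuming an ODPS
# client `o` as in the earlier sketch; the query is a placeholder.
inst = o.execute_sql_interactive(
    "select 1",
    fallback="all",      # fall back on any recognized error class
    wait_fallback=True,  # block until the offline instance (if any) finishes
)
reader = inst.open_reader(tunnel=True, limit=True)
for record in reader:
    print(record)
# ------------------------------------------------------------------------------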
import json -__all__ = ['Counter', 'CounterGroup', 'Counters'] +__all__ = ["Counter", "CounterGroup", "Counters"] class Counter(object): def __init__(self, name, value=0): self.name = name self.value = value - + def get_name(self): return self.name @@ -34,10 +34,7 @@ def increment(self, incr): self.value += incr def _get_data_obj(self): - data = { - 'name' : self.name, - 'value' : self.value - } + data = {"name": self.name, "value": self.value} return data @@ -61,9 +58,9 @@ def size(self): def _get_data_obj(self): data = { - 'name' : self.name, - 'counters' : [c._get_data_obj() for c in self.counters.values()] - } + "name": self.name, + "counters": [c._get_data_obj() for c in self.counters.values()], + } return data @@ -83,7 +80,7 @@ def size(self): def to_json_string(self, **json_options): data = dict() - for (k, v) in self.groups.items(): + for k, v in self.groups.items(): data[k] = v._get_data_obj() return json.dumps(data, **json_options) diff --git a/odps/crc.py b/odps/crc.py index c605a01c..ecaed7b6 100644 --- a/odps/crc.py +++ b/odps/crc.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -28,7 +28,7 @@ def update(self, buf, off=None, length=None): assert isinstance(buf, (six.binary_type, bytearray)) off = off or 0 length = length or len(buf) - to_crc = buf[off: off+length] + to_crc = buf[off : off + length] if isinstance(to_crc, bytearray): to_crc = bytes(to_crc) if self.crc: @@ -55,78 +55,79 @@ def getvalue(self): if Crc32c is None: + # fmt: off _CRC_TABLE = ( - 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, - 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, - 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, - 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, - 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, - 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, - 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, - 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b, - 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, - 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, - 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, - 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, - 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, - 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a, - 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, - 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, - 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, - 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, - 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, - 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198, - 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, - 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, - 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, - 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, - 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, - 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789, - 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, - 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, - 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, - 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, - 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, - 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829, - 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, - 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, - 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, - 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, - 
0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, - 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc, - 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, - 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, - 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, - 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, - 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, - 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982, - 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, - 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, - 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, - 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, - 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, - 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f, - 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, - 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, - 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, - 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, - 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, - 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f, - 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, - 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, - 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, - 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, - 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, - 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, - 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, - 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351, + 0X00000000, 0XF26B8303, 0XE13B70F7, 0X1350F3F4, + 0XC79A971F, 0X35F1141C, 0X26A1E7E8, 0XD4CA64EB, + 0X8AD958CF, 0X78B2DBCC, 0X6BE22838, 0X9989AB3B, + 0X4D43CFD0, 0XBF284CD3, 0XAC78BF27, 0X5E133C24, + 0X105EC76F, 0XE235446C, 0XF165B798, 0X030E349B, + 0XD7C45070, 0X25AFD373, 0X36FF2087, 0XC494A384, + 0X9A879FA0, 0X68EC1CA3, 0X7BBCEF57, 0X89D76C54, + 0X5D1D08BF, 0XAF768BBC, 0XBC267848, 0X4E4DFB4B, + 0X20BD8EDE, 0XD2D60DDD, 0XC186FE29, 0X33ED7D2A, + 0XE72719C1, 0X154C9AC2, 0X061C6936, 0XF477EA35, + 0XAA64D611, 0X580F5512, 0X4B5FA6E6, 0XB93425E5, + 0X6DFE410E, 0X9F95C20D, 0X8CC531F9, 0X7EAEB2FA, + 0X30E349B1, 0XC288CAB2, 0XD1D83946, 0X23B3BA45, + 0XF779DEAE, 0X05125DAD, 0X1642AE59, 0XE4292D5A, + 0XBA3A117E, 0X4851927D, 0X5B016189, 0XA96AE28A, + 0X7DA08661, 0X8FCB0562, 0X9C9BF696, 0X6EF07595, + 0X417B1DBC, 0XB3109EBF, 0XA0406D4B, 0X522BEE48, + 0X86E18AA3, 0X748A09A0, 0X67DAFA54, 0X95B17957, + 0XCBA24573, 0X39C9C670, 0X2A993584, 0XD8F2B687, + 0X0C38D26C, 0XFE53516F, 0XED03A29B, 0X1F682198, + 0X5125DAD3, 0XA34E59D0, 0XB01EAA24, 0X42752927, + 0X96BF4DCC, 0X64D4CECF, 0X77843D3B, 0X85EFBE38, + 0XDBFC821C, 0X2997011F, 0X3AC7F2EB, 0XC8AC71E8, + 0X1C661503, 0XEE0D9600, 0XFD5D65F4, 0X0F36E6F7, + 0X61C69362, 0X93AD1061, 0X80FDE395, 0X72966096, + 0XA65C047D, 0X5437877E, 0X4767748A, 0XB50CF789, + 0XEB1FCBAD, 0X197448AE, 0X0A24BB5A, 0XF84F3859, + 0X2C855CB2, 0XDEEEDFB1, 0XCDBE2C45, 0X3FD5AF46, + 0X7198540D, 0X83F3D70E, 0X90A324FA, 0X62C8A7F9, + 0XB602C312, 0X44694011, 0X5739B3E5, 0XA55230E6, + 0XFB410CC2, 0X092A8FC1, 0X1A7A7C35, 0XE811FF36, + 0X3CDB9BDD, 0XCEB018DE, 0XDDE0EB2A, 0X2F8B6829, + 0X82F63B78, 0X709DB87B, 0X63CD4B8F, 0X91A6C88C, + 0X456CAC67, 0XB7072F64, 0XA457DC90, 0X563C5F93, + 0X082F63B7, 0XFA44E0B4, 0XE9141340, 0X1B7F9043, + 0XCFB5F4A8, 0X3DDE77AB, 0X2E8E845F, 0XDCE5075C, + 0X92A8FC17, 0X60C37F14, 0X73938CE0, 0X81F80FE3, + 0X55326B08, 0XA759E80B, 0XB4091BFF, 0X466298FC, + 0X1871A4D8, 0XEA1A27DB, 0XF94AD42F, 0X0B21572C, + 0XDFEB33C7, 0X2D80B0C4, 0X3ED04330, 0XCCBBC033, + 0XA24BB5A6, 0X502036A5, 0X4370C551, 0XB11B4652, + 0X65D122B9, 0X97BAA1BA, 0X84EA524E, 0X7681D14D, + 0X2892ED69, 0XDAF96E6A, 0XC9A99D9E, 0X3BC21E9D, + 
0XEF087A76, 0X1D63F975, 0X0E330A81, 0XFC588982, + 0XB21572C9, 0X407EF1CA, 0X532E023E, 0XA145813D, + 0X758FE5D6, 0X87E466D5, 0X94B49521, 0X66DF1622, + 0X38CC2A06, 0XCAA7A905, 0XD9F75AF1, 0X2B9CD9F2, + 0XFF56BD19, 0X0D3D3E1A, 0X1E6DCDEE, 0XEC064EED, + 0XC38D26C4, 0X31E6A5C7, 0X22B65633, 0XD0DDD530, + 0X0417B1DB, 0XF67C32D8, 0XE52CC12C, 0X1747422F, + 0X49547E0B, 0XBB3FFD08, 0XA86F0EFC, 0X5A048DFF, + 0X8ECEE914, 0X7CA56A17, 0X6FF599E3, 0X9D9E1AE0, + 0XD3D3E1AB, 0X21B862A8, 0X32E8915C, 0XC083125F, + 0X144976B4, 0XE622F5B7, 0XF5720643, 0X07198540, + 0X590AB964, 0XAB613A67, 0XB831C993, 0X4A5A4A90, + 0X9E902E7B, 0X6CFBAD78, 0X7FAB5E8C, 0X8DC0DD8F, + 0XE330A81A, 0X115B2B19, 0X020BD8ED, 0XF0605BEE, + 0X24AA3F05, 0XD6C1BC06, 0XC5914FF2, 0X37FACCF1, + 0X69E9F0D5, 0X9B8273D6, 0X88D28022, 0X7AB90321, + 0XAE7367CA, 0X5C18E4C9, 0X4F48173D, 0XBD23943E, + 0XF36E6F75, 0X0105EC76, 0X12551F82, 0XE03E9C81, + 0X34F4F86A, 0XC69F7B69, 0XD5CF889D, 0X27A40B9E, + 0X79B737BA, 0X8BDCB4B9, 0X988C474D, 0X6AE7C44E, + 0XBE2DA0A5, 0X4C4623A6, 0X5F16D052, 0XAD7D5351, ) + # fmt: on - _CRC_INIT = 0xffffffff - + _CRC_INIT = 0xFFFFFFFF class Crc32c(object): - _method = 'py' + _method = "py" def __init__(self): self.crc = _CRC_INIT @@ -142,20 +143,20 @@ def update(self, buf, off=None, length=None): off = off or 0 length = length or len(buf) - to_crc = buf[off: off + length] + to_crc = buf[off : off + length] crc = self.crc for b in to_crc: - table_index = (crc ^ b) & 0xff - crc = (_CRC_TABLE[table_index] ^ (crc >> 8)) & 0xffffffff - self.crc = crc & 0xffffffff + table_index = (crc ^ b) & 0xFF + crc = (_CRC_TABLE[table_index] ^ (crc >> 8)) & 0xFFFFFFFF + self.crc = crc & 0xFFFFFFFF def reset(self): self.crc = _CRC_INIT @classmethod def _crc_finalize(cls, crc): - return crc ^ 0xffffffff + return crc ^ 0xFFFFFFFF def getvalue(self): return Crc32c._crc_finalize(self.crc) diff --git a/odps/dag.py b/odps/dag.py index d29d47b8..f0f55a2a 100644 --- a/odps/dag.py +++ b/odps/dag.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
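# --- Editor's illustrative sketch, not part of the patch ---------------------
# Exercising the table-driven pure-Python Crc32c fallback reformatted above.
# When a compiled CRC32C implementation is available, odps.crc binds Crc32c to
# it instead of this class, but the call pattern stays the same.
from odps.crc import Crc32c

crc = Crc32c()
crc.update(bytearray(b"hello, odps"))  # update() accepts bytes or bytearray
print(crc.getvalue())                  # finalized CRC-32C value as an int
crc.reset()                            # restore the initial state for reuse
# ------------------------------------------------------------------------------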
@@ -17,7 +17,7 @@ import itertools from copy import deepcopy -from .compat import Queue, six, Iterable +from .compat import Iterable, Queue, six class DAGValidationError(Exception): @@ -26,6 +26,7 @@ class DAGValidationError(Exception): class DAG(object): """Directed acyclic graph implementation.""" + _graph_dict_type = dict _dict_type = dict @@ -51,7 +52,7 @@ def add_node(self, node): def remove_node(self, node): if id(node) not in self._graph: - raise KeyError('Node does not exist') + raise KeyError("Node does not exist") self._graph.pop(id(node)) self._map.pop(id(node)) @@ -68,16 +69,20 @@ def remove_node(self, node): edges.remove(id(node)) def contains_edge(self, predecessor_node, successor_node): - if id(predecessor_node) not in self._graph or \ - id(successor_node) not in self._graph: + if ( + id(predecessor_node) not in self._graph + or id(successor_node) not in self._graph + ): return False return id(successor_node) in self._graph[id(predecessor_node)] def add_edge(self, predecessor_node, successor_node, validate=True): - if id(predecessor_node) not in self._graph or \ - id(successor_node) not in self._graph: - raise KeyError('Node does not exist') + if ( + id(predecessor_node) not in self._graph + or id(successor_node) not in self._graph + ): + raise KeyError("Node does not exist") if validate: test_graph = deepcopy(self._graph) @@ -88,7 +93,7 @@ def add_edge(self, predecessor_node, successor_node, validate=True): test_reversed_graph[id(successor_node)].add(id(predecessor_node)) valid, msg = self._validate(test_graph, test_reversed_graph) else: - valid, msg = True, '' + valid, msg = True, "" if valid: self._graph[id(predecessor_node)].add(id(successor_node)) if self._reversed_graph is not None: @@ -98,7 +103,7 @@ def add_edge(self, predecessor_node, successor_node, validate=True): def remove_edge(self, predecessor_node, successor_node): if id(successor_node) not in self._graph.get(id(predecessor_node), []): - raise KeyError('Edge does not exist in the graph') + raise KeyError("Edge does not exist in the graph") self._graph[id(predecessor_node)].remove(id(successor_node)) if self._reversed_graph is not None: @@ -109,15 +114,20 @@ def _indep_ids(self, graph=None, reversed_graph=None): reversed_graph = reversed_graph or self._reversed_graph if reversed_graph is not None: - return [node for node, precessors in six.iteritems(reversed_graph) - if len(precessors) == 0] + return [ + node + for node, precessors in six.iteritems(reversed_graph) + if len(precessors) == 0 + ] all_nodes = set(graph.keys()) return list(all_nodes - set(itertools.chain(*graph.values()))) def indep_nodes(self, graph=None, reversed_graph=None): - return [self._map.get(i) for i in self._indep_ids(graph=graph, - reversed_graph=reversed_graph)] + return [ + self._map.get(i) + for i in self._indep_ids(graph=graph, reversed_graph=reversed_graph) + ] def _predecessor_ids(self, node_id, graph=None, reversed_graph=None): graph = graph or self._graph @@ -128,13 +138,13 @@ def _predecessor_ids(self, node_id, graph=None, reversed_graph=None): def predecessors(self, node): if id(node) not in self._graph: - raise KeyError('Node does not exist: %s' % node) + raise KeyError("Node does not exist: %s" % node) return [self._map.get(node_id) for node_id in self._predecessor_ids(id(node))] def successors(self, node): if id(node) not in self._graph: - raise KeyError('Node does not exist: %r' % node) + raise KeyError("Node does not exist: %r" % node) return [self._map.get(node_id) for node_id in self._graph[id(node)]] @@ -142,18 +152,20 
@@ def _validate(self, graph=None, reversed_graph=None): graph = graph or self._graph reversed_graph = reversed_graph or self._reversed_graph if len(self.indep_nodes(graph, reversed_graph)) == 0: - return False, 'No independent nodes detected' + return False, "No independent nodes detected" try: self.topological_sort(graph, reversed_graph) except ValueError: - return False, 'Fail to topological sort' - return True, 'Valid' + return False, "Fail to topological sort" + return True, "Valid" def bfs(self, start_nodes, successor=None, cond=None): cond = cond or (lambda v: True) successor = successor or self.successors - start_nodes = [start_nodes, ] if not isinstance(start_nodes, Iterable) else start_nodes + start_nodes = ( + [start_nodes] if not isinstance(start_nodes, Iterable) else start_nodes + ) start_nodes = [n for n in start_nodes if cond(n)] assert all(id(node) in self._graph for node in start_nodes) @@ -202,7 +214,7 @@ def topological_sort(self, graph=None, reversed_graph=None): indep_ids.append(dep_id) if len(node_ids) != len(graph): - raise ValueError('Graph is not acyclic') + raise ValueError("Graph is not acyclic") return [self._map.get(nid) for nid in node_ids] diff --git a/odps/dbapi.py b/odps/dbapi.py index 988e9458..1d3e5e30 100644 --- a/odps/dbapi.py +++ b/odps/dbapi.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -20,15 +20,14 @@ from .compat import enum, six from .config import options from .core import ODPS -from .errors import NotSupportedError, InstanceTypeNotSupported, ODPSError -from .utils import to_str +from .errors import InstanceTypeNotSupported, NotSupportedError, ODPSError from .models.session import PUBLIC_SESSION_NAME - +from .utils import to_str # PEP 249 module globals -apilevel = '2.0' +apilevel = "2.0" threadsafety = 2 # Threads may share the module and connections. -paramstyle = 'named' # Python extended format codes, e.g. ...WHERE name=%(name)s +paramstyle = "named" # Python extended format codes, e.g. 
...WHERE name=%(name)s class Error(Exception): @@ -54,39 +53,58 @@ def connect(*args, **kwargs): "upgrading": ["ODPS-182", "ODPS-184"], "noresource": ["ODPS-183"], "timeout": ["ODPS-186"], - "generic": ["ODPS-180"] + "generic": ["ODPS-180"], } FALLBACK_POLICY_ALIASES = { "default": ["unsupported", "upgrading", "noresource", "timeout"], - "all": ["unsupported", "upgrading", "noresource", "timeout", "generic"] + "all": ["unsupported", "upgrading", "noresource", "timeout", "generic"], } class Connection(object): - def __init__(self, access_id=None, secret_access_key=None, project=None, - endpoint=None, session_name=None, odps=None, hints=None, **kw): + def __init__( + self, + access_id=None, + secret_access_key=None, + project=None, + endpoint=None, + session_name=None, + odps=None, + hints=None, + **kw + ): if isinstance(access_id, ODPS): access_id, odps = None, access_id + self._use_sqa = kw.pop("use_sqa", False) != False + self._fallback_policy = kw.pop("fallback_policy", "") + self._project_as_schema = kw.pop( + "project_as_schema", options.sqlalchemy.project_as_schema + ) + if odps is None: - # pop unsupported - kw.pop("use_sqa", None) - kw.pop("fallback_policy", None) - self._odps = ODPS(access_id=access_id, secret_access_key=secret_access_key, - project=project, endpoint=endpoint, **kw) + self._odps = ODPS( + access_id=access_id, + secret_access_key=secret_access_key, + project=project, + endpoint=endpoint, + **kw + ) else: if access_id is not None: - raise ValueError('Either access_id or odps can be specified') + raise ValueError("Either access_id or odps can be specified") self._odps = odps + + try: + if self._project_as_schema is None: + self._project_as_schema = not self._odps.is_schema_namespace_enabled() + except: + pass + self._session_name = PUBLIC_SESSION_NAME if session_name is not None: self._session_name = session_name - self._use_sqa = (kw.pop('use_sqa', False) != False) - self._fallback_policy = kw.pop('fallback_policy', '') - self._project_as_schema = kw.pop( - 'project_as_schema', options.sqlalchemy.project_as_schema - ) self._hints = hints @property @@ -102,9 +120,12 @@ def __exit__(self, exc_type, exc_val, exc_tb): def cursor(self, *args, **kwargs): """Return a new :py:class:`Cursor` object using the connection.""" return Cursor( - self, *args, use_sqa=self._use_sqa, + self, + *args, + use_sqa=self._use_sqa, fallback_policy=self._fallback_policy, - hints=self._hints, **kwargs + hints=self._hints, + **kwargs ) def close(self): @@ -125,8 +146,15 @@ def rollback(self): class Cursor(object): - def __init__(self, connection, arraysize=default_arraysize, - use_sqa=False, fallback_policy='', hints=None, **kwargs): + def __init__( + self, + connection, + arraysize=default_arraysize, + use_sqa=False, + fallback_policy="", + hints=None, + **kwargs + ): self._connection = connection self._arraysize = arraysize self._reset_state() @@ -134,7 +162,7 @@ def __init__(self, connection, arraysize=default_arraysize, self._use_sqa = use_sqa self._fallback_policy = [] self._hints = hints - fallback_policies = map(lambda x: x.strip(), fallback_policy.split(',')) + fallback_policies = map(lambda x: x.strip(), fallback_policy.split(",")) for policy in fallback_policies: if policy in FALLBACK_POLICY_ALIASES: self._fallback_policy.extend(FALLBACK_POLICY_ALIASES[policy]) @@ -161,8 +189,10 @@ def arraysize(self, value): try: self._arraysize = max(int(value), default_arraysize) except TypeError: - warnings.warn('arraysize has to be a integer, got {}, ' - 'will set default value 
1000'.format(value)) + warnings.warn( + "arraysize has to be a integer, got {}, " + "will set default value 1000".format(value) + ) self._arraysize = default_arraysize @property @@ -192,27 +222,24 @@ def description(self): self._description = [] if self._download_session is not None: for col in self._download_session.schema.columns: - self._description.append(( - col.name, col.type.name, - None, None, None, None, True - )) + self._description.append( + (col.name, col.type.name, None, None, None, None, True) + ) else: - self._description.append(( - '_c0', 'string', None, None, - None, None, True - )) + self._description.append( + ("_c0", "string", None, None, None, None, True) + ) return self._description @staticmethod def escape_string(item): item = to_str(item) return "'{}'".format( - item - .replace('\\', '\\\\') - .replace("'", "\\'") - .replace('\r', '\\r') - .replace('\n', '\\n') - .replace('\t', '\\t') + item.replace("\\", "\\\\") + .replace("'", "\\'") + .replace("\r", "\\r") + .replace("\n", "\\n") + .replace("\t", "\\t") ) def execute(self, operation, parameters=None, **kwargs): @@ -224,7 +251,7 @@ def execute(self, operation, parameters=None, **kwargs): Return values are not defined. """ - for k in ['async', 'async_']: + for k in ["async", "async_"]: if k in kwargs: async_ = kwargs[k] break @@ -261,7 +288,7 @@ def executemany(self, operation, seq_of_parameters): self.execute(operation, parameter) def _sqa_error_should_fallback(self, err_str): - if 'ODPS-18' not in err_str: + if "ODPS-18" not in err_str: return False for fallback_case in self._fallback_policy: fallback_error = FALLBACK_POLICIES.get(fallback_case, None) @@ -279,12 +306,12 @@ def _run_sqa_with_fallback(self, sql, **kw): while True: try: if inst is None: - inst = odps.run_sql_interactive(sql, service_name = session_name) + inst = odps.run_sql_interactive(sql, service_name=session_name) else: inst.wait_for_success(interval=0.5) rd = inst.open_reader(tunnel=True, limit=False) if not rd: - raise ODPSError('failed to create direct download') + raise ODPSError("failed to create direct download") rd.schema # will check if task is ok self._download_session = rd return inst @@ -308,7 +335,8 @@ def _check_download_session(self): if not self._download_session and self._instance: try: self._download_session = self._instance.open_reader( - tunnel=True, limit=False) + tunnel=True, limit=False + ) except InstanceTypeNotSupported: # not select, cannot create session self._download_session = None diff --git a/odps/df/backends/frame.py b/odps/df/backends/frame.py index e14bc5b2..eee25740 100644 --- a/odps/df/backends/frame.py +++ b/odps/df/backends/frame.py @@ -96,8 +96,8 @@ def _reset_pd_axes(self, data): ret_data = data if list(data.columns) != self._names: - # already copied - ret_data = ret_data.set_axis(self._names, axis="columns", inplace=False) + v = ret_data.set_axis(self._names, axis="columns") + ret_data = v if v is not None else ret_data if self._index is not None and list(data.index) != list(self._index): if data is ret_data: diff --git a/odps/df/backends/odpssql/engine.py b/odps/df/backends/odpssql/engine.py index bddbfe23..c687771c 100644 --- a/odps/df/backends/odpssql/engine.py +++ b/odps/df/backends/odpssql/engine.py @@ -91,6 +91,21 @@ def _repr_html_(self): return buf.getvalue() +def get_supported_python_tag(align=None): + # todo remove this when next releases are published + if align is None: + align = options.align_supported_python_tag + if align: + if sys.version_info[:2] >= (3, 11): + return "cp311" + elif 
sys.version_info[:2] >= (3, 6): + return "cp37" + else: + return "cp27" + else: + return "cp" + str(sys.version_info[0]) + str(sys.version_info[1]) + + class ODPSSQLEngine(Engine): def __init__(self, odps): self._odps = odps @@ -143,10 +158,7 @@ def _run(self, sql, ui, progress_proportion=1, hints=None, priority=None, hints = hints or dict() if self._ctx.get_udf_count() > 0 and sys.version_info[:2] >= (3, 6): hints['odps.sql.jobconf.odps2'] = True - if sys.version_info[:2] >= (3, 11): - hints['odps.sql.python.version'] = 'cp311' - else: - hints['odps.sql.python.version'] = 'cp37' + hints['odps.sql.python.version'] = get_supported_python_tag() image = image or options.df.image if image: hints['odps.session.image'] = image diff --git a/odps/df/backends/odpssql/tests/test_engine.py b/odps/df/backends/odpssql/tests/test_engine.py index 54880ce8..0b87d3ce 100644 --- a/odps/df/backends/odpssql/tests/test_engine.py +++ b/odps/df/backends/odpssql/tests/test_engine.py @@ -14,16 +14,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -import math +import functools import itertools -import sys -import uuid +import math import os -import zipfile -import tarfile import re -import functools +import sys +import tarfile import time +import uuid +import zipfile from collections import namedtuple, OrderedDict from datetime import datetime, timedelta from functools import partial @@ -35,7 +35,9 @@ from .....compat import PY27, irange as xrange, six, futures, BytesIO from .....errors import ODPSError from .....models import TableSchema -from .....tests.core import get_result, approx_list, py_and_c, run_sub_tests_in_parallel +from .....tests.core import ( + get_result, approx_list, py_and_c, get_test_unique_name, run_sub_tests_in_parallel +) from .....utils import to_text from ....expr.expressions import CollectionExpr from ....types import validate_data_type, DynamicSchema @@ -54,6 +56,8 @@ def _reloader(): cfg = get_config() cfg.tunnel = TableTunnel(cfg.odps, endpoint=cfg.odps._tunnel_endpoint) +_pypi_index = os.environ.get("PYPI_INDEX") or "http://mirrors.aliyun.com/pypi/simple" +_pypi_prefix = _pypi_index.rstrip("/").rsplit("/", 1)[0] py_and_c_deco = py_and_c([ "odps.models.record", "odps.models", "odps.tunnel.io.reader", @@ -155,7 +159,7 @@ def test_tunnel_cases(odps, setup): result = get_result(res) assert [it[:1] for it in data] == result - table_name = tn('pyodps_test_engine_partitioned') + table_name = tn('pyodps_test_engine_partitioned_' + get_test_unique_name(5)) odps.delete_table(table_name, if_exists=True) df = setup.engine.persist(setup.expr, table_name, partitions=['name']) @@ -961,10 +965,10 @@ def h(arg): return arg return h - file_resource_name = tn('pyodps_tmp_file_resource') - table_resource_name = tn('pyodps_tmp_table_resource') - table_name = tn('pyodps_tmp_function_resource_table') - table_name2 = tn('pyodps_tmp_function_resource_table2') + file_resource_name = tn('pyodps_t_tmp_file_resource') + table_resource_name = tn('pyodps_t_tmp_table_resource') + table_name = tn('pyodps_t_tmp_function_resource_table') + table_name2 = tn('pyodps_t_tmp_function_resource_table2') try: odps.delete_resource(file_resource_name) except: @@ -1046,7 +1050,7 @@ def h(row): def test_function_resources_with_partition(odps, setup): data = setup.gen_data(5) - table_name = tn('pyodps_tmp_function_resource_part_table') + table_name = tn('pyodps_t_tmp_function_resource_part_table') odps.delete_table(table_name, if_exists=True) t = 
odps.create_table(table_name, TableSchema.from_lists( ['id', 'id2'], ['bigint', 'bigint'], ['ds'], ['string'] @@ -1056,7 +1060,7 @@ def test_function_resources_with_partition(odps, setup): with t.open_writer(partition='ds=ds2', create_partition=True) as w: w.write([2, 3]) - table_resource_name = tn('pyodps_tmp_part_table_resource') + table_resource_name = tn('pyodps_t_tmp_part_table_resource') try: odps.delete_resource(table_resource_name) except: @@ -1101,13 +1105,13 @@ def test_third_party_libraries(odps, setup): setup.gen_data(data=data) dateutil_urls = [ - 'http://mirrors.aliyun.com/pypi/packages/d4/70/' + _pypi_prefix + '/packages/d4/70/' 'd60450c3dd48ef87586924207ae8907090de0b306af2bce5d134d78615cb/' 'python_dateutil-2.8.1-py2.py3-none-any.whl', - 'https://mirrors.aliyun.com/pypi/packages/be/ed/' + _pypi_prefix + '/packages/be/ed/' '5bbc91f03fa4c839c4c7360375da77f9659af5f7086b7a7bdda65771c8e0/' 'python-dateutil-2.8.1.tar.gz', - 'http://mirrors.aliyun.com/pypi/packages/b7/9f/' + _pypi_prefix + '/packages/b7/9f/' 'ba2b6aaf27e74df59f31b77d1927d5b037cc79a89cda604071f93d289eaf/' 'python-dateutil-2.5.3.zip#md5=52b3f339f41986c25c3a2247e722db17', ] @@ -1238,7 +1242,7 @@ def test_third_party_wheel(odps, setup): setup.gen_data(data=data) dateutil_url = ( - 'http://mirrors.aliyun.com/pypi/packages/d4/70/' + _pypi_prefix + '/packages/d4/70/' 'd60450c3dd48ef87586924207ae8907090de0b306af2bce5d134d78615cb/' 'python_dateutil-2.8.1-py2.py3-none-any.whl' ) @@ -2038,7 +2042,7 @@ def test_join_groupby(odps, setup): ['name', 'id2', 'id3'], [types.string, types.bigint, types.bigint] ) - table_name = tn('pyodps_test_engine_table2') + table_name = tn('pyodps_test_engine_table2_' + get_test_unique_name(5)) odps.delete_table(table_name, if_exists=True) table2 = odps.create_table(name=table_name, table_schema=schema2) expr2 = CollectionExpr(_source_data=table2, _schema=odps_schema_to_df_schema(schema2)) @@ -2640,7 +2644,7 @@ def test_join_map_reduce(odps, setup): ['name2', 'id2', 'id3'], [types.string, types.bigint, types.bigint] ) - table_name = tn('pyodps_test_engine_table2') + table_name = tn('pyodps_test_engine_table2_' + get_test_unique_name(5)) odps.delete_table(table_name, if_exists=True) table2 = odps.create_table(name=table_name, table_schema=schema2) expr2 = CollectionExpr(_source_data=table2, _schema=odps_schema_to_df_schema(schema2)) @@ -2746,9 +2750,9 @@ def test_join(odps, setup): schema2 = TableSchema.from_lists( ['name', 'id2', 'id3'], [types.string, types.bigint, types.bigint] ) - table_name = tn('pyodps_test_engine_table2') + table_name = tn('pyodps_test_engine_table2_' + get_test_unique_name(5)) odps.delete_table(table_name, if_exists=True) - table2 = odps.create_table(name=table_name, table_schema=schema2) + table2 = odps.create_table(name=table_name, table_schema=schema2, lifecycle=1) expr2 = CollectionExpr(_source_data=table2, _schema=odps_schema_to_df_schema(schema2)) setup.gen_data(data=data) @@ -2885,7 +2889,7 @@ def test_union(odps, setup): schema2 = TableSchema.from_lists( ['name', 'id2', 'id3'], [types.string, types.bigint, types.bigint] ) - table_name = tn('pyodps_test_engine_table2') + table_name = tn('pyodps_test_engine_table2_' + get_test_unique_name(5)) odps.delete_table(table_name, if_exists=True) table2 = odps.create_table(name=table_name, table_schema=schema2) expr2 = CollectionExpr(_source_data=table2, _schema=odps_schema_to_df_schema(schema2)) @@ -3499,7 +3503,7 @@ def test_except_intersect(odps, setup): ['name3', 1], ['name3', 3], ] schema1 = 
TableSchema.from_lists(['name', 'id'], [types.string, types.bigint]) - table_name1 = tn('pyodps_test_engine_drop1') + table_name1 = tn('pyodps_test_engine_except_intersect1') odps.delete_table(table_name1, if_exists=True) table1 = odps.create_table(name=table_name1, table_schema=schema1) expr1 = CollectionExpr(_source_data=table1, _schema=odps_schema_to_df_schema(schema1)) @@ -3509,7 +3513,7 @@ def test_except_intersect(odps, setup): ['name2', 1], ['name2', 2], ] schema2 = TableSchema.from_lists(['name', 'id'], [types.string, types.bigint]) - table_name2 = tn('pyodps_test_engine_drop2') + table_name2 = tn('pyodps_test_engine_except_intersect2') odps.delete_table(table_name2, if_exists=True) table2 = odps.create_table(name=table_name2, table_schema=schema2) expr2 = CollectionExpr(_source_data=table2, _schema=odps_schema_to_df_schema(schema2)) @@ -3885,7 +3889,7 @@ def test_string_splits(odps, setup): datatypes = lambda *types: [validate_data_type(t) for t in types] schema = TableSchema.from_lists(['name', 'id'], datatypes('string', 'int64')) odps_schema = df_schema_to_odps_schema(schema) - table_name = tn('pyodps_test_engine_composites1') + table_name = tn('pyodps_test_engine_str_splits1') odps.delete_table(table_name, if_exists=True) table = odps.create_table(name=table_name, table_schema=odps_schema) expr_in = CollectionExpr(_source_data=table, _schema=schema) diff --git a/odps/df/backends/odpssql/types.py b/odps/df/backends/odpssql/types.py index 2cff0988..341e2ab2 100644 --- a/odps/df/backends/odpssql/types.py +++ b/odps/df/backends/odpssql/types.py @@ -167,7 +167,6 @@ def df_schema_to_odps_schema(df_schema, ignorecase=False, project=None): ) - _use_odps2_types_local = threading.local() diff --git a/odps/df/backends/pd/tests/test_engine.py b/odps/df/backends/pd/tests/test_engine.py index 4b8a63a9..80bb2144 100644 --- a/odps/df/backends/pd/tests/test_engine.py +++ b/odps/df/backends/pd/tests/test_engine.py @@ -45,9 +45,12 @@ from ...errors import CompileError from ..engine import PandasEngine -TEMP_FILE_RESOURCE = tn('pyodps_tmp_file_resource') -TEMP_TABLE = tn('pyodps_temp_table') -TEMP_TABLE_RESOURCE = tn('pyodps_temp_table_resource') +TEMP_FILE_RESOURCE = tn('pyodps_t_tmp_file_resource') +TEMP_TABLE = tn('pyodps_t_tmp_table') +TEMP_TABLE_RESOURCE = tn('pyodps_t_tmp_table_resource') + +_pypi_index = os.environ.get("PYPI_INDEX") or "http://mirrors.aliyun.com/pypi/simple" +_pypi_prefix = _pypi_index.rstrip("/").rsplit("/", 1)[0] @pytest.fixture @@ -692,10 +695,10 @@ def test_third_party_libraries(odps, setup): setup.gen_data(data=data) utils_urls = [ - 'http://mirrors.aliyun.com/pypi/packages/39/7b/' + _pypi_prefix + '/packages/39/7b/' '1cb2391517d9cb30001140c6662e00d7443752e5a1713e317fb93267da3f/' 'python_utils-2.1.0-py2.py3-none-any.whl#md5=9dabec0d4f224ba90fd4c53064e7c016', - 'http://mirrors.aliyun.com/pypi/packages/70/7e/' + _pypi_prefix + '/packages/70/7e/' 'a2fcd97ec348e63be034027d4475986063c6d869f7e9f1b7802a8b17304e/' 'python-utils-2.1.0.tar.gz#md5=9891e757c629fc43ccd2c896852f8266', ] diff --git a/odps/df/backends/tests/test_mixed_engine.py b/odps/df/backends/tests/test_mixed_engine.py index 4b3e3517..7f8cceb6 100644 --- a/odps/df/backends/tests/test_mixed_engine.py +++ b/odps/df/backends/tests/test_mixed_engine.py @@ -74,7 +74,7 @@ def setup(odps): set_local_use_odps2_types(None) - table = tn('pyodps_df_mixed_%d' % os.getpid()) + table = tn('pyodps_df_mixed_' + get_test_unique_name()) if odps.exist_table(table): t = odps.get_table(table) else: @@ -365,7 +365,7 @@ def 
_do_execute(self, *args, **kwargs): assert res is not None assert sum(res.values['id']) == 6 - table_name = tn('pyodps_df_mixed2') + table_name = tn('pyodps_df_mixed_head_tail') odps.delete_table(table_name, if_exists=True) table = next(setup.odps_df.data_source()) table2 = odps.create_table(table_name, table.table_schema) @@ -405,7 +405,7 @@ def h(row, done): result = expr.execute() assert result.values['id'].sum() == 17 - odps_df2 = setup.pd_df.persist(tn('pyodps_df_mixed2'), odps=odps) + odps_df2 = setup.pd_df.persist(tn('pyodps_df_mixed_mr_res'), odps=odps) try: expr = setup.odps_df.map_reduce(reducer=reducer, reducer_resources=[odps_df2], group='name') result = expr.execute() diff --git a/odps/df/tests/test_delay.py b/odps/df/tests/test_delay.py index 734df84f..2686e1d3 100644 --- a/odps/df/tests/test_delay.py +++ b/odps/df/tests/test_delay.py @@ -71,16 +71,19 @@ def waiter(val, c): filtered = df[df.id > 0].cache() sub_futures = [make_filter(filtered, i).execute(delay=delay) for i in range(1, 4)] future = delay.execute(async_=True, n_parallel=3) - pytest.raises(RuntimeError, lambda: delay.execute()) - - for i in range(1, 4): - assert future.done() is False - assert any(f.done() for f in sub_futures[i - 1:]) is False - assert all(f.done() for f in sub_futures[:i - 1]) is True - assert get_result(sub_futures[i - 1].result()) == [d for d in data if d[2] == i] - assert all(f.done() for f in sub_futures) is True - future.result(timeout=10 * 60) - assert future.done() is True + pytest.raises(RuntimeError, delay.execute) + + try: + for i in range(1, 4): + assert future.done() is False + assert any(f.done() for f in sub_futures[i - 1:]) is False + assert all(f.done() for f in sub_futures[:i - 1]) is True + assert get_result(sub_futures[i - 1].result()) == [d for d in data if d[2] == i] + assert all(f.done() for f in sub_futures) is True + future.result(timeout=10 * 60) + assert future.done() is True + finally: + future.result() def test_persist_execute(odps, setup): diff --git a/odps/distcache.py b/odps/distcache.py index 15789be3..c95fef9c 100644 --- a/odps/distcache.py +++ b/odps/distcache.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ def get_cache_table(name): pass -def get_cache_archive(name, relative_path='.'): +def get_cache_archive(name, relative_path="."): pass @@ -31,4 +31,3 @@ def get_cache_tabledesc(name): def get_cache_tableinfo(name): pass - diff --git a/odps/errors.py b/odps/errors.py index fd0ee784..6a82ebf8 100644 --- a/odps/errors.py +++ b/odps/errors.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -22,13 +22,9 @@ from requests import ConnectTimeout as RequestsConnectTimeout from . 
import utils -from .compat import ( - six, - reduce, - ElementTree as ET, - ElementTreeParseError as ETParseError, - TimeoutError -) +from .compat import ElementTree as ET +from .compat import ElementTreeParseError as ETParseError +from .compat import TimeoutError, reduce, six logger = logging.getLogger(__name__) @@ -46,30 +42,34 @@ class InteractiveError(Exception): def parse_response(resp, endpoint=None, tag=None): - """Parses the content of response and returns an exception object. - """ + """Parses the content of response and returns an exception object.""" try: try: content = resp.content root = ET.fromstring(content) - code = root.find('./Code').text - msg = root.find('./Message').text - request_id = root.find('./RequestId').text - host_id = root.find('./HostId').text + code = root.find("./Code").text + msg = root.find("./Message").text + request_id = root.find("./RequestId").text + host_id = root.find("./HostId").text except ETParseError: - request_id = resp.headers.get('x-odps-request-id', None) + request_id = resp.headers.get("x-odps-request-id", None) if len(resp.content) > 0: obj = json.loads(resp.text) - msg = obj['Message'] - code = obj.get('Code') - host_id = obj.get('HostId') + msg = obj["Message"] + code = obj.get("Code") + host_id = obj.get("HostId") if request_id is None: - request_id = obj.get('RequestId') + request_id = obj.get("RequestId") else: raise clz = globals().get(code, ODPSError) return clz( - msg, request_id=request_id, code=code, host_id=host_id, endpoint=endpoint, tag=tag + msg, + request_id=request_id, + code=code, + host_id=host_id, + endpoint=endpoint, + tag=tag, ) except: # Error occurred during parsing the response. We ignore it and delegate @@ -77,9 +77,9 @@ def parse_response(resp, endpoint=None, tag=None): logger.debug(utils.stringify_expt()) if resp.status_code == 404: - return NoSuchObject('No such object.', endpoint=endpoint, tag=tag) + return NoSuchObject("No such object.", endpoint=endpoint, tag=tag) elif resp.status_code == 401: - return Unauthorized('Unauthorized.', endpoint=endpoint, tag=tag) + return Unauthorized("Unauthorized.", endpoint=endpoint, tag=tag) else: text = resp.content.decode() if six.PY3 else resp.content if text: @@ -88,7 +88,9 @@ def parse_response(resp, endpoint=None, tag=None): text, code=str(resp.status_code), endpoint=endpoint, tag=tag ) else: - return ODPSError(text, code=str(resp.status_code), endpoint=endpoint, tag=tag) + return ODPSError( + text, code=str(resp.status_code), endpoint=endpoint, tag=tag + ) else: return ODPSError(str(resp.status_code), endpoint=endpoint, tag=tag) @@ -111,13 +113,13 @@ def throw_if_parsable(resp, endpoint=None, tag=None): } _SQA_CODE_MAPPING = { - 'ODPS-180': 'SQAGenericError', - 'ODPS-181': 'SQARetryError', - 'ODPS-182': 'SQAAccessDenied', - 'ODPS-183': 'SQAResourceNotEnough', - 'ODPS-184': 'SQAServiceUnavailable', - 'ODPS-185': 'SQAUnsupportedFeature', - 'ODPS-186': 'SQAQueryTimedout', + "ODPS-180": "SQAGenericError", + "ODPS-181": "SQARetryError", + "ODPS-182": "SQAAccessDenied", + "ODPS-183": "SQAResourceNotEnough", + "ODPS-184": "SQAServiceUnavailable", + "ODPS-185": "SQAUnsupportedFeature", + "ODPS-186": "SQAQueryTimedout", } _nginx_bad_gateway_message = "the page you are looking for is currently unavailable" @@ -127,25 +129,23 @@ def parse_instance_error(msg): raw_msg = msg try: root = ET.fromstring(msg) - code = root.find('./Code').text - msg = root.find('./Message').text - request_id_node = root.find('./RequestId') + code = root.find("./Code").text + msg = 
root.find("./Message").text + request_id_node = root.find("./RequestId") request_id = request_id_node.text if request_id_node else None - host_id_node = root.find('./HostId') + host_id_node = root.find("./HostId") host_id = host_id_node.text if host_id_node else None clz = globals().get(code, ODPSError) - return clz( - msg, request_id=request_id, code=code, host_id=host_id - ) + return clz(msg, request_id=request_id, code=code, host_id=host_id) except: pass msg = utils.to_str(raw_msg) - msg_parts = reduce(operator.add, (pt.split(':') for pt in msg.split(' - '))) + msg_parts = reduce(operator.add, (pt.split(":") for pt in msg.split(" - "))) msg_parts = [pt.strip() for pt in msg_parts] try: - msg_code = next(p for p in msg_parts if p.startswith('ODPS-')) + msg_code = next(p for p in msg_parts if p.startswith("ODPS-")) if msg_code in _CODE_MAPPING: cls = globals().get(_CODE_MAPPING[msg_code], ODPSError) elif len(msg_code) > 8 and msg_code[:8] in _SQA_CODE_MAPPING: @@ -162,12 +162,20 @@ def parse_instance_error(msg): return cls(msg, code=msg_code) -class ODPSError(RuntimeError): +class BaseODPSError(Exception): """Base class of ODPS error""" + def __init__( - self, msg, request_id=None, code=None, host_id=None, instance_id=None, endpoint=None, tag=None + self, + msg, + request_id=None, + code=None, + host_id=None, + instance_id=None, + endpoint=None, + tag=None, ): - super(ODPSError, self).__init__(msg) + super(BaseODPSError, self).__init__(msg) self.request_id = request_id self.instance_id = instance_id self.code = code @@ -191,7 +199,7 @@ def __str__(self): head_parts.append("Endpoint: %s" % self.endpoint) if head_parts: - return '%s\n%s' % (" ".join(head_parts), message) + return "%s\n%s" % (" ".join(head_parts), message) return message @classmethod @@ -199,6 +207,10 @@ def parse(cls, resp): return parse_response(resp) +class ODPSError(BaseODPSError, RuntimeError): + pass + + class ODPSClientError(ODPSError): pass @@ -217,6 +229,7 @@ class ServerDefinedException(ODPSError): # A long list of server defined exceptions + class MethodNotAllowed(ServerDefinedException): pass @@ -241,6 +254,10 @@ class NoSuchTable(NoSuchObject): pass +class NoSuchVolume(NoSuchObject): + pass + + class InvalidArgument(ServerDefinedException): pass @@ -257,6 +274,10 @@ class Unauthorized(AuthorizationRequired): pass +class SignatureNotMatch(ServerDefinedException): + pass + + class SchemaParseError(ServerDefinedException): pass @@ -339,11 +360,11 @@ class RequestTimeTooSkewed(ServerDefinedException): def __init__(self, msg, *args, **kwargs): super(RequestTimeTooSkewed, self).__init__(msg, *args, **kwargs) try: - parts = msg.split(',') - kv_dict = dict(tuple(s.strip() for s in p.split(':', 1)) for p in parts) - self.max_interval_date = int(kv_dict['max_interval_date']) - self.expire_date = self._parse_error_date(kv_dict['expire_date']) - self.now_date = self._parse_error_date(kv_dict['now_date']) + parts = msg.split(",") + kv_dict = dict(tuple(s.strip() for s in p.split(":", 1)) for p in parts) + self.max_interval_date = int(kv_dict["max_interval_date"]) + self.expire_date = self._parse_error_date(kv_dict["expire_date"]) + self.now_date = self._parse_error_date(kv_dict["now_date"]) except: self.max_interval_date = None self.expire_date = None @@ -351,9 +372,12 @@ def __init__(self, msg, *args, **kwargs): @staticmethod def _parse_error_date(date_str): - date_obj = datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S.%fZ') + date_obj = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S.%fZ") micros = date_obj.microsecond - 
return datetime.fromtimestamp(calendar.timegm(date_obj.timetuple())).replace(microsecond=micros) + return datetime.fromtimestamp(calendar.timegm(date_obj.timetuple())).replace( + microsecond=micros + ) + # Handling error code typo in ODPS error message RequestTimeTooSkewd = RequestTimeTooSkewed @@ -412,3 +436,11 @@ class SQAUnsupportedFeature(SQAError): class SQAQueryTimedout(SQAError): pass + + +class EmptyTaskInfoError(ODPSError): + pass + + +class ChecksumError(ODPSError, IOError): + pass diff --git a/odps/examples/tables.py b/odps/examples/tables.py index 99efeefc..98ae261e 100644 --- a/odps/examples/tables.py +++ b/odps/examples/tables.py @@ -1,5 +1,5 @@ # encoding: utf-8 -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,22 +13,21 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -import glob import codecs +import glob +import gzip +import os import shutil import tarfile -import gzip import warnings from collections import OrderedDict - from itertools import groupby, product -from ..compat import pickle, urlretrieve, six +from ..compat import pickle, six, urlretrieve from ..tunnel import TableTunnel -from ..utils import load_static_text_file, build_pyodps_dir +from ..utils import build_pyodps_dir, load_static_text_file -USER_DATA_REPO = build_pyodps_dir('data') +USER_DATA_REPO = build_pyodps_dir("data") class TestDataMixIn(object): @@ -40,6 +39,7 @@ def table_creator(func): """ Decorator for table creating method """ + @six.wraps(func) def method(self, table_name, **kwargs): if self.odps.exist_table(table_name): @@ -47,8 +47,8 @@ def method(self, table_name, **kwargs): if list(table.head(1)): return self.odps.delete_table(table_name) - if kwargs.get('project', self.odps.project) != self.odps.project: - tunnel = TableTunnel(self.odps, project=kwargs['project']) + if kwargs.get("project", self.odps.project) != self.odps.project: + tunnel = TableTunnel(self.odps, project=kwargs["project"]) else: tunnel = self.tunnel func(self.odps, table_name, tunnel=tunnel, **kwargs) @@ -67,70 +67,107 @@ def method(self, table_name, **kwargs): def create_ionosphere(odps, table_name, tunnel=None, project=None): if tunnel is None: tunnel = TableTunnel(odps, project=project) - fields = ','.join('a%02d double' % i for i in range(1, 35)) + ', class bigint' + fields = ",".join("a%02d double" % i for i in range(1, 35)) + ", class bigint" odps.delete_table(table_name, if_exists=True, project=project) odps.create_table(table_name, fields, project=project) upload_ss = tunnel.create_upload_session(table_name) writer = upload_ss.open_record_writer(0) - for line in load_static_text_file('data/ionosphere.txt').splitlines(): + for line in load_static_text_file("data/ionosphere.txt").splitlines(): rec = upload_ss.new_record() - cols = [float(c) if rec._columns[i].type == 'double' else int(c) for i, c in enumerate(line.split(','))] + cols = [ + float(c) if rec._columns[i].type == "double" else int(c) + for i, c in enumerate(line.split(",")) + ] [rec.set(i, val) for i, val in enumerate(cols)] writer.write(rec) writer.close() - upload_ss.commit([0, ]) + upload_ss.commit([0]) @table_creator -def create_ionosphere_one_part(odps, table_name, partition_count=3, tunnel=None, project=None): +def create_ionosphere_one_part( + odps, table_name, partition_count=3, tunnel=None, 
project=None +): if tunnel is None: tunnel = TableTunnel(odps, project=project) - fields = ','.join('a%02d double' % i for i in range(1, 35)) + ', class bigint' + fields = ",".join("a%02d double" % i for i in range(1, 35)) + ", class bigint" odps.delete_table(table_name, if_exists=True, project=project) - odps.create_table(table_name, (fields, 'part bigint'), project=project) + odps.create_table(table_name, (fields, "part bigint"), project=project) for part_id in range(partition_count): - odps.execute_sql('alter table %s add if not exists partition (part=%d)' % (table_name, part_id), project=project) - - upload_sses = [tunnel.create_upload_session(table_name, 'part=%d' % part_id) for part_id in range(partition_count)] + odps.execute_sql( + "alter table %s add if not exists partition (part=%d)" + % (table_name, part_id), + project=project, + ) + + upload_sses = [ + tunnel.create_upload_session(table_name, "part=%d" % part_id) + for part_id in range(partition_count) + ] writers = [session.open_record_writer(0) for session in upload_sses] - for line_no, line in enumerate(load_static_text_file('data/ionosphere.txt').splitlines()): + for line_no, line in enumerate( + load_static_text_file("data/ionosphere.txt").splitlines() + ): part_id = line_no % partition_count rec = upload_sses[part_id].new_record() - cols = [float(c) if rec._columns[i].type == 'double' else int(c) for i, c in enumerate(line.split(','))] + cols = [ + float(c) if rec._columns[i].type == "double" else int(c) + for i, c in enumerate(line.split(",")) + ] cols.append(part_id) [rec.set(i, val) for i, val in enumerate(cols)] writers[part_id].write(rec) [writer.close() for writer in writers] - [upload_ss.commit([0, ]) for upload_ss in upload_sses] + [upload_ss.commit([0]) for upload_ss in upload_sses] @table_creator -def create_ionosphere_two_parts(odps, table_name, partition1_count=2, partition2_count=3, tunnel=None, project=None): +def create_ionosphere_two_parts( + odps, table_name, partition1_count=2, partition2_count=3, tunnel=None, project=None +): if tunnel is None: tunnel = TableTunnel(odps, project=project) - fields = ','.join('a%02d double' % i for i in range(1, 35)) + ', class bigint' + fields = ",".join("a%02d double" % i for i in range(1, 35)) + ", class bigint" odps.delete_table(table_name, if_exists=True, project=project) - odps.create_table(table_name, (fields, 'part1 bigint, part2 bigint'), project=project) + odps.create_table( + table_name, (fields, "part1 bigint, part2 bigint"), project=project + ) for id1, id2 in product(range(partition1_count), range(partition2_count)): - odps.execute_sql('alter table %s add if not exists partition (part1=%d, part2=%d)' % (table_name, id1, id2), - project=project) - - upload_sses = [[tunnel.create_upload_session(table_name, 'part1=%d,part2=%d' % (id1, id2)) - for id2 in range(partition2_count)] for id1 in range(partition1_count)] - writers = [[session.open_record_writer(0) for session in sessions] for sessions in upload_sses] + odps.execute_sql( + "alter table %s add if not exists partition (part1=%d, part2=%d)" + % (table_name, id1, id2), + project=project, + ) + + upload_sses = [ + [ + tunnel.create_upload_session(table_name, "part1=%d,part2=%d" % (id1, id2)) + for id2 in range(partition2_count) + ] + for id1 in range(partition1_count) + ] + writers = [ + [session.open_record_writer(0) for session in sessions] + for sessions in upload_sses + ] - for line_no, line in enumerate(load_static_text_file('data/ionosphere.txt').splitlines()): + for line_no, line in enumerate( + 
load_static_text_file("data/ionosphere.txt").splitlines() + ): id1, id2 = line_no % partition1_count, line_no % partition2_count rec = upload_sses[id1][id2].new_record() - cols = [float(c) if rec._columns[i].type == 'double' else int(c) for i, c in enumerate(line.split(','))] + cols = [ + float(c) if rec._columns[i].type == "double" else int(c) + for i, c in enumerate(line.split(",")) + ] cols.extend([id1, id2]) [rec.set(i, val) for i, val in enumerate(cols)] writers[id1][id2].write(rec) [writer.close() for ws in writers for writer in ws] - [upload_ss.commit([0, ]) for upload_sss in upload_sses for upload_ss in upload_sss] + [upload_ss.commit([0]) for upload_sss in upload_sses for upload_ss in upload_sss] @table_creator @@ -138,21 +175,26 @@ def create_iris(odps, table_name, tunnel=None, project=None, lifecycle=None): if tunnel is None: tunnel = TableTunnel(odps, project=project) odps.delete_table(table_name, if_exists=True, project=project) - table = odps.create_table(table_name, 'sepal_length double, sepal_width double, petal_length double, ' - + 'petal_width double, category string', project=project, lifecycle=lifecycle) + table = odps.create_table( + table_name, + "sepal_length double, sepal_width double, petal_length double, " + + "petal_width double, category string", + project=project, + lifecycle=lifecycle, + ) upload_ss = tunnel.create_upload_session(table_name) writer = upload_ss.open_record_writer(0) - for line in load_static_text_file('data/iris.txt').splitlines(): + for line in load_static_text_file("data/iris.txt").splitlines(): rec = upload_ss.new_record() - line_parts = line.split(',') + line_parts = line.split(",") cols = [float(c) for c in line_parts[:-1]] cols.append(line_parts[4]) [rec.set(i, val) for i, val in enumerate(cols)] writer.write(rec) writer.close() - upload_ss.commit([0, ]) + upload_ss.commit([0]) return table @@ -161,19 +203,21 @@ def create_iris_kv(odps, table_name, tunnel=None, project=None): if tunnel is None: tunnel = TableTunnel(odps, project=project) odps.delete_table(table_name, if_exists=True, project=project) - odps.create_table(table_name, 'content string, category bigint', project=project) + odps.create_table(table_name, "content string, category bigint", project=project) upload_ss = tunnel.create_upload_session(table_name) writer = upload_ss.open_record_writer(0) - for line in load_static_text_file('data/iris.txt').splitlines(): + for line in load_static_text_file("data/iris.txt").splitlines(): rec = upload_ss.new_record() - line_parts = line.split(',') - rec.set(0, ','.join('%s:%s' % (idx, c) for idx, c in enumerate(line_parts[:-1]))) - rec.set(1, 0 if 'setosa' in line_parts[-1] else 1) + line_parts = line.split(",") + rec.set( + 0, ",".join("%s:%s" % (idx, c) for idx, c in enumerate(line_parts[:-1])) + ) + rec.set(1, 0 if "setosa" in line_parts[-1] else 1) writer.write(rec) writer.close() - upload_ss.commit([0, ]) + upload_ss.commit([0]) @table_creator @@ -181,18 +225,20 @@ def create_corpus(odps, table_name, tunnel=None, project=None): if tunnel is None: tunnel = TableTunnel(odps, project=project) odps.delete_table(table_name, if_exists=True, project=project) - odps.create_table(table_name, 'id string, content string', project=project) + odps.create_table(table_name, "id string, content string", project=project) upload_ss = tunnel.create_upload_session(table_name) writer = upload_ss.open_record_writer(0) - for line_no, line in enumerate(load_static_text_file('data/splited_words.txt').splitlines()): + for line_no, line in enumerate( + 
load_static_text_file("data/splited_words.txt").splitlines() + ): rec = upload_ss.new_record() - cols = [line_no + 1, line.replace('####', '')] + cols = [line_no + 1, line.replace("####", "")] [rec.set(i, val) for i, val in enumerate(cols)] writer.write(rec) writer.close() - upload_ss.commit([0, ]) + upload_ss.commit([0]) @table_creator @@ -200,22 +246,26 @@ def create_word_triple(odps, table_name, tunnel=None, project=None): if tunnel is None: tunnel = TableTunnel(odps, project=project) odps.delete_table(table_name, if_exists=True, project=project) - odps.create_table(table_name, 'id string, word string, count bigint', project=project) + odps.create_table( + table_name, "id string, word string, count bigint", project=project + ) upload_ss = tunnel.create_upload_session(table_name) writer = upload_ss.open_record_writer(0) - for line_no, line in enumerate(load_static_text_file('data/splited_words.txt').splitlines()): + for line_no, line in enumerate( + load_static_text_file("data/splited_words.txt").splitlines() + ): line = line.strip() if not line: break - for word, group in groupby(sorted(line.split('####'))): + for word, group in groupby(sorted(line.split("####"))): rec = upload_ss.new_record() cols = [str(line_no + 1), word, len(list(group))] [rec.set(i, val) for i, val in enumerate(cols)] writer.write(rec) writer.close() - upload_ss.commit([0, ]) + upload_ss.commit([0]) @table_creator @@ -223,21 +273,23 @@ def create_splited_words(odps, table_name, joined=False, tunnel=None, project=No if tunnel is None: tunnel = TableTunnel(odps, project=project) odps.delete_table(table_name, if_exists=True, project=project) - odps.create_table(table_name, 'doc_id string, content string', project=project) + odps.create_table(table_name, "doc_id string, content string", project=project) upload_ss = tunnel.create_upload_session(table_name) writer = upload_ss.open_record_writer(0) - for line_no, line in enumerate(load_static_text_file('data/splited_words.txt').splitlines()): + for line_no, line in enumerate( + load_static_text_file("data/splited_words.txt").splitlines() + ): if not line.strip(): break if joined: rec = upload_ss.new_record() - cols = [line_no + 1, line.replace('####', ' ')] + cols = [line_no + 1, line.replace("####", " ")] [rec.set(i, val) for i, val in enumerate(cols)] writer.write(rec) else: - for word in line.split('####'): + for word in line.split("####"): if not word: continue rec = upload_ss.new_record() @@ -245,7 +297,7 @@ def create_splited_words(odps, table_name, joined=False, tunnel=None, project=No [rec.set(i, val) for i, val in enumerate(cols)] writer.write(rec) writer.close() - upload_ss.commit([0, ]) + upload_ss.commit([0]) @table_creator @@ -253,14 +305,27 @@ def create_weighted_graph_edges(odps, table_name, tunnel=None, project=None): if tunnel is None: tunnel = TableTunnel(odps, project=project) data_rows = [ - ['1', '1', '2', '1', 0.7], ['1', '1', '3', '1', 0.7], ['1', '1', '4', '1', 0.6], ['2', '1', '3', '1', 0.7], - ['2', '1', '4', '1', 0.6], ['3', '1', '4', '1', 0.6], ['4', '1', '6', '5', 0.3], ['5', '5', '6', '5', 0.6], - ['5', '5', '7', '5', 0.7], ['5', '5', '8', '5', 0.7], ['6', '5', '7', '5', 0.6], ['6', '5', '8', '5', 0.6], - ['7', '5', '8', '5', 0.7] + ["1", "1", "2", "1", 0.7], + ["1", "1", "3", "1", 0.7], + ["1", "1", "4", "1", 0.6], + ["2", "1", "3", "1", 0.7], + ["2", "1", "4", "1", 0.6], + ["3", "1", "4", "1", 0.6], + ["4", "1", "6", "5", 0.3], + ["5", "5", "6", "5", 0.6], + ["5", "5", "7", "5", 0.7], + ["5", "5", "8", "5", 0.7], + ["6", "5", "7", "5", 
0.6], + ["6", "5", "8", "5", 0.6], + ["7", "5", "8", "5", 0.7], ] odps.delete_table(table_name, if_exists=True, project=project) - odps.create_table(table_name, 'flow_out_id string, group_out_id string, flow_in_id string, ' + - 'group_in_id string, edge_weight double', project=project) + odps.create_table( + table_name, + "flow_out_id string, group_out_id string, flow_in_id string, " + + "group_in_id string, edge_weight double", + project=project, + ) upload_ss = tunnel.create_upload_session(table_name) writer = upload_ss.open_record_writer(0) @@ -270,7 +335,7 @@ def create_weighted_graph_edges(odps, table_name, tunnel=None, project=None): [rec.set(i, val) for i, val in enumerate(rd)] writer.write(rec) writer.close() - upload_ss.commit([0, ]) + upload_ss.commit([0]) @table_creator @@ -278,11 +343,22 @@ def create_tree_graph(odps, table_name, tunnel=None, project=None): if tunnel is None: tunnel = TableTunnel(odps, project=project) data_rows = [ - ['0', '1'], ['0', '2'], ['1', '3'], ['1', '4'], ['2', '4'], ['2', '5'], ['4', '6'], ['a', 'b'], ['a', 'c'], - ['c', 'd'], ['c', 'e'] + ["0", "1"], + ["0", "2"], + ["1", "3"], + ["1", "4"], + ["2", "4"], + ["2", "5"], + ["4", "6"], + ["a", "b"], + ["a", "c"], + ["c", "d"], + ["c", "e"], ] odps.delete_table(table_name, if_exists=True, project=project) - odps.create_table(table_name, 'flow_out_id string, flow_in_id string', project=project) + odps.create_table( + table_name, "flow_out_id string, flow_in_id string", project=project + ) upload_ss = tunnel.create_upload_session(table_name) writer = upload_ss.open_record_writer(0) @@ -292,7 +368,7 @@ def create_tree_graph(odps, table_name, tunnel=None, project=None): [rec.set(i, val) for i, val in enumerate(rd)] writer.write(rec) writer.close() - upload_ss.commit([0, ]) + upload_ss.commit([0]) @table_creator @@ -300,12 +376,21 @@ def create_weighted_graph_vertices(odps, table_name, tunnel=None, project=None): if tunnel is None: tunnel = TableTunnel(odps, project=project) data_rows = [ - ['1', '1', 0.7, 1.0], ['2', '1', 0.7, 1.0], ['3', '1', 0.7, 1.0], ['4', '1', 0.5, 1.0], ['5', '5', 0.7, 1.0], - ['6', '5', 0.5, 1.0], ['7', '5', 0.7, 1.0], ['8', '5', 0.7, 1.0] + ["1", "1", 0.7, 1.0], + ["2", "1", 0.7, 1.0], + ["3", "1", 0.7, 1.0], + ["4", "1", 0.5, 1.0], + ["5", "5", 0.7, 1.0], + ["6", "5", 0.5, 1.0], + ["7", "5", 0.7, 1.0], + ["8", "5", 0.7, 1.0], ] odps.delete_table(table_name, if_exists=True, project=project) - odps.create_table(table_name, 'node string, label string, node_weight double, label_weight double', - project=project) + odps.create_table( + table_name, + "node string, label string, node_weight double, label_weight double", + project=project, + ) upload_ss = tunnel.create_upload_session(table_name) writer = upload_ss.open_record_writer(0) @@ -315,7 +400,7 @@ def create_weighted_graph_vertices(odps, table_name, tunnel=None, project=None): [rec.set(i, val) for i, val in enumerate(rd)] writer.write(rec) writer.close() - upload_ss.commit([0, ]) + upload_ss.commit([0]) @table_creator @@ -323,45 +408,133 @@ def create_user_item_table(odps, table_name, tunnel=None, mode=None, project=Non if tunnel is None: tunnel = TableTunnel(odps, project=project) data_rows = [ - ['CST0000', 'a', 0], ['CST0000', 'b', 1], ['CST0000', 'c', 2], ['CST0000', 'd', 3], ['CST0001', 'a', 0], - ['CST0001', 'b', 0], ['CST0001', 'a', 1], ['CST0001', 'b', 1], ['CST0001', 'c', 1], ['CST0001', 'b', 2], - ['CST0001', 'c', 2], ['CST0001', 'd', 2], ['CST0001', 'e', 3], ['CST0002', 'a', 0], ['CST0002', 'c', 0], - ['CST0002', 'b', 
1], ['CST0002', 'a', 2], ['CST0002', 'b', 2], ['CST0002', 'c', 2], ['CST0002', 'a', 3], - ['CST0000', 'a', 0], ['CST0000', 'b', 1], ['CST0000', 'c', 2], ['CST0000', 'd', 3], ['CST0001', 'a', 0], - ['CST0001', 'b', 0], ['CST0001', 'a', 1], ['CST0001', 'b', 1], ['CST0001', 'c', 1], ['CST0001', 'b', 2], - ['CST0001', 'c', 2], ['CST0001', 'd', 2], ['CST0001', 'e', 3], ['CST0002', 'a', 0], ['CST0002', 'c', 0], - ['CST0002', 'b', 1], ['CST0002', 'a', 2], ['CST0002', 'b', 2], ['CST0002', 'c', 2], ['CST0002', 'a', 3], - ['CST0000', 'a', 0], ['CST0000', 'b', 1], ['CST0000', 'c', 2], ['CST0000', 'd', 3], ['CST0001', 'a', 0], - ['CST0001', 'b', 0], ['CST0001', 'a', 1], ['CST0001', 'b', 1], ['CST0001', 'c', 1], ['CST0001', 'b', 2], - ['CST0001', 'c', 2], ['CST0001', 'd', 2], ['CST0001', 'e', 3], ['CST0002', 'a', 0], ['CST0002', 'c', 0], - ['CST0002', 'b', 1], ['CST0002', 'a', 2], ['CST0002', 'b', 2], ['CST0002', 'c', 2], ['CST0002', 'a', 3], - ['CST0000', 'a', 0], ['CST0000', 'b', 1], ['CST0000', 'c', 2], ['CST0000', 'd', 3], ['CST0001', 'a', 0], - ['CST0001', 'b', 0], ['CST0001', 'a', 1], ['CST0001', 'b', 1], ['CST0001', 'c', 1], ['CST0001', 'b', 2], - ['CST0001', 'c', 2], ['CST0001', 'd', 2], ['CST0001', 'e', 3], ['CST0002', 'a', 0], ['CST0002', 'c', 0], - ['CST0002', 'b', 1], ['CST0002', 'a', 2], ['CST0002', 'b', 2], ['CST0002', 'c', 2], ['CST0002', 'a', 3] + ["CST0000", "a", 0], + ["CST0000", "b", 1], + ["CST0000", "c", 2], + ["CST0000", "d", 3], + ["CST0001", "a", 0], + ["CST0001", "b", 0], + ["CST0001", "a", 1], + ["CST0001", "b", 1], + ["CST0001", "c", 1], + ["CST0001", "b", 2], + ["CST0001", "c", 2], + ["CST0001", "d", 2], + ["CST0001", "e", 3], + ["CST0002", "a", 0], + ["CST0002", "c", 0], + ["CST0002", "b", 1], + ["CST0002", "a", 2], + ["CST0002", "b", 2], + ["CST0002", "c", 2], + ["CST0002", "a", 3], + ["CST0000", "a", 0], + ["CST0000", "b", 1], + ["CST0000", "c", 2], + ["CST0000", "d", 3], + ["CST0001", "a", 0], + ["CST0001", "b", 0], + ["CST0001", "a", 1], + ["CST0001", "b", 1], + ["CST0001", "c", 1], + ["CST0001", "b", 2], + ["CST0001", "c", 2], + ["CST0001", "d", 2], + ["CST0001", "e", 3], + ["CST0002", "a", 0], + ["CST0002", "c", 0], + ["CST0002", "b", 1], + ["CST0002", "a", 2], + ["CST0002", "b", 2], + ["CST0002", "c", 2], + ["CST0002", "a", 3], + ["CST0000", "a", 0], + ["CST0000", "b", 1], + ["CST0000", "c", 2], + ["CST0000", "d", 3], + ["CST0001", "a", 0], + ["CST0001", "b", 0], + ["CST0001", "a", 1], + ["CST0001", "b", 1], + ["CST0001", "c", 1], + ["CST0001", "b", 2], + ["CST0001", "c", 2], + ["CST0001", "d", 2], + ["CST0001", "e", 3], + ["CST0002", "a", 0], + ["CST0002", "c", 0], + ["CST0002", "b", 1], + ["CST0002", "a", 2], + ["CST0002", "b", 2], + ["CST0002", "c", 2], + ["CST0002", "a", 3], + ["CST0000", "a", 0], + ["CST0000", "b", 1], + ["CST0000", "c", 2], + ["CST0000", "d", 3], + ["CST0001", "a", 0], + ["CST0001", "b", 0], + ["CST0001", "a", 1], + ["CST0001", "b", 1], + ["CST0001", "c", 1], + ["CST0001", "b", 2], + ["CST0001", "c", 2], + ["CST0001", "d", 2], + ["CST0001", "e", 3], + ["CST0002", "a", 0], + ["CST0002", "c", 0], + ["CST0002", "b", 1], + ["CST0002", "a", 2], + ["CST0002", "b", 2], + ["CST0002", "c", 2], + ["CST0002", "a", 3], ] odps.delete_table(table_name, if_exists=True, project=project) - if mode == 'agg': - data_rows = [k + (len(list(p)), ) for k, p in groupby(sorted(data_rows, key=lambda item: (item[0], item[1])), lambda item: (item[0], item[1]))] - odps.create_table(table_name, 'user string, item string, payload bigint', project=project) - elif mode == 
'exist': + if mode == "agg": + data_rows = [ + k + (len(list(p)),) + for k, p in groupby( + sorted(data_rows, key=lambda item: (item[0], item[1])), + lambda item: (item[0], item[1]), + ) + ] + odps.create_table( + table_name, "user string, item string, payload bigint", project=project + ) + elif mode == "exist": items = dict() - for k, g1 in groupby(sorted(data_rows, key=lambda it: it[0]), key=lambda it: it[0]): + for k, g1 in groupby( + sorted(data_rows, key=lambda it: it[0]), key=lambda it: it[0] + ): items[k] = {k2: 1 for k2 in set(it2[1] for it2 in g1)} products = set(it[1] for it in data_rows) for k, g in six.iteritems(items): unexist_set = products - set(six.iterkeys(g)) for it in unexist_set: g[it] = 0 - data_rows = [(u, p, l) for u, ud in six.iteritems(items) for p, l in six.iteritems(ud)] - odps.create_table(table_name, 'user string, item string, label bigint', project=project) - elif mode == 'kv': - data_rows = [(k[0], '{0}:{1}'.format(k[1], len(list(p)))) - for k, p in groupby(sorted(data_rows, key=lambda item: (item[0], item[1])), lambda item: (item[0], item[1]))] - data_rows = [(k, ','.join((v[1] for v in p))) for k, p in groupby(data_rows, key=lambda item: item[0])] - odps.create_table(table_name, 'user string, item_list string', project=project) + data_rows = [ + (u, p, l) for u, ud in six.iteritems(items) for p, l in six.iteritems(ud) + ] + odps.create_table( + table_name, "user string, item string, label bigint", project=project + ) + elif mode == "kv": + data_rows = [ + (k[0], "{0}:{1}".format(k[1], len(list(p)))) + for k, p in groupby( + sorted(data_rows, key=lambda item: (item[0], item[1])), + lambda item: (item[0], item[1]), + ) + ] + data_rows = [ + (k, ",".join((v[1] for v in p))) + for k, p in groupby(data_rows, key=lambda item: item[0]) + ] + odps.create_table(table_name, "user string, item_list string", project=project) else: - odps.create_table(table_name, 'user string, item string, time bigint', project=project) + odps.create_table( + table_name, "user string, item string, time bigint", project=project + ) upload_ss = tunnel.create_upload_session(table_name) writer = upload_ss.open_record_writer(0) @@ -371,7 +544,7 @@ def create_user_item_table(odps, table_name, tunnel=None, mode=None, project=Non [rec.set(i, val) for i, val in enumerate(rd)] writer.write(rec) writer.close() - upload_ss.commit([0, ]) + upload_ss.commit([0]) @table_creator @@ -379,18 +552,20 @@ def create_dow_jones(odps, table_name, tunnel=None, project=None): fields = """quarter stock date open high low close volume percent_change_price percent_change_volume_over_last_wk previous_weeks_volume next_weeks_open next_weeks_close percent_change_next_weeks_price days_to_next_dividend percent_return_next_dividend""".strip().split() - field_types = OrderedDict([(fname, 'double') for fname in fields]) - field_types['quarter'] = 'bigint' - field_types['stock'] = 'string' - field_types['date'] = 'string' - fields_str = ','.join('`{0}` {1}'.format(k, v) for k, v in six.iteritems(field_types)) + field_types = OrderedDict([(fname, "double") for fname in fields]) + field_types["quarter"] = "bigint" + field_types["stock"] = "string" + field_types["date"] = "string" + fields_str = ",".join( + "`{0}` {1}".format(k, v) for k, v in six.iteritems(field_types) + ) odps.delete_table(table_name, if_exists=True, project=project) odps.create_table(table_name, fields_str, project=project) def iter_lines(): - for line in load_static_text_file('data/dow_jones.txt').splitlines(): - ldata = line.split(',') + for line in 
load_static_text_file("data/dow_jones.txt").splitlines(): + ldata = line.split(",") ldata[0] = int(ldata[0]) ldata[3:] = [float(v) if v else None for v in ldata[3:]] yield ldata @@ -402,10 +577,10 @@ def iter_lines(): 20 Newsgroups """ -NEWSGROUP_URL = 'http://repo.aliyun.com/shared_data/20news-bydate.tar.gz' -NEWSGROUP_DATA_NAME = '20news-bydate' -NEWSGROUP_ARCHIVE_NAME = '20news-bydate.tar.gz' -NEWSGROUP_CACHE_NAME = '20news-bydate.pkz' +NEWSGROUP_URL = "http://repo.aliyun.com/shared_data/20news-bydate.tar.gz" +NEWSGROUP_DATA_NAME = "20news-bydate" +NEWSGROUP_ARCHIVE_NAME = "20news-bydate.tar.gz" +NEWSGROUP_CACHE_NAME = "20news-bydate.pkz" NEWSGROUP_TRAIN_DIR = "20news-bydate-train" NEWSGROUP_TEST_DIR = "20news-bydate-test" NEWSGROUP_TRAIN_FOLDER = "20news-bydate-train" @@ -413,13 +588,13 @@ def iter_lines(): def download_newsgroup(target_dir, cache_dir): - target_tar = os.path.join(os.path.expanduser('~'), NEWSGROUP_ARCHIVE_NAME) + target_tar = os.path.join(os.path.expanduser("~"), NEWSGROUP_ARCHIVE_NAME) urlretrieve(NEWSGROUP_URL, target_tar) cache_newsgroup_tar(target_tar, target_dir, cache_dir) def cache_newsgroup_tar(target_tar, target_dir, cache_dir): - tarfile.open(target_tar, 'r:gz').extractall(path=target_dir) + tarfile.open(target_tar, "r:gz").extractall(path=target_dir) os.unlink(target_tar) if not os.path.exists(target_dir): @@ -434,42 +609,50 @@ def cache_newsgroup_tar(target_tar, target_dir, cache_dir): def load_files(path, encoding): objs = [] - for fn in glob.glob(os.path.join(path, '*')): + for fn in glob.glob(os.path.join(path, "*")): file_cat = os.path.basename(os.path.normpath(fn)) - for sfn in glob.glob(os.path.join(fn, '*')): + for sfn in glob.glob(os.path.join(fn, "*")): file_id = os.path.basename(os.path.normpath(sfn)) - with open(sfn, 'rb') as f: + with open(sfn, "rb") as f: objs.append((file_id, file_cat, f.read().decode(encoding))) return objs # Store a zipped pickle - cache = dict(train=load_files(train_path, encoding='latin1'), - test=load_files(test_path, encoding='latin1')) - compressed_content = codecs.encode(pickle.dumps(cache), 'zlib_codec') - with open(cache_path, 'wb') as f: + cache = dict( + train=load_files(train_path, encoding="latin1"), + test=load_files(test_path, encoding="latin1"), + ) + compressed_content = codecs.encode(pickle.dumps(cache), "zlib_codec") + with open(cache_path, "wb") as f: f.write(compressed_content) shutil.rmtree(target_dir) @table_creator -def create_newsgroup_table(odps, table_name, tunnel=None, data_part='train', project=None): +def create_newsgroup_table( + odps, table_name, tunnel=None, data_part="train", project=None +): cache_file = os.path.join(USER_DATA_REPO, NEWSGROUP_CACHE_NAME) if not os.path.exists(USER_DATA_REPO): os.makedirs(USER_DATA_REPO) if not os.path.exists(cache_file): - warnings.warn('We need to download data from ' + NEWSGROUP_URL + '.') - download_newsgroup(os.path.join(USER_DATA_REPO, NEWSGROUP_DATA_NAME), USER_DATA_REPO) + warnings.warn("We need to download data from " + NEWSGROUP_URL + ".") + download_newsgroup( + os.path.join(USER_DATA_REPO, NEWSGROUP_DATA_NAME), USER_DATA_REPO + ) - with open(cache_file, 'rb') as f: - cache = pickle.loads(codecs.decode(f.read(), 'zlib_codec')) + with open(cache_file, "rb") as f: + cache = pickle.loads(codecs.decode(f.read(), "zlib_codec")) if tunnel is None: tunnel = TableTunnel(odps, project=project) odps.delete_table(table_name, if_exists=True, project=project) - odps.create_table(table_name, 'id string, category string, message string', project=project) + 
odps.create_table( + table_name, "id string, category string, message string", project=project + ) upload_ss = tunnel.create_upload_session(table_name) writer = upload_ss.open_record_writer(0) @@ -479,15 +662,15 @@ def create_newsgroup_table(odps, table_name, tunnel=None, data_part='train', pro [rec.set(i, six.text_type(val)) for i, val in enumerate(line)] writer.write(rec) writer.close() - upload_ss.commit([0, ]) + upload_ss.commit([0]) """ MNIST """ -MNIST_PICKLED_URL = 'http://repo.aliyun.com/shared_data/mnist.pkl.gz' -MNIST_FILE = 'mnist.pkl.gz' +MNIST_PICKLED_URL = "http://repo.aliyun.com/shared_data/mnist.pkl.gz" +MNIST_FILE = "mnist.pkl.gz" mnist_unpickled = None @@ -501,10 +684,10 @@ def load_mnist_data(): if not os.path.exists(USER_DATA_REPO): os.makedirs(USER_DATA_REPO) if not os.path.exists(mnist_file): - warnings.warn('We need to download data from ' + MNIST_PICKLED_URL + '.') + warnings.warn("We need to download data from " + MNIST_PICKLED_URL + ".") urlretrieve(MNIST_PICKLED_URL, mnist_file) - with gzip.open(mnist_file, 'rb') as fobj: + with gzip.open(mnist_file, "rb") as fobj: mnist_unpickled = pickle.load(fobj) fobj.close() @@ -519,16 +702,16 @@ def create_mnist_table(odps, table_name, part_id=0, tunnel=None, project=None): tunnel = TableTunnel(odps) odps.delete_table(table_name, if_exists=True, project=project) - odps.create_table(table_name, 'feature string, class string') + odps.create_table(table_name, "feature string, class string") upload_ss = tunnel.create_upload_session(table_name) writer = upload_ss.open_record_writer(0) for feature, label in zip(train_data[0], train_data[1]): rec = upload_ss.new_record() - rec.set(0, six.text_type(','.join(str(n) for n in feature))) + rec.set(0, six.text_type(",".join(str(n) for n in feature))) rec.set(1, six.text_type(label)) writer.write(rec) writer.close() - upload_ss.commit([0, ]) + upload_ss.commit([0]) diff --git a/odps/inter.py b/odps/inter.py index 00e565d4..c8e7766e 100644 --- a/odps/inter.py +++ b/odps/inter.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -17,25 +17,24 @@ from __future__ import absolute_import import os -import sys -from hashlib import md5 import pickle import shutil +import sys +from hashlib import md5 from .compat import six from .config import options from .core import ODPS -from .errors import InteractiveError -from .models import TableSchema -from .utils import to_binary, build_pyodps_dir from .df.backends.frame import ResultFrame from .df.backends.odpssql.types import odps_schema_to_df_schema +from .errors import InteractiveError +from .models import TableSchema +from .utils import build_pyodps_dir, to_binary - -DEFAULT_ROOM_NAME = 'default' -ODPS_FILE_NAME = '__ODPS__' -INFO_FILE_NAME = '__INFO__' -OBJECT_FILE_NAME = '__OBJ__' +DEFAULT_ROOM_NAME = "default" +ODPS_FILE_NAME = "__ODPS__" +INFO_FILE_NAME = "__INFO__" +OBJECT_FILE_NAME = "__OBJ__" class Room(object): @@ -52,20 +51,27 @@ def _init(self): odps_file = os.path.join(self._room_dir, ODPS_FILE_NAME) if not os.path.exists(odps_file): - raise InteractiveError( - 'This room(%s) is not configured' % self._room_name) + raise InteractiveError("This room(%s) is not configured" % self._room_name) - with open(odps_file, 'rb') as f: + with open(odps_file, "rb") as f: try: obj = pickle.load(f) except pickle.UnpicklingError: - raise InteractiveError( - 'Failed to enter a room: %s' % self._room_name) - - def _config_rooms(access_id, access_key, default_project, endpoint, tunnel_endpoint=None, - seahawks_url=None, logview_host=None, default_schema=None, - region_name=None, **kwargs): - options.loads(kwargs.get('options', {})) + raise InteractiveError("Failed to enter a room: %s" % self._room_name) + + def _config_rooms( + access_id, + access_key, + default_project, + endpoint, + tunnel_endpoint=None, + seahawks_url=None, + logview_host=None, + default_schema=None, + region_name=None, + **kwargs + ): + options.loads(kwargs.get("options", {})) options.account = ODPS._build_account(access_id, access_key) options.endpoint = endpoint @@ -118,22 +124,22 @@ def store(self, name, obj, desc=None): path = self._obj_store_dir(name) if os.path.exists(path): - raise InteractiveError('%s already exists' % name) + raise InteractiveError("%s already exists" % name) os.makedirs(path) - with open(os.path.join(path, INFO_FILE_NAME), 'wb') as f: + with open(os.path.join(path, INFO_FILE_NAME), "wb") as f: pickle.dump((name, desc), f, protocol=0) - with open(os.path.join(path, OBJECT_FILE_NAME), 'wb') as f: + with open(os.path.join(path, OBJECT_FILE_NAME), "wb") as f: pickle.dump(obj, f, protocol=0) def fetch(self, name): path = self._obj_store_dir(name) if not os.path.exists(path): - raise InteractiveError('%s does not exist' % name) + raise InteractiveError("%s does not exist" % name) - with open(os.path.join(path, OBJECT_FILE_NAME), 'rb') as f: + with open(os.path.join(path, OBJECT_FILE_NAME), "rb") as f: return pickle.load(f) def drop(self, name): @@ -147,13 +153,13 @@ def list_stores(self): for obj_dir in os.listdir(self._room_dir): info_path = os.path.join(self._room_dir, obj_dir, INFO_FILE_NAME) if os.path.exists(info_path): - with open(info_path, 'rb') as f: + with open(info_path, "rb") as f: results.append(list(pickle.load(f))) return results def display(self): - schema = TableSchema.from_lists(['name', 'desc'], ['string'] * 2) + schema = TableSchema.from_lists(["name", "desc"], ["string"] * 2) schema = odps_schema_to_df_schema(schema) frame = ResultFrame(self.list_stores(), schema=schema, pandas=False) try: @@ -163,7 +169,7 @@ def display(self): df = frame.values df.columns.name = 
self._room_name - frame._values = df.set_index('name') + frame._values = df.set_index("name") except (ImportError, ValueError): pass @@ -171,7 +177,7 @@ def display(self): def _get_root_dir(): - rooms_dir = build_pyodps_dir('rooms') + rooms_dir = build_pyodps_dir("rooms") return os.path.join(rooms_dir, str(sys.version_info[0])) @@ -187,31 +193,55 @@ def _get_room_dir(room_name, mkdir=False): return room_dir -def setup(access_id, access_key, default_project, endpoint=None, tunnel_endpoint=None, - default_schema=None, region_name=None, seahawks_url=None, logview_host=None, - room=DEFAULT_ROOM_NAME, with_options=False, **kwargs): +def setup( + access_id, + access_key, + default_project, + endpoint=None, + tunnel_endpoint=None, + default_schema=None, + region_name=None, + seahawks_url=None, + logview_host=None, + room=DEFAULT_ROOM_NAME, + with_options=False, + **kwargs +): room_dir = _get_room_dir(room, mkdir=True) odps_file = os.path.join(room_dir, ODPS_FILE_NAME) if with_options: trivial_types = (six.string_types, six.integer_types, float, type(None)) options_dump = { - k: v for k, v in six.iteritems(options.dumps()) if isinstance(v, trivial_types) + k: v + for k, v in six.iteritems(options.dumps()) + if isinstance(v, trivial_types) } - kwargs['options'] = options_dump + kwargs["options"] = options_dump if os.path.exists(odps_file): raise InteractiveError( - 'This room(%s) has been configured before, ' - 'you can teardown it first' % room) - - obj = (access_id, access_key, default_project, endpoint, tunnel_endpoint, - seahawks_url, logview_host, default_schema, region_name, kwargs) + "This room(%s) has been configured before, " + "you can teardown it first" % room + ) - with open(odps_file, 'wb') as f: + obj = ( + access_id, + access_key, + default_project, + endpoint, + tunnel_endpoint, + seahawks_url, + logview_host, + default_schema, + region_name, + kwargs, + ) + + with open(odps_file, "wb") as f: pickle.dump(obj, f, protocol=0) - with open(os.path.join(room_dir, INFO_FILE_NAME), 'wb') as f: + with open(os.path.join(room_dir, INFO_FILE_NAME), "wb") as f: f.write(to_binary(room)) diff --git a/odps/ipython/__init__.py b/odps/ipython/__init__.py index d2aca310..028831af 100644 --- a/odps/ipython/__init__.py +++ b/odps/ipython/__init__.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from . import magics, completer +from . import completer, magics def load_ipython_extension(ipython): diff --git a/odps/ipython/completer.py b/odps/ipython/completer.py index d72ec39c..33635df1 100644 --- a/odps/ipython/completer.py +++ b/odps/ipython/completer.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,34 +19,34 @@ import re from collections import namedtuple -from ..compat import six, getargspec from .. 
import ODPS, options, utils +from ..compat import getargspec, six from ..inter import list_rooms -PROJECT_REGEX = re.compile(r'.*project *= *(?P<project>[^\(\),]+)') -NAME_REGEX = re.compile(r'.*name *= *(?P<name>[^\(\),]+)') +PROJECT_REGEX = re.compile(r".*project *= *(?P<project>[^\(\),]+)") +NAME_REGEX = re.compile(r".*name *= *(?P<name>[^\(\),]+)") TEMP_TABLE_PREFIXES = [ utils.TEMP_TABLE_PREFIX, - 'jdbc_temp_tbl_', - 'pai_temp_', - 'temp_xlib_table_' + "jdbc_temp_tbl_", + "pai_temp_", + "temp_xlib_table_", ] class RoomCompleter(object): def __init__(self, ipython=None): self._ipython = ipython - self._regex_str = r'^%(enter|setup|teardown|stores) +' + self._regex_str = r"^%(enter|setup|teardown|stores) +" def register(self): - self._ipython.set_hook('complete_command', self, re_key=self._regex_str) + self._ipython.set_hook("complete_command", self, re_key=self._regex_str) def __call__(self, completer, event): cursor_text = event.text_until_cursor - _, prefix = cursor_text.split(' ', 1) + _, prefix = cursor_text.split(" ", 1) prefix = prefix.strip() rooms = [n for n in list_rooms() if n.startswith(prefix)] - return rooms[:options.completion_size] + return rooms[: options.completion_size] class BaseCompleter(object):
@@ -61,7 +61,7 @@ def get_list_call(self, cursor_str, full_line=None): pass def register(self): - self._ipython.set_hook('complete_command', self, re_key=self._regex_str) + self._ipython.set_hook("complete_command", self, re_key=self._regex_str) def __call__(self, completer, event): cursor_text = event.text_until_cursor
@@ -73,7 +73,7 @@ def __call__(self, completer, event): if quote is None: quote = '\'' else: - quote = '' + quote = "" try: list_gen = self._ipython.ev(code)
@@ -84,7 +84,7 @@ def _is_temp_table(tn): return any(tn.startswith(p) for p in TEMP_TABLE_PREFIXES) def render_object(o): - if hasattr(o, 'name'): + if hasattr(o, "name"): if _is_temp_table(o.name): return None name = o.name
@@ -92,23 +92,29 @@ def render_object(o): name = str(o) return quote + utils.str_to_printable(name, auto_quote=False) + quote - names = [render_object(o) for idx, o in enumerate(list_gen) - if idx < options.completion_size] + names = [ + render_object(o) + for idx, o in enumerate(list_gen) + if idx < options.completion_size + ] return [n for n in names if n] class ObjectCompleter(BaseCompleter): @staticmethod def iter_methods(): - odps_methods = [m_name for m_name in dir(ODPS) if callable(getattr(ODPS, m_name)) - and not m_name.startswith('_')] - odps_listers = [m_name for m_name in odps_methods if m_name.startswith('list_')] + odps_methods = [ + m_name + for m_name in dir(ODPS) + if callable(getattr(ODPS, m_name)) and not m_name.startswith("_") + ] + odps_listers = [m_name for m_name in odps_methods if m_name.startswith("list_")] for m_name in odps_methods: - for prefix in ('get_', 'delete_', 'write_', 'read_'): + for prefix in ("get_", "delete_", "write_", "read_"): if m_name.startswith(prefix): lister = None - lister_prefix = m_name.replace(prefix, 'list_') + lister_prefix = m_name.replace(prefix, "list_") for l in odps_listers: if l.startswith(lister_prefix): lister = l
@@ -119,19 +125,21 @@ def iter_methods(): def build_regex(self): self._methods = {} - method_type = namedtuple('MethodType', 'use_prefix list_method') + method_type = namedtuple("MethodType", "use_prefix list_method") for m_name, lister in self.iter_methods(): arg_tuple = getargspec(getattr(ODPS, lister)) - use_prefix = 'prefix' in arg_tuple[0] - self._methods[m_name] = method_type(use_prefix=use_prefix, list_method=lister) + use_prefix = "prefix" in arg_tuple[0] + self._methods[m_name] = method_type( + use_prefix=use_prefix, list_method=lister + ) _regex_str = ( - r'(^|.*[\(\)\s,=]+)(?P<odps>[^\(\)\s,]+)\.(?P<getfn>' - + '|'.join(six.iterkeys(self._methods)) - + r')\(' + r"(^|.*[\(\)\s,=]+)(?P<odps>[^\(\)\s,]+)\.(?P<getfn>" + + "|".join(six.iterkeys(self._methods)) + + r")\(" ) - self._regex = re.compile(_regex_str + r'(?P<args>[^\(\)]*)$') + self._regex = re.compile(_regex_str + r"(?P<args>[^\(\)]*)$") return _regex_str def get_list_call(self, cursor_str, full_line=None):
@@ -140,22 +148,24 @@ def get_list_call(self, cursor_str, full_line=None): cmatch = self._regex.match(cursor_str) if cmatch is None: return None - odps_obj = cmatch.group('odps') - get_cmd = cmatch.group('getfn') - arg_str = cmatch.group('args').strip() + odps_obj = cmatch.group("odps") + get_cmd = cmatch.group("getfn") + arg_str = cmatch.group("args").strip() - project = 'None' - arg_start, arg_cursor = cmatch.span('args') + project = "None" + arg_start, arg_cursor = cmatch.span("args") arg_body = full_line[arg_start:] pmatch = PROJECT_REGEX.match(arg_body) if pmatch: - project = pmatch.group('project') + project = pmatch.group("project") nmatch = NAME_REGEX.match(arg_str) - name_str = nmatch.group('name') if nmatch else arg_str + name_str = nmatch.group("name") if nmatch else arg_str quote = None - if name_str != '' and not (name_str.startswith('\'') or name_str.startswith('\"')): + if name_str != "" and not ( + name_str.startswith('\'') or name_str.startswith('\"') + ): return None if name_str.endswith('\"') or name_str.endswith('\''): return None
@@ -168,9 +178,16 @@ def get_list_call(self, cursor_str, full_line=None): if name_str and self._methods[get_cmd].use_prefix: formatter = '{odps}.{func}(prefix="{prefix}", project={project})' else: - formatter = '{odps}.{func}(project={project})' - return formatter.format(odps=odps_obj, func=self._methods[get_cmd].list_method, - prefix=name_str, project=project), quote + formatter = "{odps}.{func}(project={project})" + return ( + formatter.format( + odps=odps_obj, + func=self._methods[get_cmd].list_method, + prefix=name_str, + project=project, + ), + quote, + ) def load_ipython_extension(ipython):
diff --git a/odps/ipython/magics.py b/odps/ipython/magics.py index eebc746d..5535fc58 100644 --- a/odps/ipython/magics.py +++ b/odps/ipython/magics.py
@@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
@@ -17,17 +17,21 @@ import logging import time -from ..inter import enter, setup, teardown, list_rooms -from ..compat import six, StringIO +from .. import ODPS, options from .. import types as odps_types -from ..
import options, ODPS -from ..utils import replace_sql_parameters, init_progress_ui -from ..df import DataFrame, Scalar, NullScalar, Delay +from ..compat import StringIO, six +from ..df import DataFrame, Delay, NullScalar, Scalar from ..df.backends.frame import ResultFrame from ..df.backends.odpssql.types import odps_schema_to_df_schema, odps_type_to_df_type +from ..inter import enter, list_rooms, setup, teardown from ..models import TableSchema from ..ui.common import html_notify -from ..ui.progress import create_instance_group, reload_instance_status, fetch_instance_group +from ..ui.progress import ( + create_instance_group, + fetch_instance_group, + reload_instance_status, +) +from ..utils import init_progress_ui, replace_sql_parameters logger = logging.getLogger(__name__) @@ -36,21 +40,23 @@ np_int_types = map(np.dtype, [np.int_, np.int8, np.int16, np.int32, np.int64]) np_float_types = map(np.dtype, [np.float_, np.float16, np.float32, np.float64]) - np_to_odps_types = dict([(t, odps_types.bigint) for t in np_int_types] + - [(t, odps_types.double) for t in np_float_types]) + np_to_odps_types = dict( + [(t, odps_types.bigint) for t in np_int_types] + + [(t, odps_types.double) for t in np_float_types] + ) except ImportError: pass try: - from IPython.core.magic import Magics, magics_class, line_cell_magic, line_magic + from IPython.core.magic import Magics, line_cell_magic, line_magic, magics_class except ImportError: # skipped for ci ODPSSql = None pass else: + @magics_class class ODPSSql(Magics): - _odps = None def _set_odps(self): @@ -59,13 +65,15 @@ def _set_odps(self): if options.account is not None and options.default_project is not None: self._odps = ODPS._from_account( - options.account, options.default_project, - endpoint=options.endpoint, tunnel_endpoint=options.tunnel.endpoint + options.account, + options.default_project, + endpoint=options.endpoint, + tunnel_endpoint=options.tunnel.endpoint, ) else: self._odps = enter().odps - @line_magic('enter') + @line_magic("enter") def enter(self, line): room = line.strip() if room: @@ -75,30 +83,30 @@ def enter(self, line): r = enter() self._odps = r.odps - if 'o' not in self.shell.user_ns: - self.shell.user_ns['o'] = self._odps - self.shell.user_ns['odps'] = self._odps + if "o" not in self.shell.user_ns: + self.shell.user_ns["o"] = self._odps + self.shell.user_ns["odps"] = self._odps return r - @line_magic('setup') + @line_magic("setup") def setup(self, line): args = line.strip().split() name, args = args[0], args[1:] setup(*args, room=name) - html_notify('Setup succeeded') + html_notify("Setup succeeded") - @line_magic('teardown') + @line_magic("teardown") def teardown(self, line): name = line.strip() teardown(name) - html_notify('Teardown succeeded') + html_notify("Teardown succeeded") - @line_magic('list_rooms') + @line_magic("list_rooms") def list_rooms(self, line): return list_rooms() - @line_magic('stores') + @line_magic("stores") def list_stores(self, line): line = line.strip() @@ -112,8 +120,11 @@ def list_stores(self, line): @staticmethod def _get_task_percent(task_progress): if len(task_progress.stages) > 0: - all_percent = sum((float(stage.terminated_workers) / stage.total_workers) - for stage in task_progress.stages if stage.total_workers > 0) + all_percent = sum( + (float(stage.terminated_workers) / stage.total_workers) + for stage in task_progress.stages + if stage.total_workers > 0 + ) return all_percent / len(task_progress.stages) else: return 0 @@ -121,12 +132,12 @@ def _get_task_percent(task_progress): def 
_to_stdout(cls, msg): print(msg) - @line_magic('set') + @line_magic("set") def set_hint(self, line): - if '=' not in line: - raise ValueError('Hint for sql is not allowed') + if "=" not in line: + raise ValueError("Hint for sql is not allowed") - key, val = line.strip().strip(';').split('=', 1) + key, val = line.strip().strip(";").split("=", 1) key, val = key.strip(), val.strip() settings = options.sql.settings @@ -135,22 +146,22 @@ def set_hint(self, line): else: options.sql.settings[key] = val - @line_cell_magic('sql') - def execute(self, line, cell=''): + @line_cell_magic("sql") + def execute(self, line, cell=""): self._set_odps() - content = line + '\n' + cell + content = line + "\n" + cell content = content.strip() sql = None hints = dict() - splits = content.split(';') + splits = content.split(";") for s in splits: stripped = s.strip() - if stripped.lower().startswith('set '): - hint = stripped.split(' ', 1)[1] - k, v = hint.split('=', 1) + if stripped.lower().startswith("set "): + hint = stripped.split(" ", 1)[1] + k, v = hint.split("=", 1) k, v = k.strip(), v.strip() hints[k] = v elif len(stripped) == 0: @@ -159,32 +170,40 @@ def execute(self, line, cell=''): if sql is None: sql = s else: - sql = '%s;%s' % (sql, s) + sql = "%s;%s" % (sql, s) # replace user defined parameters sql = replace_sql_parameters(sql, self.shell.user_ns) if sql: progress_ui = init_progress_ui() - group_id = create_instance_group('SQL Query') + group_id = create_instance_group("SQL Query") progress_ui.add_keys(group_id) instance = self._odps.run_sql(sql, hints=hints) if logger.getEffectiveLevel() <= logging.INFO: - logger.info('Instance ID: %s\n Log view: %s', instance.id, instance.get_logview_address()) + logger.info( + "Instance ID: %s\n Log view: %s", + instance.id, + instance.get_logview_address(), + ) reload_instance_status(self._odps, group_id, instance.id) - progress_ui.status('Executing') + progress_ui.status("Executing") percent = 0 while not instance.is_terminated(retry=True): last_percent = percent reload_instance_status(self._odps, group_id, instance.id) - inst_progress = fetch_instance_group(group_id).instances.get(instance.id) + inst_progress = fetch_instance_group(group_id).instances.get( + instance.id + ) if inst_progress is not None and len(inst_progress.tasks) > 0: - percent = sum(self._get_task_percent(task) - for task in six.itervalues(inst_progress.tasks)) / len(inst_progress.tasks) + percent = sum( + self._get_task_percent(task) + for task in six.itervalues(inst_progress.tasks) + ) / len(inst_progress.tasks) else: percent = 0 @@ -202,18 +221,23 @@ def execute(self, line, cell=''): with instance.open_reader() as reader: try: import pandas as pd + try: - from pandas.io.parsers import ParserError as CParserError + from pandas.io.parsers import ( + ParserError as CParserError, + ) except ImportError: pass try: - from pandas.parser import CParserError + from pandas.parser import CParserError # noqa except ImportError: - CParserError = ValueError + CParserError = ValueError # noqa - if not hasattr(reader, 'raw'): - res = ResultFrame([rec.values for rec in reader], - schema=odps_schema_to_df_schema(reader._schema)) + if not hasattr(reader, "raw"): + res = ResultFrame( + [rec.values for rec in reader], + schema=odps_schema_to_df_schema(reader._schema), + ) else: try: res = pd.read_csv(StringIO(reader.raw)) @@ -222,73 +246,94 @@ def execute(self, line, cell=''): else: cols = res.columns.tolist() schema = odps_schema_to_df_schema( - TableSchema.from_lists(cols, ['string' for _ in cols])) + 
TableSchema.from_lists( + cols, ["string" for _ in cols] + ) + ) res = ResultFrame(res.values, schema=schema) except (ValueError, CParserError): res = reader.raw except (ImportError, ValueError): - if not hasattr(reader, 'raw'): - res = ResultFrame([rec.values for rec in reader], - schema=odps_schema_to_df_schema(reader._schema)) + if not hasattr(reader, "raw"): + res = ResultFrame( + [rec.values for rec in reader], + schema=odps_schema_to_df_schema(reader._schema), + ) else: try: - columns = [odps_types.Column(name=col.name, - typo=odps_type_to_df_type(col.type)) - for col in reader._columns] + columns = [ + odps_types.Column( + name=col.name, + typo=odps_type_to_df_type(col.type), + ) + for col in reader._columns + ] res = ResultFrame(list(reader), columns=columns) except TypeError: res = reader.raw - html_notify('SQL execution succeeded') + html_notify("SQL execution succeeded") return res finally: progress_ui.close() - @line_magic('persist') + @line_magic("persist") def persist(self, line): try: import pandas as pd + has_pandas = True except (ImportError, ValueError): has_pandas = False self._set_odps() - line = line.strip().strip(';') + line = line.strip().strip(";") frame_name, table_name = line.split(None, 1) - if '.' in table_name: - parts = table_name.split('.') + if "." in table_name: + parts = table_name.split(".") if len(parts) == 3: project_name, schema_name, table_name = parts else: project_name, table_name = parts schema_name = None - if table_name.startswith('`') and table_name.endswith('`'): + if table_name.startswith("`") and table_name.endswith("`"): table_name = table_name[1:-1] else: project_name = schema_name = None frame = self.shell.user_ns[frame_name] - if self._odps.exist_table(table_name, project=project_name, schema=schema_name): - raise TypeError('%s already exists' % table_name) + if self._odps.exist_table( + table_name, project=project_name, schema=schema_name + ): + raise TypeError("%s already exists" % table_name) if isinstance(frame, DataFrame): - frame.persist(name=table_name, project=project_name, schema=schema_name, notify=False) + frame.persist( + name=table_name, + project=project_name, + schema=schema_name, + notify=False, + ) elif has_pandas and isinstance(frame, pd.DataFrame): frame = DataFrame(frame) - frame.persist(name=table_name, project=project_name, schema=schema_name, notify=False) - html_notify('Persist succeeded') - + frame.persist( + name=table_name, + project=project_name, + schema=schema_name, + notify=False, + ) + html_notify("Persist succeeded") def load_ipython_extension(ipython): ipython.register_magics(ODPSSql) # Do global import when load extension - ipython.user_ns['DataFrame'] = DataFrame - ipython.user_ns['Scalar'] = Scalar - ipython.user_ns['NullScalar'] = NullScalar - ipython.user_ns['options'] = options - ipython.user_ns['TableSchema'] = TableSchema - ipython.user_ns['Delay'] = Delay + ipython.user_ns["DataFrame"] = DataFrame + ipython.user_ns["Scalar"] = Scalar + ipython.user_ns["NullScalar"] = NullScalar + ipython.user_ns["options"] = options + ipython.user_ns["TableSchema"] = TableSchema + ipython.user_ns["Delay"] = Delay diff --git a/odps/ipython/tests/test_completer.py b/odps/ipython/tests/test_completer.py index 4fe109e7..45bd4665 100644 --- a/odps/ipython/tests/test_completer.py +++ b/odps/ipython/tests/test_completer.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -22,31 +22,39 @@ def test_object_completer_call(): assert completer.get_list_call("o3.get_bizarre(abc") is None assert completer.get_list_call("table = o1.get_table(") == ( - "o1.list_tables(project=None)", None + "o1.list_tables(project=None)", + None, ) assert completer.get_list_call(", o2_a.get_table(") == ( - "o2_a.list_tables(project=None)", None + "o2_a.list_tables(project=None)", + None, ) assert completer.get_list_call("o3.get_table(abc") is None assert completer.get_list_call('o3.get_table("abc", project=') is None assert completer.get_list_call('(o4.delete_table(" def') == ( - 'o4.list_tables(prefix=" def", project=None)', '"' + 'o4.list_tables(prefix=" def", project=None)', + '"', ) assert completer.get_list_call("( o5.get_table( 'ghi") == ( - 'o5.list_tables(prefix="ghi", project=None)', "'" + 'o5.list_tables(prefix="ghi", project=None)', + "'", ) assert completer.get_list_call("obj.o6.write_table( 'ghi") == ( - 'obj.o6.list_tables(prefix="ghi", project=None)', "'" + 'obj.o6.list_tables(prefix="ghi", project=None)', + "'", ) assert completer.get_list_call( - 'obj.o7.get_table(project= "another_proj", name= \'ghi' - ) == ('obj.o7.list_tables(prefix="ghi", project="another_proj")', "'") + 'obj.o7.get_table(project= "another_proj", name= \'ghi' + ) == ('obj.o7.list_tables(prefix="ghi", project="another_proj")', "'") assert completer.get_list_call( - "obj.o8.get_table('ghi", - 'obj.o8.get_table(\'ghi, project= "another_proj"', - ) == ('obj.o8.list_tables(prefix="ghi", project="another_proj")', "'") + "obj.o8.get_table('ghi", + 'obj.o8.get_table(\'ghi, project= "another_proj"', + ) == ('obj.o8.list_tables(prefix="ghi", project="another_proj")', "'") assert completer.get_list_call( - "obj.o9.get_table(name = 'ghi", - 'obj.o9.get_table(name = \'ghi, project= "another_proj"', - ) == ('obj.o9.list_tables(prefix="ghi", project="another_proj")', "'") - assert completer.get_list_call('obj.o10.get_table(project= "another_proj", \'ghi') is None + "obj.o9.get_table(name = 'ghi", + 'obj.o9.get_table(name = \'ghi, project= "another_proj"', + ) == ('obj.o9.list_tables(prefix="ghi", project="another_proj")', "'") + assert ( + completer.get_list_call('obj.o10.get_table(project= "another_proj", \'ghi') + is None + ) diff --git a/odps/ipython/tests/test_magics.py b/odps/ipython/tests/test_magics.py index 26e338ad..5c6f7bb7 100644 --- a/odps/ipython/tests/test_magics.py +++ b/odps/ipython/tests/test_magics.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -19,7 +19,7 @@ import pytest from ...config import options -from ...tests.core import tn, get_result +from ...tests.core import get_result, tn from ..magics import ODPSSql try: @@ -28,7 +28,7 @@ has_ipython = True except ImportError: has_ipython = False - pytestmark = pytest.mark.skip('Skipped when no IPython is detected.') + pytestmark = pytest.mark.skip("Skipped when no IPython is detected.") @pytest.fixture(autouse=True) @@ -41,13 +41,13 @@ def tunnel_config(): def test_load_extension(): - from ..magics import load_ipython_extension, Magics + from ..magics import Magics, load_ipython_extension def register_func(magics): magics_store.append(magics) magics_store = [] - FakeShell = namedtuple('FakeShell', 'user_ns register_magics') + FakeShell = namedtuple("FakeShell", "user_ns register_magics") fake_shell = FakeShell(user_ns={}, register_magics=register_func) load_ipython_extension(fake_shell) @@ -56,28 +56,28 @@ def register_func(magics): def test_execute_sql(odps): - FakeShell = namedtuple('FakeShell', 'user_ns') + FakeShell = namedtuple("FakeShell", "user_ns") magic_class = ODPSSql(FakeShell(user_ns={})) magic_class._odps = odps - test_table_name = tn('pyodps_t_test_sql_magic') - test_content = [['line1'], ['line2']] + test_table_name = tn("pyodps_t_test_sql_magic") + test_content = [["line1"], ["line2"]] odps.delete_table(test_table_name, if_exists=True) - odps.create_table(test_table_name, 'col string', lifecycle=1) + odps.create_table(test_table_name, "col string", lifecycle=1) odps.write_table(test_table_name, test_content) options.tunnel.use_instance_tunnel = False - result = magic_class.execute('select * from %s' % test_table_name) + result = magic_class.execute("select * from %s" % test_table_name) assert get_result(result) == test_content options.tunnel.use_instance_tunnel = True - result = magic_class.execute('select * from %s' % test_table_name) + result = magic_class.execute("select * from %s" % test_table_name) assert get_result(result) == test_content - result = magic_class.execute('show tables') + result = magic_class.execute("show tables") assert len(result) > 0 - table_name = tn('pyodps_test_magics_create_table_result') - magic_class.execute('create table %s (col string) lifecycle 1' % table_name) - magic_class.execute('drop table %s' % table_name) + table_name = tn("pyodps_test_magics_create_table_result") + magic_class.execute("create table %s (col string) lifecycle 1" % table_name) + magic_class.execute("drop table %s" % table_name) diff --git a/odps/lab_extension/.eslintignore b/odps/lab_extension/.eslintignore deleted file mode 100644 index 5c99ba78..00000000 --- a/odps/lab_extension/.eslintignore +++ /dev/null @@ -1,5 +0,0 @@ -node_modules -dist -coverage -**/*.d.ts -tests diff --git a/odps/lab_extension/.eslintrc.js b/odps/lab_extension/.eslintrc.js deleted file mode 100644 index d66148c1..00000000 --- a/odps/lab_extension/.eslintrc.js +++ /dev/null @@ -1,39 +0,0 @@ -module.exports = { - extends: [ - 'eslint:recommended', - 'plugin:@typescript-eslint/eslint-recommended', - 'plugin:@typescript-eslint/recommended', - 'plugin:prettier/recommended' - ], - parser: '@typescript-eslint/parser', - parserOptions: { - project: 'tsconfig.json', - sourceType: 'module' - }, - plugins: ['@typescript-eslint'], - rules: { - '@typescript-eslint/naming-convention': [ - 'error', - { - 'selector': 'interface', - 'format': ['PascalCase'], - 'custom': { - 'regex': '^I[A-Z]', - 'match': true - } - } - ], - '@typescript-eslint/no-unused-vars': ['warn', { args: 'none' }], - 
'@typescript-eslint/no-explicit-any': 'off', - '@typescript-eslint/no-namespace': 'off', - '@typescript-eslint/no-use-before-define': 'off', - '@typescript-eslint/quotes': [ - 'error', - 'single', - { avoidEscape: true, allowTemplateLiterals: false } - ], - curly: ['error', 'all'], - eqeqeq: 'error', - 'prefer-arrow-callback': 'error' - } -}; diff --git a/odps/lab_extension/.gitignore b/odps/lab_extension/.gitignore deleted file mode 100644 index 2aeea5c8..00000000 --- a/odps/lab_extension/.gitignore +++ /dev/null @@ -1,112 +0,0 @@ -*.bundle.* -lib/ -node_modules/ -*.egg-info/ -.ipynb_checkpoints -*.tsbuildinfo -pyodps-lab-extension/labextension - -# Created by https://www.gitignore.io/api/python -# Edit at https://www.gitignore.io/?templates=python - -### Python ### -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -pip-wheel-metadata/ -share/python-wheels/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# pyenv -.python-version - -# celery beat schedule file -celerybeat-schedule - -# SageMath parsed files -*.sage.py - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# Mr Developer -.mr.developer.cfg -.project -.pydevproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# End of https://www.gitignore.io/api/python - -# OSX files -.DS_Store diff --git a/odps/lab_extension/.prettierignore b/odps/lab_extension/.prettierignore deleted file mode 100644 index d2d608b0..00000000 --- a/odps/lab_extension/.prettierignore +++ /dev/null @@ -1,5 +0,0 @@ -node_modules -**/node_modules -**/lib -**/package.json -pyodps-lab-extension diff --git a/odps/lab_extension/.prettierrc b/odps/lab_extension/.prettierrc deleted file mode 100644 index b0a179d4..00000000 --- a/odps/lab_extension/.prettierrc +++ /dev/null @@ -1,5 +0,0 @@ -{ - "singleQuote": true, - "trailingComma": "none", - "arrowParens": "avoid" -} diff --git a/odps/lab_extension/LICENSE b/odps/lab_extension/LICENSE deleted file mode 100644 index 58b8170a..00000000 --- a/odps/lab_extension/LICENSE +++ /dev/null @@ -1,28 +0,0 @@ -BSD 3-Clause License - -Copyright (c) 2020, qianjun.wqj All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -* Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - -* Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. 
- -* Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/odps/lab_extension/MANIFEST.in b/odps/lab_extension/MANIFEST.in deleted file mode 100644 index 2b17d2ea..00000000 --- a/odps/lab_extension/MANIFEST.in +++ /dev/null @@ -1,24 +0,0 @@ -include LICENSE -include README.md -include pyproject.toml -include jupyter-config/pyodps-lab-extension.json - -include package.json -include install.json -include ts*.json -include yarn.lock - -graft pyodps-lab-extension/labextension - -# Javascript files -graft src -graft style -prune **/node_modules -prune lib - -# Patterns to exclude from any directory -global-exclude *~ -global-exclude *.pyc -global-exclude *.pyo -global-exclude .git -global-exclude .ipynb_checkpoints diff --git a/odps/lab_extension/README.md b/odps/lab_extension/README.md deleted file mode 100644 index efbbfe2b..00000000 --- a/odps/lab_extension/README.md +++ /dev/null @@ -1,62 +0,0 @@ -# pyodps-lab-extension - -![Github Actions Status](https://github.com/github_username/pyodps-lab-extension/workflows/Build/badge.svg) - -A jupyterlab extension for better odps sql support - - - -## Requirements - -* JupyterLab >= 3.0 - -## Install - -```bash -pip install pyodps-lab-extension -``` - - -## Contributing - -### Development install - -Note: You will need NodeJS to build the extension package. - -The `jlpm` command is JupyterLab's pinned version of -[yarn](https://yarnpkg.com/) that is installed with JupyterLab. You may use -`yarn` or `npm` in lieu of `jlpm` below. - -```bash -# Clone the repo to your local environment -# Change directory to the pyodps-lab-extension directory -# Install package in development mode -pip install -e . -# Link your development version of the extension with JupyterLab -jupyter labextension develop . --overwrite -# Rebuild extension Typescript source after making changes -jlpm run build -``` - -You can watch the source directory and run JupyterLab at the same time in different terminals to watch for changes in the extension's source and automatically rebuild the extension. - -```bash -# Watch the source directory in one terminal, automatically rebuilding when needed -jlpm run watch -# Run JupyterLab in another terminal -jupyter lab -``` - -With the watch command running, every saved change will immediately be built locally and available in your running JupyterLab. Refresh JupyterLab to load the change in your browser (you may need to wait several seconds for the extension to be rebuilt). - -By default, the `jlpm run build` command generates the source maps for this extension to make it easier to debug using the browser dev tools. 
To also generate source maps for the JupyterLab core extensions, you can run the following command: - -```bash -jupyter lab build --minimize=False -``` - -### Uninstall - -```bash -pip uninstall pyodps-lab-extension -``` diff --git a/odps/lab_extension/install.json b/odps/lab_extension/install.json deleted file mode 100644 index 57ad6b51..00000000 --- a/odps/lab_extension/install.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "packageManager": "python", - "packageName": "pyodps-lab-extension", - "uninstallInstructions": "Use your Python package manager (pip, conda, etc.) to uninstall the package pyodps-lab-extension" -} diff --git a/odps/lab_extension/package.json b/odps/lab_extension/package.json deleted file mode 100644 index 321f7939..00000000 --- a/odps/lab_extension/package.json +++ /dev/null @@ -1,92 +0,0 @@ -{ - "name": "pyodps-lab-extension", - "version": "0.1.0", - "description": "A jupyterlab extension for better odps sql support", - "keywords": [ - "jupyter", - "jupyterlab", - "jupyterlab-extension" - ], - "homepage": "https://github.com/github_username/pyodps-lab-extension", - "bugs": { - "url": "https://github.com/github_username/pyodps-lab-extension/issues" - }, - "license": "BSD-3-Clause", - "author": { - "name": "qianjun.wqj", - "email": "qianjun.wqj@alibaba-inc.com" - }, - "files": [ - "lib/**/*.{d.ts,eot,gif,html,jpg,js,js.map,json,png,svg,woff2,ttf}", - "style/**/*.{css,.js,eot,gif,html,jpg,json,png,svg,woff2,ttf}" - ], - "main": "lib/index.js", - "types": "lib/index.d.ts", - "style": "style/index.css", - "repository": { - "type": "git", - "url": "https://github.com/github_username/pyodps-lab-extension.git" - }, - "scripts": { - "build": "jlpm run build:lib && jlpm run build:labextension:dev", - "build:prod": "jlpm run build:lib && jlpm run build:labextension", - "build:labextension": "jupyter labextension build .", - "build:labextension:dev": "jupyter labextension build --development True .", - "build:lib": "tsc", - "clean": "jlpm run clean:lib", - "clean:lib": "rimraf lib tsconfig.tsbuildinfo", - "clean:labextension": "rimraf pyodps-lab-extension/labextension", - "clean:all": "jlpm run clean:lib && jlpm run clean:labextension", - "eslint": "eslint . --ext .ts,.tsx --fix", - "eslint:check": "eslint . --ext .ts,.tsx", - "install:extension": "jupyter labextension develop --overwrite .", - "prepare": "jlpm run clean && jlpm run build:prod", - "watch": "run-p watch:src watch:labextension", - "watch:src": "tsc -w", - "watch:labextension": "jupyter labextension watch ." 
- }, - "dependencies": { - "@jupyterlab/application": "^3.0.0", - "@jupyterlab/cells": "^3.0.0", - "@jupyterlab/apputils": "^3.0.5", - "@jupyterlab/coreutils": "^5.0.3", - "@jupyterlab/notebook": "^3.0.0", - "@jupyterlab/rendermime": "^3.0.6", - "@jupyterlab/services": "^6.0.5", - "@lumino/signaling": "^1.4.3", - "@nteract/data-explorer": "^8.2.9", - "async": "^3.2.0", - "axios": "^0.21.0", - "lodash": "^4.17.20", - "monaco-editor": "^0.21.2", - "react": "~16.9.0", - "react-dom": "~16.9.0", - "react-monaco-editor": "^0.41.2", - "styled-components": "^5.2.1" - }, - "devDependencies": { - "@jupyterlab/builder": "^3.0.0", - "@types/async": "^3.2.5", - "@types/axios": "^0.14.0", - "@types/lodash": "^4.14.165", - "@types/react-dom": "^17.0.1", - "@typescript-eslint/eslint-plugin": "^4.8.1", - "@typescript-eslint/parser": "^4.16.1", - "eslint": "^7.14.0", - "eslint-config-prettier": "^6.15.0", - "eslint-plugin-prettier": "^3.1.4", - "npm-run-all": "^4.1.5", - "prettier": "^2.1.1", - "rimraf": "^3.0.2", - "typescript": "~4.1.3" - }, - "sideEffects": [ - "style/*.css", - "style/index.js" - ], - "styleModule": "style/index.js", - "jupyterlab": { - "extension": true, - "outputDir": "pyodps-lab-extension/labextension" - } -} diff --git a/odps/lab_extension/pyodps-lab-extension/_version.py b/odps/lab_extension/pyodps-lab-extension/_version.py deleted file mode 100644 index b96d38b7..00000000 --- a/odps/lab_extension/pyodps-lab-extension/_version.py +++ /dev/null @@ -1,19 +0,0 @@ -import json -from pathlib import Path - -__all__ = ["__version__"] - -def _fetchVersion(): - HERE = Path(__file__).parent.resolve() - - for settings in HERE.rglob("package.json"): - try: - with settings.open() as f: - return json.load(f)["version"] - except FileNotFoundError: - pass - - raise FileNotFoundError(f"Could not find package.json under dir {HERE!s}") - -__version__ = _fetchVersion() - diff --git a/odps/lab_extension/pyproject.toml b/odps/lab_extension/pyproject.toml deleted file mode 100644 index ba04c53f..00000000 --- a/odps/lab_extension/pyproject.toml +++ /dev/null @@ -1,3 +0,0 @@ -[build-system] -requires = ["jupyter_packaging~=0.7.9", "jupyterlab~=3.0", "setuptools>=40.8.0", "wheel"] -build-backend = "setuptools.build_meta" diff --git a/odps/lab_extension/setup.py b/odps/lab_extension/setup.py deleted file mode 100644 index fea999c1..00000000 --- a/odps/lab_extension/setup.py +++ /dev/null @@ -1,94 +0,0 @@ -""" -pyodps-lab-extension setup -""" -import json -from pathlib import Path - -from jupyter_packaging import ( - create_cmdclass, - install_npm, - ensure_targets, - combine_commands, - skip_if_exists -) -import setuptools - -HERE = Path(__file__).parent.resolve() - -# The name of the project -name = "pyodps-lab-extension" - -lab_path = (HERE / name / "labextension") - -# Representative files that should exist after a successful build -jstargets = [ - str(lab_path / "package.json"), -] - -package_data_spec = { - name: ["*"], -} - -labext_name = "pyodps-lab-extension" - -data_files_spec = [ - ("share/jupyter/labextensions/%s" % labext_name, str(lab_path), "**"), - ("share/jupyter/labextensions/%s" % labext_name, str(HERE), "install.json"), -] - -cmdclass = create_cmdclass("jsdeps", - package_data_spec=package_data_spec, - data_files_spec=data_files_spec -) - -js_command = combine_commands( - install_npm(HERE, build_cmd="build:prod", npm=["jlpm"]), - ensure_targets(jstargets), -) - -is_repo = (HERE / ".git").exists() -if is_repo: - cmdclass["jsdeps"] = js_command -else: - cmdclass["jsdeps"] = 
skip_if_exists(jstargets, js_command) - -long_description = (HERE / "README.md").read_text() - -# Get the package info from package.json -pkg_json = json.loads((HERE / "package.json").read_bytes()) - -setup_args = dict( - name=name, - version=pkg_json["version"], - url=pkg_json["homepage"], - author=pkg_json["author"]["name"], - author_email=pkg_json["author"]["email"], - description=pkg_json["description"], - license=pkg_json["license"], - long_description=long_description, - long_description_content_type="text/markdown", - cmdclass=cmdclass, - packages=setuptools.find_packages(), - install_requires=[ - "jupyterlab~=3.0", - ], - zip_safe=False, - include_package_data=True, - python_requires=">=3.6", - platforms="Linux, Mac OS X, Windows", - keywords=["Jupyter", "JupyterLab", "JupyterLab3"], - classifiers=[ - "License :: OSI Approved :: BSD License", - "Programming Language :: Python", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Framework :: Jupyter", - ], -) - - -if __name__ == "__main__": - setuptools.setup(**setup_args) diff --git a/odps/lab_extension/src/Container/index.ts b/odps/lab_extension/src/Container/index.ts deleted file mode 100644 index 50d10287..00000000 --- a/odps/lab_extension/src/Container/index.ts +++ /dev/null @@ -1,19 +0,0 @@ -import { Token } from '@lumino/coreutils'; - -export class Container { - private map = new WeakMap, any>(); - - public set(token: Token, impl: any) { - this.map.set(token, impl); - } - - public get(token: Token): T { - if (this.map.has(token)) { - return this.map.get(token); - } else { - throw ReferenceError(); - } - } -} - -export const container = new Container(); diff --git a/odps/lab_extension/src/DataVisualization/RenderTableSchema.tsx b/odps/lab_extension/src/DataVisualization/RenderTableSchema.tsx deleted file mode 100644 index 7d674072..00000000 --- a/odps/lab_extension/src/DataVisualization/RenderTableSchema.tsx +++ /dev/null @@ -1,59 +0,0 @@ -import { Widget } from '@lumino/widgets'; -import { IRenderMime } from '@jupyterlab/rendermime-interfaces'; -import { Message } from '@lumino/messaging'; -import ReactDOM from 'react-dom'; -import React from 'react'; -import { DataExplorer, Toolbar, Viz } from '@nteract/data-explorer'; -import { container } from '../Container'; -import { IThemeManager } from '@jupyterlab/apputils'; - -export class RenderTableSchema extends Widget implements IRenderMime.IRenderer { - private readonly _mimeType: string; - private readonly _themeManager: IThemeManager; - private get isLightTheme() { - return this._themeManager.isLight(this._themeManager.theme); - } - - /** - * Create a new widget for rendering JSON. - */ - constructor(options: IRenderMime.IRendererOptions) { - super(); - this._mimeType = options.mimeType; - this._themeManager = container.get(IThemeManager); - } - - /** - * Render JSON into this widget's node. - */ - public renderModel(model: IRenderMime.IMimeModel): Promise { - const data = model.data[this._mimeType] || ({} as any); - return new Promise(resolve => { - ReactDOM.render( -
-          <DataExplorer data={data} theme={this.isLightTheme ? 'light' : 'dark'}> -            <Viz /> -            <Toolbar /> -          </DataExplorer>
, - this.node, - resolve - ); - }); - } - - /** - * Called before the widget is detached from the DOM. - */ - protected onBeforeDetach(msg: Message): void { - // Unmount the component so it can tear down. - ReactDOM.unmountComponentAtNode(this.node); - } -} diff --git a/odps/lab_extension/src/Editor/ConfigBuilder/EditorConfigBuilder.ts b/odps/lab_extension/src/Editor/ConfigBuilder/EditorConfigBuilder.ts deleted file mode 100644 index 1d39bcd8..00000000 --- a/odps/lab_extension/src/Editor/ConfigBuilder/EditorConfigBuilder.ts +++ /dev/null @@ -1,31 +0,0 @@ -export interface IEditorConfigBuilder { - /** - * set Theme automatically - * dependents on IThemeManager - */ - autoTheme(): this; - - /** - * the Editor uri, make sure uri is unique - * if you wants multiple editor instance - * @param uri - */ - uri(uri: string): this; - - /** - * the sql content - * @param content - */ - content(content: string): this; - - /** - * generate LSP Url automatically - * dependents on window.location.host - */ - autoLSPUrl(): this; - - /** - * build configs - */ - build(): object; -} diff --git a/odps/lab_extension/src/Editor/ConfigBuilder/LSPEditorConfigBuilder.ts b/odps/lab_extension/src/Editor/ConfigBuilder/LSPEditorConfigBuilder.ts deleted file mode 100644 index ace989e5..00000000 --- a/odps/lab_extension/src/Editor/ConfigBuilder/LSPEditorConfigBuilder.ts +++ /dev/null @@ -1,105 +0,0 @@ -import { container } from '../../Container'; -import { IThemeManager } from '@jupyterlab/apputils'; -import { IEditorConfigBuilder } from './EditorConfigBuilder'; -import { isInner } from '../../Utils/isInner'; - -export class LSPEditorConfigBuilder implements IEditorConfigBuilder { - private config = { - language: 'odps', - uri: '123', - editorOptions: { - theme: 'vs-dark', - readOnly: false, - minimap: { enabled: true }, - fontSize: 14, - wordWrap: 'wordWrapColumn', - wordWrapColumn: 80 - }, - useLsp: true, - lspOptions: { - wsUrl: 'wss://lsp-cn-shanghai.data.aliyun.com/lsp', - projectInfo: { - projectId: 123, - projectIdentifier: 'ots_etl' - }, - settings: { - autoComplete: [ - 'keyword', - 'white', - 'snippet', - 'project', - 'table', - 'column' - ], - codeStyle: 1, - faultCheck: true - } - }, - content: '' - }; - - /** - * set Theme automatically - * dependents on IThemeManager - */ - public autoTheme() { - const theme = container.get(IThemeManager); - this.config.editorOptions.theme = theme.isLight(theme.theme) - ? 
'vs' - : 'vs-dark'; - return this; - } - - /** - * the Editor uri, make sure uri is unique - * if you wants multiple editor instance - * @param uri - */ - public uri(uri: string) { - this.config.uri = uri; - return this; - } - - /** - * the sql content - * @param content - */ - public content(content: string) { - this.config.content = content; - return this; - } - - /** - * generate LSP Url automatically - * dependents on window.location.host - */ - public autoLSPUrl() { - const regions = [ - 'cn-beijing', - 'cn-shanghai', - 'cn-hangzhou', - 'cn-shenzhen', - 'ap-southeast' - ]; - const host = window.location.host; - - for (let i = 0; i < regions.length; i = i + 1) { - if (host.includes(regions[i])) { - this.config.lspOptions.wsUrl = `wss://lsp-${regions[i]}.data.aliyun.com/lsp`; - return this; - } - } - - - return this; - } - - /** - * build configs - */ - public build() { - return { - options: this.config - }; - } -} diff --git a/odps/lab_extension/src/Editor/ConfigBuilder/MonacoEditorConfigBuilder.ts b/odps/lab_extension/src/Editor/ConfigBuilder/MonacoEditorConfigBuilder.ts deleted file mode 100644 index 75c596d4..00000000 --- a/odps/lab_extension/src/Editor/ConfigBuilder/MonacoEditorConfigBuilder.ts +++ /dev/null @@ -1,67 +0,0 @@ -import { container } from '../../Container'; -import { IThemeManager } from '@jupyterlab/apputils'; -import { IEditorConfigBuilder } from './EditorConfigBuilder'; -import { MonacoEditorProps } from 'react-monaco-editor/lib/types'; - -/** - * @deprecated use LSPEditorConfigBuilder first - */ -export class MonacoEditorConfigBuilder implements IEditorConfigBuilder { - private config: MonacoEditorProps = { - language: 'sql', - options: { - theme: 'vs-dark', - readOnly: false, - minimap: { enabled: true }, - fontSize: 14, - wordWrap: 'wordWrapColumn', - wordWrapColumn: 80 - }, - value: '' - }; - - /** - * set Theme automatically - * dependents on IThemeManager - */ - public autoTheme() { - const theme = container.get(IThemeManager); - this.config.options.theme = theme.isLight(theme.theme) ? 'vs' : 'vs-dark'; - return this; - } - - /** - * the Editor uri, make sure uri is unique - * if you wants multiple editor instance - * @param uri - */ - public uri(uri: string) { - // empty implement - return this; - } - - /** - * the sql content - * @param content - */ - public content(content: string) { - this.config.value = content; - return this; - } - - /** - * generate LSP Url automatically - * dependents on window.location.host - */ - public autoLSPUrl() { - // empty implement - return this; - } - - /** - * build configs - */ - public build() { - return this.config; - } -} diff --git a/odps/lab_extension/src/Editor/SqlEditor.tsx b/odps/lab_extension/src/Editor/SqlEditor.tsx deleted file mode 100644 index 37764cd2..00000000 --- a/odps/lab_extension/src/Editor/SqlEditor.tsx +++ /dev/null @@ -1,133 +0,0 @@ -import { CodeCell } from '@jupyterlab/cells'; -import React from 'react'; -import ReactDOM from 'react-dom'; -import { INotebookTracker } from '@jupyterlab/notebook'; -import { Signal } from '@lumino/signaling'; -import { debounce } from 'lodash'; -import { - ODPS_CONFIGURE_PYTHON_CODE, - ODPS_EXECUTE_PYTHON_CODE -} from './Template'; -import Axios from 'axios'; -import { LSPEditorConfigBuilder } from './ConfigBuilder/LSPEditorConfigBuilder'; -import { isInner } from '../Utils/isInner'; - -/** - * 1. Insert code cell for ak and odps configuration - * 2. 
Mount SQL Editor on current cell - * bind sql change to code cell model value change - * @param tracker - */ -export const onSqlCellTypeSelected = (tracker: INotebookTracker) => { - Private.insertOdpsConfigureCell(tracker); - const cell = tracker.activeCell as CodeCell; - Private.mountSqlEditor(cell); -}; - -/** - * Traversal all cells and reverse state for sql cell - * @param tracker - */ -export const reverseAllEditor = (tracker: INotebookTracker) => { - const notebook = tracker.currentWidget; - const model = notebook.model; - - for (let i = 0; i <= model.cells.length; i++) { - const cellModel = model.cells.get(i); - if (!cellModel) { - break; - } - if (cellModel.metadata.get('odps_sql_cell')) { - const find = notebook.content.widgets.find( - item => item.model === cellModel - ) as CodeCell; - Private.mountSqlEditor(find); - } - } -}; - -namespace Private { - /** - * judge if there is any configure cell already exist - * if not, this function will - * insert an OdpsConfigure python code before SQL Editor - * @param tracker - */ - export const insertOdpsConfigureCell = (tracker: INotebookTracker) => { - const notebook = tracker.currentWidget!; - const model = notebook.model; - - for (let i = 0; i <= model.cells.length; i++) { - const cellModel = model.cells.get(i); - if (!cellModel) { - break; - } - if (cellModel.metadata.get('odps_configure')) { - return; - } - } - - const factory = model.contentFactory; - const codeCellModel = factory.createCodeCell({}); - codeCellModel.metadata.set('odps_configure', true); - - if (isInner()) { - } else { - codeCellModel.value.text = ODPS_CONFIGURE_PYTHON_CODE(); - model.cells.insert(notebook.content.activeCellIndex, codeCellModel); - } - }; - - /** - * insert SQL Editor and mount - * @param cell - */ - export const mountSqlEditor = (cell: CodeCell) => { - const codeCellModel = cell.model; - - codeCellModel.mimeType = 'ipython/sql'; - cell.editor.host.style.height = '400px'; - cell.editor.host.oncontextmenu = e => { - e.stopPropagation(); - }; - - const defaultSql: string = - (codeCellModel.metadata.get('sql_value') as string | undefined) || ''; - codeCellModel.value.text = ODPS_EXECUTE_PYTHON_CODE(defaultSql); - codeCellModel.metadata.set('odps_sql_cell', true); - - const builder = new LSPEditorConfigBuilder(); - - window.getLspEditor.then(() => { - const reactEle = React.createElement(window.LSPEditor, { - ...builder - .autoTheme() - .autoLSPUrl() - .content(defaultSql) - .uri(codeCellModel.id) - .build(), - // @ts-ignore - onChange: (content: string) => { - signal.emit({ - content - }); - } - }); - - const signal = new Signal< - typeof reactEle, - { - content: string; - } - >(reactEle); - signal.connect( - debounce((sender, args) => { - codeCellModel.value.text = ODPS_EXECUTE_PYTHON_CODE(args.content); - codeCellModel.metadata.set('sql_value', args.content); - }, 100) - ); - - ReactDOM.render(reactEle, cell.editor.host); - }); - }; -} diff --git a/odps/lab_extension/src/Editor/Template.ts b/odps/lab_extension/src/Editor/Template.ts deleted file mode 100644 index 665157e6..00000000 --- a/odps/lab_extension/src/Editor/Template.ts +++ /dev/null @@ -1,95 +0,0 @@ -import { isInner } from '../Utils/isInner'; - -export const ODPS_CONFIGURE_PYTHON_CODE = ({ - accessId = '', - accessSecret = '', - project = '', - endpoint = '' -} = {}) => { - const innerEnv = isInner(); - - const akDocument = innerEnv - ? 'Intranet environment detected, ak default set to inner d2 ak' - : 'https://c.tb.cn/F3.ZGP28B'; - const projectDocument = innerEnv - ? 
'https://c.tb.cn/F3.ZGSHRA' - : 'https://c.tb.cn/F3.ZGObwD'; - const endpointDocument = innerEnv - ? 'Intranet environment detected, endpoint default set to inner odps endpoint' - : 'endpoint document: https://c.tb.cn/F3.ZG7jad'; - - return `from odps import ODPS -%load_ext odps - -o = ODPS( - access_id='${accessId}', # ak document: ${akDocument} - secret_access_key='${accessSecret}', - project='${project}', # project document: ${projectDocument} - endpoint='${endpoint}' # ${endpointDocument} -)`; -}; - -export const ODPS_EXECUTE_PYTHON_CODE = (sql: string) => { - return `def execute(sql): - from tqdm.notebook import tqdm - from time import sleep - from IPython.display import display, HTML, clear_output - import pandas as pd - - global _sql_execute_result, o - - if "o" not in globals(): - print("Please run odps configuration cell first") - return - - if "_sql_execute_result" not in globals(): - _sql_execute_result = {} - - bar = tqdm(total=1, desc='Preparing sql query') - progress = None - - instance = o.run_sql(sql) - - bar.update(1) - - display( - HTML( - f'
<a href="{instance.get_logview_address()}" target="_blank">Open LogView</a> to checkout details
' - ) - ) - - finished_last_loop = 0 - - while not instance.is_terminated(): - task_progress = instance.get_task_progress(instance.get_task_names()) - stages = task_progress.stages - finished = sum(map(lambda x: x.terminated_workers, stages)) - total = sum(map(lambda x: x.total_workers, stages)) - if progress: - if len(stages) == 0: - progress.update(total) - else: - progress.update(finished - finished_last_loop) - finished_last_loop = finished - elif not progress and len(stages) == 0: - continue - else: - progress = tqdm(total=total, desc='executing sql query') - progress.update(finished - finished_last_loop) - finished_last_loop = finished - sleep(1) - print('The data is being formatted. If the amount of data is large, it will take a while') - df = instance.open_reader().to_pandas() - - result_key = len(_sql_execute_result.keys()) - _sql_execute_result[result_key] = df - - pd.options.display.html.table_schema = True - pd.options.display.max_rows = None - clear_output() - print("you can find execute result in global variable: _sql_execute_result[{}]".format(result_key)) - - return df - -execute('''${sql}''')`; -}; diff --git a/odps/lab_extension/src/Register/RegisterDataVisualization.ts b/odps/lab_extension/src/Register/RegisterDataVisualization.ts deleted file mode 100644 index fbbf7260..00000000 --- a/odps/lab_extension/src/Register/RegisterDataVisualization.ts +++ /dev/null @@ -1,18 +0,0 @@ -import { IRenderMimeRegistry } from '@jupyterlab/rendermime'; -import { IRenderMime } from '@jupyterlab/rendermime-interfaces'; -import { RenderTableSchema } from '../DataVisualization/RenderTableSchema'; - -export const registerDataVisualization = ( - renderMimeRegistry: IRenderMimeRegistry -) => { - renderMimeRegistry.addFactory({ - safe: true, - defaultRank: 0, - mimeTypes: ['application/vnd.dataresource+json'], - createRenderer( - options: IRenderMime.IRendererOptions - ): IRenderMime.IRenderer { - return new RenderTableSchema(options); - } - }); -}; diff --git a/odps/lab_extension/src/Register/RegisterSelectCellType.ts b/odps/lab_extension/src/Register/RegisterSelectCellType.ts deleted file mode 100644 index 24b63955..00000000 --- a/odps/lab_extension/src/Register/RegisterSelectCellType.ts +++ /dev/null @@ -1,27 +0,0 @@ -import { INotebookTracker } from '@jupyterlab/notebook'; -import { onSqlCellTypeSelected } from '../Editor/SqlEditor'; - -export const CELL_VALUE = 'code'; -export const CELL_NAME = 'ODPS SQL'; - -export const registerSelectCellType = (tracker: INotebookTracker) => { - tracker.currentChanged.connect((_, notebook) => { - const selector = notebook.node.querySelector( - ':scope .jp-Notebook-toolbarCellTypeDropdown select' - ); - if (selector.querySelector(':scope option[odps=true]')) { - return; - } - const ele = document.createElement('option'); - ele.value = CELL_VALUE; - ele.text = CELL_NAME; - ele.setAttribute('odps', 'true'); - selector?.appendChild(ele); - selector.addEventListener('change', evt => { - const target = evt.target as HTMLSelectElement; - if (target.value === CELL_VALUE) { - onSqlCellTypeSelected(tracker); - } - }); - }); -}; diff --git a/odps/lab_extension/src/Register/RegisterSqlEditorReverser.ts b/odps/lab_extension/src/Register/RegisterSqlEditorReverser.ts deleted file mode 100644 index b3b55b84..00000000 --- a/odps/lab_extension/src/Register/RegisterSqlEditorReverser.ts +++ /dev/null @@ -1,31 +0,0 @@ -import { INotebookModel, INotebookTracker } from '@jupyterlab/notebook'; -import { reverseAllEditor } from '../Editor/SqlEditor'; -import { IChangedArgs } 
from '@jupyterlab/coreutils'; - -export const registerSqlEditorReverser = (tracker: INotebookTracker) => { - const slot = (sender: INotebookModel, args: IChangedArgs) => { - if (args.name === 'dirty' && !args.newValue) { - reverseAllEditor(tracker); - } - }; - if (tracker.currentWidget?.isAttached) { - slot(tracker.currentWidget.model, { - name: 'dirty', - newValue: false, - oldValue: false - }); - } - tracker.currentChanged.connect(() => { - if (tracker.currentWidget.model.dirty) { - // fixed when page already mount, but not ready - tracker.currentWidget.model.stateChanged.disconnect(slot); - tracker.currentWidget.model.stateChanged.connect(slot); - } else { - slot(tracker.currentWidget.model, { - name: 'dirty', - newValue: false, - oldValue: false - }); - } - }); -}; diff --git a/odps/lab_extension/src/Utils/injectCDN.ts b/odps/lab_extension/src/Utils/injectCDN.ts deleted file mode 100644 index 753386ad..00000000 --- a/odps/lab_extension/src/Utils/injectCDN.ts +++ /dev/null @@ -1,18 +0,0 @@ -import async from 'async'; - -export const injectCDN = () => { - const url = [ - '//g.alicdn.com/code/lib/react/16.6.1/umd/react.production.min.js', - '//g.alicdn.com/code/lib/react-dom/16.6.1/umd/react-dom.production.min.js', - '//alifd.alicdn.com/npm/@alifd/next/1.11.6/next.min.js', - '//f.alicdn.com/lodash.js/4.17.4/lodash.min.js', - '//g.alicdn.com/LSP/LSP-Editor/0.4.15/index.js' - ]; - async.eachOfSeries(url, (item, key, callback) => { - const ele = document.createElement('script'); - ele.src = item; - ele.type = 'text/javascript'; - ele.onload = () => callback(); - document.head.appendChild(ele); - }); -}; diff --git a/odps/lab_extension/src/Utils/isInner.ts b/odps/lab_extension/src/Utils/isInner.ts deleted file mode 100644 index ef6baede..00000000 --- a/odps/lab_extension/src/Utils/isInner.ts +++ /dev/null @@ -1,3 +0,0 @@ -export const isInner = () => { - return false; -}; diff --git a/odps/lab_extension/src/global.d.ts b/odps/lab_extension/src/global.d.ts deleted file mode 100644 index edf5c9b1..00000000 --- a/odps/lab_extension/src/global.d.ts +++ /dev/null @@ -1,4 +0,0 @@ -interface Window { - getLspEditor: Promise; - LSPEditor: any; -} \ No newline at end of file diff --git a/odps/lab_extension/src/index.ts b/odps/lab_extension/src/index.ts deleted file mode 100644 index aa2a41a0..00000000 --- a/odps/lab_extension/src/index.ts +++ /dev/null @@ -1,38 +0,0 @@ -import { - JupyterFrontEnd, - JupyterFrontEndPlugin -} from '@jupyterlab/application'; -import { INotebookTracker } from '@jupyterlab/notebook'; -import { registerSelectCellType } from './Register/RegisterSelectCellType'; -import { registerSqlEditorReverser } from './Register/RegisterSqlEditorReverser'; -import { container } from './Container'; -import { registerDataVisualization } from './Register/RegisterDataVisualization'; -import { injectCDN } from './Utils/injectCDN'; -import { IRenderMimeRegistry } from '@jupyterlab/rendermime'; -import { IThemeManager } from '@jupyterlab/apputils'; - -/** - * Initialization data for the pyodps-lab-extension extension. 
- */ -const extension: JupyterFrontEndPlugin = { - id: 'pyodps-lab-extension:plugin', - autoStart: true, - requires: [INotebookTracker, IThemeManager, IRenderMimeRegistry], - activate: ( - app: JupyterFrontEnd, - tracker: INotebookTracker, - themeManager: IThemeManager, - renderMimeRegistry: IRenderMimeRegistry - ) => { - container.set(INotebookTracker, tracker); - container.set(IThemeManager, themeManager); - container.set(IRenderMimeRegistry, renderMimeRegistry); - injectCDN(); - registerDataVisualization(renderMimeRegistry); - registerSelectCellType(tracker); - registerSqlEditorReverser(tracker); - console.log('JupyterLab extension pyodps-lab-extension is activated!'); - } -}; - -export default extension; diff --git a/odps/lab_extension/style/base.css b/odps/lab_extension/style/base.css deleted file mode 100644 index e69de29b..00000000 diff --git a/odps/lab_extension/style/index.css b/odps/lab_extension/style/index.css deleted file mode 100644 index 8a7ea29e..00000000 --- a/odps/lab_extension/style/index.css +++ /dev/null @@ -1 +0,0 @@ -@import url('base.css'); diff --git a/odps/lab_extension/style/index.js b/odps/lab_extension/style/index.js deleted file mode 100644 index a028a764..00000000 --- a/odps/lab_extension/style/index.js +++ /dev/null @@ -1 +0,0 @@ -import './base.css'; diff --git a/odps/lab_extension/tsconfig.json b/odps/lab_extension/tsconfig.json deleted file mode 100644 index 8cde80eb..00000000 --- a/odps/lab_extension/tsconfig.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "compilerOptions": { - "allowSyntheticDefaultImports": true, - "composite": true, - "declaration": true, - "esModuleInterop": true, - "incremental": true, - "jsx": "react", - "module": "esnext", - "moduleResolution": "node", - "noEmitOnError": true, - "noImplicitAny": true, - "noUnusedLocals": true, - "preserveWatchOutput": true, - "resolveJsonModule": true, - "outDir": "lib", - "rootDir": "src", - "strict": true, - "strictNullChecks": false, - "target": "es2017", - "types": [] - }, - "include": ["src/**/*"] -} diff --git a/odps/lib/tblib/__init__.py b/odps/lib/tblib/__init__.py index 90e1a595..5c068d96 100644 --- a/odps/lib/tblib/__init__.py +++ b/odps/lib/tblib/__init__.py @@ -1,8 +1,8 @@ import re import sys -from types import CodeType -from types import FrameType -from types import TracebackType +from types import CodeType, FrameType, TracebackType + +from ..six import raise_from try: from __pypy__ import tproxy @@ -16,11 +16,13 @@ if not tb_set_next and not tproxy: raise ImportError("Cannot use tblib. Runtime not supported.") -__version__ = '1.7.0' -__all__ = 'Traceback', 'TracebackParseError', 'Frame', 'Code' +__version__ = "3.0.0" +__all__ = "Traceback", "TracebackParseError", "Frame", "Code" PY3 = sys.version_info[0] == 3 -FRAME_RE = re.compile(r'^\s*File "(?P.+)", line (?P\d+)(, in (?P.+))?$') +FRAME_RE = re.compile( + r'^\s*File "(?P.+)", line (?P\d+)(, in (?P.+))?$' +) class _AttrDict(dict): @@ -30,7 +32,7 @@ def __getattr__(self, name): try: return self[name] except KeyError: - raise AttributeError(name) + raise_from(AttributeError(name), None) # noinspection PyPep8Naming @@ -46,6 +48,7 @@ class Code(object): """ Class that replicates just enough of the builtin Code object to enable serialization and traceback rendering. """ + co_code = None def __init__(self, code): @@ -59,28 +62,34 @@ def __init__(self, code): self.co_flags = 64 self.co_firstlineno = 0 - # noinspection SpellCheckingInspection - def __tproxy__(self, operation, *args, **kwargs): - """ - Necessary for PyPy's tproxy. 
- """ - if operation in ('__getattribute__', '__getattr__'): - return getattr(self, args[0]) - else: - return getattr(self, operation)(*args, **kwargs) + if not PY3: + # noinspection SpellCheckingInspection + def __tproxy__(self, operation, *args, **kwargs): + """ + Necessary for PyPy's tproxy. + """ + if operation in ("__getattribute__", "__getattr__"): + return getattr(self, args[0]) + else: + return getattr(self, operation)(*args, **kwargs) class Frame(object): """ Class that replicates just enough of the builtin Frame object to enable serialization and traceback rendering. + + Args: + + get_locals (callable): A function that take a frame argument and returns a dict. + + See :class:`Traceback` class for example. """ - def __init__(self, frame): - self.f_locals = {} - self.f_globals = dict( - (k, v) - for k, v in frame.f_globals.items() - if k in ("__file__", "__name__") - ) + + def __init__(self, frame, get_locals=None): + self.f_locals = {} if get_locals is None else get_locals(frame) + self.f_globals = { + k: v for k, v in frame.f_globals.items() if k in ("__file__", "__name__") + } self.f_code = Code(frame.f_code) self.f_lineno = frame.f_lineno @@ -92,28 +101,45 @@ def clear(self): in turn is called by unittest.TestCase.assertRaises """ - # noinspection SpellCheckingInspection - def __tproxy__(self, operation, *args, **kwargs): - """ - Necessary for PyPy's tproxy. - """ - if operation in ('__getattribute__', '__getattr__'): - if args[0] == 'f_code': - return tproxy(CodeType, self.f_code.__tproxy__) + if not PY3: + # noinspection SpellCheckingInspection + def __tproxy__(self, operation, *args, **kwargs): + """ + Necessary for PyPy's tproxy. + """ + if operation in ("__getattribute__", "__getattr__"): + if args[0] == "f_code": + return tproxy(CodeType, self.f_code.__tproxy__) + else: + return getattr(self, args[0]) else: - return getattr(self, args[0]) - else: - return getattr(self, operation)(*args, **kwargs) + return getattr(self, operation)(*args, **kwargs) class Traceback(object): """ Class that wraps builtin Traceback objects. + + Args: + get_locals (callable): A function that take a frame argument and returns a dict. + + Ideally you will only return exactly what you need, and only with simple types that can be json serializable. + + Example: + + .. code:: python + + def get_locals(frame): + if frame.f_locals.get("__tracebackhide__"): + return {"__tracebackhide__": True} + else: + return {} """ + tb_next = None - def __init__(self, tb): - self.tb_frame = Frame(tb.tb_frame) + def __init__(self, tb, get_locals=None): + self.tb_frame = Frame(tb.tb_frame, get_locals=get_locals) # noinspection SpellCheckingInspection self.tb_lineno = int(tb.tb_lineno) @@ -123,7 +149,7 @@ def __init__(self, tb): cls = type(self) while tb is not None: traceback = object.__new__(cls) - traceback.tb_frame = Frame(tb.tb_frame) + traceback.tb_frame = Frame(tb.tb_frame, get_locals=get_locals) traceback.tb_lineno = int(tb.tb_lineno) prev_traceback.tb_next = traceback prev_traceback = traceback @@ -133,42 +159,74 @@ def as_traceback(self): """ Convert to a builtin Traceback object that is usable for raising or rendering a stacktrace. 
""" - if tproxy: - return tproxy(TracebackType, self.__tproxy__) - if not tb_set_next: - raise RuntimeError("Unsupported Python interpreter!") + if not PY3: + if tproxy: + return tproxy(TracebackType, self.__tproxy__) + if not tb_set_next: + raise RuntimeError("Unsupported Python interpreter!") current = self top_tb = None tb = None while current: f_code = current.tb_frame.f_code - code = compile('\n' * (current.tb_lineno - 1) + 'raise __traceback_maker', current.tb_frame.f_code.co_filename, 'exec') + code = compile( + "\n" * (current.tb_lineno - 1) + "raise __traceback_maker", + current.tb_frame.f_code.co_filename, + "exec", + ) if hasattr(code, "replace"): # Python 3.8 and newer - code = code.replace(co_argcount=0, - co_filename=f_code.co_filename, co_name=f_code.co_name, - co_freevars=(), co_cellvars=()) + code = code.replace( + co_argcount=0, + co_filename=f_code.co_filename, + co_name=f_code.co_name, + co_freevars=(), + co_cellvars=(), + ) elif PY3: code = CodeType( - 0, code.co_kwonlyargcount, - code.co_nlocals, code.co_stacksize, code.co_flags, - code.co_code, code.co_consts, code.co_names, code.co_varnames, - f_code.co_filename, f_code.co_name, - code.co_firstlineno, code.co_lnotab, (), () + 0, + code.co_kwonlyargcount, + code.co_nlocals, + code.co_stacksize, + code.co_flags, + code.co_code, + code.co_consts, + code.co_names, + code.co_varnames, + f_code.co_filename, + f_code.co_name, + code.co_firstlineno, + code.co_lnotab, + (), + (), ) else: code = CodeType( 0, - code.co_nlocals, code.co_stacksize, code.co_flags, - code.co_code, code.co_consts, code.co_names, code.co_varnames, - f_code.co_filename.encode(), f_code.co_name.encode(), - code.co_firstlineno, code.co_lnotab, (), () + code.co_nlocals, + code.co_stacksize, + code.co_flags, + code.co_code, + code.co_consts, + code.co_names, + code.co_varnames, + f_code.co_filename.encode(), + f_code.co_name.encode(), + code.co_firstlineno, + code.co_lnotab, + (), + (), ) # noinspection PyBroadException try: - exec(code, dict(current.tb_frame.f_globals), {}) + exec( + code, + dict(current.tb_frame.f_globals), + dict(current.tb_frame.f_locals), + ) # noqa: S102 except Exception: next_tb = sys.exc_info()[2].tb_next if top_tb is None: @@ -184,22 +242,24 @@ def as_traceback(self): finally: del top_tb del tb + to_traceback = as_traceback - # noinspection SpellCheckingInspection - def __tproxy__(self, operation, *args, **kwargs): - """ - Necessary for PyPy's tproxy. - """ - if operation in ('__getattribute__', '__getattr__'): - if args[0] == 'tb_next': - return self.tb_next and self.tb_next.as_traceback() - elif args[0] == 'tb_frame': - return tproxy(FrameType, self.tb_frame.__tproxy__) + if not PY3: + # noinspection SpellCheckingInspection + def __tproxy__(self, operation, *args, **kwargs): + """ + Necessary for PyPy's tproxy. 
+ """ + if operation in ("__getattribute__", "__getattr__"): + if args[0] == "tb_next": + return self.tb_next and self.tb_next.as_traceback() + elif args[0] == "tb_frame": + return tproxy(FrameType, self.tb_frame.__tproxy__) + else: + return getattr(self, args[0]) else: - return getattr(self, args[0]) - else: - return getattr(self, operation)(*args, **kwargs) + return getattr(self, operation)(*args, **kwargs) def as_dict(self): """ @@ -209,22 +269,27 @@ def as_dict(self): if self.tb_next is None: tb_next = None else: - tb_next = self.tb_next.to_dict() + if hasattr(self.tb_next, "to_dict"): + tb_next = self.tb_next.to_dict() + else: + tb_next = self.tb_next.as_dict() code = { - 'co_filename': self.tb_frame.f_code.co_filename, - 'co_name': self.tb_frame.f_code.co_name, + "co_filename": self.tb_frame.f_code.co_filename, + "co_name": self.tb_frame.f_code.co_name, } frame = { - 'f_globals': self.tb_frame.f_globals, - 'f_code': code, - 'f_lineno': self.tb_frame.f_lineno, + "f_globals": self.tb_frame.f_globals, + "f_locals": self.tb_frame.f_locals, + "f_code": code, + "f_lineno": self.tb_frame.f_lineno, } return { - 'tb_frame': frame, - 'tb_lineno': self.tb_lineno, - 'tb_next': tb_next, + "tb_frame": frame, + "tb_lineno": self.tb_lineno, + "tb_next": tb_next, } + to_dict = as_dict @classmethod @@ -232,26 +297,27 @@ def from_dict(cls, dct): """ Creates an instance from a dictionary with the same structure as ``.as_dict()`` returns. """ - if dct['tb_next']: - tb_next = cls.from_dict(dct['tb_next']) + if dct["tb_next"]: + tb_next = cls.from_dict(dct["tb_next"]) else: tb_next = None code = _AttrDict( - co_filename=dct['tb_frame']['f_code']['co_filename'], - co_name=dct['tb_frame']['f_code']['co_name'], + co_filename=dct["tb_frame"]["f_code"]["co_filename"], + co_name=dct["tb_frame"]["f_code"]["co_name"], ) frame = _AttrDict( - f_globals=dct['tb_frame']['f_globals'], + f_globals=dct["tb_frame"]["f_globals"], + f_locals=dct["tb_frame"].get("f_locals", {}), f_code=code, - f_lineno=dct['tb_frame']['f_lineno'], + f_lineno=dct["tb_frame"]["f_lineno"], ) tb = _AttrDict( tb_frame=frame, - tb_lineno=dct['tb_lineno'], + tb_lineno=dct["tb_lineno"], tb_next=tb_next, ) - return cls(tb) + return cls(tb, get_locals=get_all_locals) @classmethod def from_string(cls, string, strict=True): @@ -265,13 +331,13 @@ def from_string(cls, string, strict=True): for line in string.splitlines(): line = line.rstrip() if header: - if line == 'Traceback (most recent call last):': + if line == "Traceback (most recent call last):": header = False continue frame_match = FRAME_RE.match(line) if frame_match: frames.append(frame_match.groupdict()) - elif line.startswith(' '): + elif line.startswith(" "): pass elif strict: break # traceback ended @@ -284,14 +350,19 @@ def from_string(cls, string, strict=True): tb_frame=_AttrDict( frame, f_globals=_AttrDict( - __file__=frame['co_filename'], - __name__='?', + __file__=frame["co_filename"], + __name__="?", ), + f_locals={}, f_code=_AttrDict(frame), - f_lineno=int(frame['tb_lineno']), + f_lineno=int(frame["tb_lineno"]), ), tb_next=previous, ) return cls(previous) else: raise TracebackParseError("Could not find any frames in %r." 
% string) + + +def get_all_locals(frame): + return dict(frame.f_locals) diff --git a/odps/lib/tblib/cpython.py b/odps/lib/tblib/cpython.py index 06d89836..783e229e 100644 --- a/odps/lib/tblib/cpython.py +++ b/odps/lib/tblib/cpython.py @@ -76,7 +76,9 @@ def tb_set_next(tb, next): tb_set_next = None try: - if platform.python_implementation() == "CPython": + if sys.version_info[0] >= 3: + tb_set_next = lambda tb, nxt: setattr(tb, "tb_next", nxt) + elif platform.python_implementation() == "CPython": tb_set_next = _init_ugly_crap() except Exception as exc: sys.stderr.write("Failed to initialize cpython support: {!r}".format(exc)) diff --git a/odps/lib/tblib/decorators.py b/odps/lib/tblib/decorators.py index 93bb86d3..3aa5bf17 100644 --- a/odps/lib/tblib/decorators.py +++ b/odps/lib/tblib/decorators.py @@ -1,6 +1,7 @@ import sys from functools import wraps +from ..six import PY3, raise_from, reraise from . import Traceback @@ -15,7 +16,10 @@ def traceback(self): return self.__traceback.as_traceback() def reraise(self): - raise self.exc_value.with_traceback(self.traceback) from None + if PY3: + raise_from(self.exc_value.with_traceback(self.traceback), None) + else: + reraise(self.exc_type, self.exc_value, self.traceback) def return_error(func, exc_type=Exception): @@ -29,14 +33,19 @@ def return_exceptions_wrapper(*args, **kwargs): return return_exceptions_wrapper -returns_error = return_errors = returns_errors = return_error # cause I make too many typos +returns_error = ( + return_errors +) = returns_errors = return_error # cause I make too many typos @return_error def apply_with_return_error(args): """ args is a tuple where the first argument is a callable. + eg:: + apply_with_return_error((func, 1, 2, 3)) - this will call func(1, 2, 3) + """ return args[0](*args[1:]) diff --git a/odps/lib/tblib/pickling_support.py b/odps/lib/tblib/pickling_support.py index 3185fe13..92222f07 100644 --- a/odps/lib/tblib/pickling_support.py +++ b/odps/lib/tblib/pickling_support.py @@ -1,14 +1,18 @@ import sys +from functools import partial from types import TracebackType -from . import Frame -from . import Traceback +from . import Frame, Traceback if sys.version_info[0] >= 3: import copyreg + + PY3 = True else: import copy_reg as copyreg + PY3 = False + def unpickle_traceback(tb_frame, tb_lineno, tb_next): ret = object.__new__(Traceback) @@ -18,14 +22,27 @@ def unpickle_traceback(tb_frame, tb_lineno, tb_next): return ret.as_traceback() -def pickle_traceback(tb): - return unpickle_traceback, (Frame(tb.tb_frame), tb.tb_lineno, tb.tb_next and Traceback(tb.tb_next)) +def pickle_traceback(tb, get_locals=None): + return unpickle_traceback, ( + Frame(tb.tb_frame, get_locals=get_locals), + tb.tb_lineno, + tb.tb_next and Traceback(tb.tb_next, get_locals=get_locals), + ) -def unpickle_exception(func, args, cause, tb): +# Note: Older versions of tblib will generate pickle archives that call unpickle_exception() with +# fewer arguments. We assign default values to some of the arguments to support this. 
+def unpickle_exception( + func, args, cause, tb, context=None, suppress_context=False, notes=None +): inst = func(*args) inst.__cause__ = cause inst.__traceback__ = tb + if PY3: + inst.__context__ = context + inst.__suppress_context__ = suppress_context + if notes is not None: + inst.__notes__ = notes return inst @@ -40,9 +57,21 @@ def pickle_exception(obj): rv = obj.__reduce_ex__(3) if isinstance(rv, str): raise TypeError("str __reduce__ output is not supported") - assert isinstance(rv, tuple) and len(rv) >= 2 - - return (unpickle_exception, rv[:2] + (obj.__cause__, obj.__traceback__)) + rv[2:] + assert isinstance(rv, tuple) + assert len(rv) >= 2 + + return ( + unpickle_exception, + rv[:2] + + ( + obj.__cause__, + obj.__traceback__, + getattr(obj, "__context__", None), + getattr(obj, "__suppress_context__", None), + # __notes__ doesn't exist prior to Python 3.11; and even on Python 3.11 it may be absent + getattr(obj, "__notes__", None), + ), + ) + rv[2:] def _get_subclasses(cls): @@ -54,8 +83,17 @@ def _get_subclasses(cls): to_visit += list(this.__subclasses__()) -def install(*exc_classes_or_instances): - copyreg.pickle(TracebackType, pickle_traceback) +def install(*exc_classes_or_instances, **kwargs): + """ + Args: + + get_locals (callable): A function that take a frame argument and returns a dict. See :class:`tblib.Traceback` class for example. + """ + get_locals = kwargs.pop("get_locals", None) + if kwargs: + raise TypeError("Keyword arguments %s not supported" % list(kwargs.keys())) + + copyreg.pickle(TracebackType, partial(pickle_traceback, get_locals=get_locals)) if sys.version_info[0] < 3: # Dummy decorator? @@ -72,9 +110,7 @@ def install(*exc_classes_or_instances): for exc in exc_classes_or_instances: if isinstance(exc, BaseException): - while exc is not None: - copyreg.pickle(type(exc), pickle_exception) - exc = exc.__cause__ + _install_for_instance(exc, set()) elif isinstance(exc, type) and issubclass(exc, BaseException): copyreg.pickle(exc, pickle_exception) # Allow using @install as a decorator for Exception classes @@ -85,3 +121,28 @@ def install(*exc_classes_or_instances): "Expected subclasses or instances of BaseException, got %s" % (type(exc)) ) + + +def _install_for_instance(exc, seen): + assert isinstance(exc, BaseException) + + # Prevent infinite recursion if we somehow get a self-referential exception. (Self-referential + # exceptions should never normally happen, but if it did somehow happen, we want to pickle the + # exception faithfully so the developer can troubleshoot why it happened.) + if id(exc) in seen: + return + seen.add(id(exc)) + + copyreg.pickle(type(exc), pickle_exception) + + if exc.__cause__ is not None: + _install_for_instance(exc.__cause__, seen) + if exc.__context__ is not None: + _install_for_instance(exc.__context__, seen) + + # This case is meant to cover BaseExceptionGroup on Python 3.11 as well as backports like the + # exceptiongroup module + if hasattr(exc, "exceptions") and isinstance(exc.exceptions, (tuple, list)): + for subexc in exc.exceptions: + if isinstance(subexc, BaseException): + _install_for_instance(subexc, seen) diff --git a/odps/models/__init__.py b/odps/models/__init__.py index 143d5ecb..8203bdcf 100644 --- a/odps/models/__init__.py +++ b/odps/models/__init__.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,35 +14,39 @@ # See the License for the specific language governing permissions and # limitations under the License. +import sys +import warnings + from .core import RestModel -from .projects import Projects -from .project import Project -from .tables import Tables -from .table import Table, TableSchema -from .instances import Instances -from .instance import Instance -from .functions import Functions from .function import Function -from .resources import Resources +from .functions import Functions +from .instance import Instance +from .instances import Instances +from .project import Project +from .projects import Projects +from .quota import Quota +from .quotas import Quotas +from .record import Record from .resource import * -from .session import InSessionInstance, SessionInstance +from .resources import Resources +from .session import InSessionInstance, SessionInstance, SessionMethods +from .table import Table, TableSchema +from .tableio import TableIOMethods +from .tables import Tables +from .tasks import * from .tenant import Tenant -from .volumes import * -from .volume_parted import PartedVolume, VolumePartition -from .volume_fs import FSVolume, FSVolumeDir, FSVolumeFile from .volume_ext import ExternalVolume, ExternalVolumeDir, ExternalVolumeFile -from .xflows import XFlows -from .xflow import XFlow -from .tasks import * -from .record import Record +from .volume_fs import FSVolume, FSVolumeDir, FSVolumeFile +from .volume_parted import PartedVolume, VolumePartition +from .volumes import * from .worker import Worker - -import sys -import warnings +from .xflow import XFlow +from .xflows import XFlows if sys.version_info[:2] < (3, 7): Schema = TableSchema # Schema is to keep compatible else: + def __getattr__(name): if name != "Schema": raise AttributeError(name) @@ -58,6 +62,7 @@ def __getattr__(name): utils.add_survey_call("odps.models.Schema") return TableSchema + Column = TableSchema.TableColumn Partition = TableSchema.TablePartition diff --git a/odps/models/cache.py b/odps/models/cache.py index c49efb53..0ceaa1c9 100644 --- a/odps/models/cache.py +++ b/odps/models/cache.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,8 +16,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import weakref import inspect +import weakref from ..compat import six @@ -28,7 +28,7 @@ def __init__(self): @staticmethod def _get_cache_class(cls): - if hasattr(cls, '_cache_class'): + if hasattr(cls, "_cache_class"): cls._cache_class = ObjectCache._get_cache_class(cls._cache_class) return cls._cache_class return cls @@ -38,11 +38,11 @@ def _fetch(self, cache_key): if parent is None: return self._caches.get(cache_key) - ancestor = getattr(parent, '_parent') + ancestor = getattr(parent, "_parent") parent_cls = self._get_cache_class(type(parent)) if parent_cls is None: return None - name = getattr(parent, 'name', parent_cls.__name__.lower()) + name = getattr(parent, "name", parent_cls.__name__.lower()) parent_cache_key = client, ancestor, parent_cls, name @@ -51,9 +51,9 @@ def _fetch(self, cache_key): def _get_cache(self, cls, **kw): kwargs = dict(kw) - parent = kwargs.pop('parent', None) or kwargs.pop('_parent', None) - name = kwargs.pop(getattr(cls, '_cache_name_arg', 'name'), None) - client = kwargs.pop('client', None) or kwargs.pop('_client', None) + parent = kwargs.pop("parent", None) or kwargs.pop("_parent", None) + name = kwargs.pop(getattr(cls, "_cache_name_arg", "name"), None) + client = kwargs.pop("client", None) or kwargs.pop("_client", None) cache_cls = self._get_cache_class(cls) cache_key = client, parent, cache_cls, name @@ -79,15 +79,15 @@ def cache_lazyload(self, func, cls, **kwargs): obj = func(cls, **kwargs) - if not hasattr(cls, '_filter_cache'): + if not hasattr(cls, "_filter_cache"): self._caches[cache_key] = obj elif cls._filter_cache(func, **obj.extract(**kwargs)): self._caches[cache_key] = obj return obj def cache_container(self, func, cls, **kwargs): - parent = kwargs.get('parent') or kwargs.get('_parent') - client = kwargs.get('client') or kwargs.get('_client') + parent = kwargs.get("parent") or kwargs.get("_parent") + client = kwargs.get("client") or kwargs.get("_client") name = cls.__name__.lower() cache_key = client, parent, cls, name @@ -105,8 +105,8 @@ def del_item_cache(self, obj, item): item = obj[item] - client = getattr(item, '_client') - parent = getattr(item, '_parent') + client = getattr(item, "_client") + parent = getattr(item, "_parent") name = item._name() if name is not None: @@ -127,9 +127,9 @@ def del_item_cache(self, obj, item): def cache(func): def inner(cls, **kwargs): bases = [base.__name__ for base in inspect.getmro(cls)] - if 'LazyLoad' in bases: + if "LazyLoad" in bases: return _object_cache.cache_lazyload(func, cls, **kwargs) - elif 'Container' in bases: + elif "Container" in bases: return _object_cache.cache_container(func, cls, **kwargs) return func(cls, **kwargs) @@ -141,7 +141,7 @@ def inner(cls, **kwargs): def del_cache(func): def inner(obj, item): - if func.__name__ == '__delitem__': + if func.__name__ == "__delitem__": _object_cache.del_item_cache(obj, item) return func(obj, item) diff --git a/odps/models/cluster_info.py b/odps/models/cluster_info.py index 6e5c4476..93e8d374 100644 --- a/odps/models/cluster_info.py +++ b/odps/models/cluster_info.py @@ -27,19 +27,19 @@ class ClusterSortOrder(Enum): class ClusterSortCol(serializers.JSONSerializableModel): - name = serializers.JSONNodeField('col') + name = serializers.JSONNodeField("col") order = serializers.JSONNodeField( - 'order', parse_callback=lambda x: ClusterSortOrder(x.upper()) if x else None + "order", parse_callback=lambda x: ClusterSortOrder(x.upper()) if x else None ) class ClusterInfo(serializers.JSONSerializableModel): cluster_type = serializers.JSONNodeField( - 
'ClusterType', parse_callback=lambda x: ClusterType(x.lower()) if x else None + "ClusterType", parse_callback=lambda x: ClusterType(x.lower()) if x else None ) - bucket_num = serializers.JSONNodeField('BucketNum') - cluster_cols = serializers.JSONNodeField('ClusterCols') - sort_cols = serializers.JSONNodesReferencesField(ClusterSortCol, 'SortCols') + bucket_num = serializers.JSONNodeField("BucketNum") + cluster_cols = serializers.JSONNodeField("ClusterCols") + sort_cols = serializers.JSONNodesReferencesField(ClusterSortCol, "SortCols") @classmethod def deserial(cls, content, obj=None, **kw): diff --git a/odps/models/core.py b/odps/models/core.py index ced852a6..c8e68117 100644 --- a/odps/models/core.py +++ b/odps/models/core.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,54 +17,58 @@ import warnings from .. import options, serializers, utils -from ..compat import six, quote_plus +from ..compat import quote_plus, six from .cache import cache, del_cache class XMLRemoteModel(serializers.XMLSerializableModel): - __slots__ = '_parent', '_client', '_schema_name' + __slots__ = "_parent", "_client", "_schema_name" def __init__(self, **kwargs): - if 'parent' in kwargs: - kwargs['_parent'] = kwargs.pop('parent') - if 'client' in kwargs: - kwargs['_client'] = kwargs.pop('client') + if "parent" in kwargs: + kwargs["_parent"] = kwargs.pop("parent") + if "client" in kwargs: + kwargs["_client"] = kwargs.pop("client") self._schema_name = utils.notset if not frozenset(kwargs).issubset(self.__slots__): unexpected = sorted(set(kwargs) - set(self.__slots__)) - raise TypeError("%s() meet illegal arguments (%s)" % ( - type(self).__name__, ', '.join(unexpected))) + raise TypeError( + "%s() meet illegal arguments (%s)" + % (type(self).__name__, ", ".join(unexpected)) + ) super(XMLRemoteModel, self).__init__(**kwargs) @classmethod def parse(cls, client, response, obj=None, **kw): - kw['_client'] = client + kw["_client"] = client return super(XMLRemoteModel, cls).parse(response, obj=obj, **kw) class AbstractXMLRemoteModel(XMLRemoteModel): - __slots__ = '_type_indicator', + __slots__ = ("_type_indicator",) class JSONRemoteModel(serializers.JSONSerializableModel): - __slots__ = '_parent', '_client' + __slots__ = "_parent", "_client" def __init__(self, **kwargs): - if 'parent' in kwargs: - kwargs['_parent'] = kwargs.pop('parent') - if 'client' in kwargs: - kwargs['_client'] = kwargs.pop('client') + if "parent" in kwargs: + kwargs["_parent"] = kwargs.pop("parent") + if "client" in kwargs: + kwargs["_client"] = kwargs.pop("client") if not frozenset(kwargs).issubset(self.__slots__): unexpected = sorted(set(kwargs) - set(self.__slots__)) - raise TypeError("%s() meet illegal arguments (%s)" % ( - type(self).__name__, ', '.join(unexpected))) + raise TypeError( + "%s() meet illegal arguments (%s)" + % (type(self).__name__, ", ".join(unexpected)) + ) super(JSONRemoteModel, self).__init__(**kwargs) @classmethod def parse(cls, client, response, obj=None, **kw): - kw['_client'] = client + kw["_client"] = client return super(JSONRemoteModel, cls).parse(response, obj=obj, **kw) @@ -77,7 +81,7 @@ def _getattr(self, attr): @classmethod def _encode(cls, name): - name = quote_plus(name).replace('+', '%20') + name = quote_plus(name).replace("+", "%20") return name def 
resource(self, client=None, endpoint=None): @@ -91,7 +95,7 @@ def resource(self, client=None, endpoint=None): name = self._name() if name is None: return parent_res - return '/'.join([parent_res, self._encode(name)]) + return "/".join([parent_res, self._encode(name)]) def __eq__(self, other): if other is None: @@ -100,8 +104,7 @@ def __eq__(self, other): if not isinstance(other, type(self)): return False - return self._name() == other._name() and \ - self.parent == other.parent + return self._name() == other._name() and self.parent == other.parent def __hash__(self): return hash(type(self)) * hash(self._parent) * hash(self._name()) @@ -112,7 +115,7 @@ def _get_schema_name(self): if isinstance(self._parent, LazyLoad): schema = self._parent.get_schema() - elif isinstance(self._parent, Container): + elif isinstance(self._parent, Container) and self._parent._parent is not None: schema = self._parent._parent.get_schema() else: schema = None @@ -121,7 +124,7 @@ def _get_schema_name(self): class LazyLoad(RestModel): - __slots__ = '_loaded', + __slots__ = ("_loaded",) @cache def __new__(cls, *args, **kwargs): @@ -129,11 +132,11 @@ def __new__(cls, *args, **kwargs): def __init__(self, **kwargs): self._loaded = False - kwargs.pop('no_cache', None) + kwargs.pop("no_cache", None) super(LazyLoad, self).__init__(**kwargs) def _name(self): - return self._getattr('name') + return self._getattr("name") def __getattribute__(self, attr): if ( @@ -149,11 +152,13 @@ def __getattribute__(self, attr): category=DeprecationWarning, ) typ = type(self) - utils.add_survey_call(".".join([typ.__module__, typ.__name__, attr]) + ":legacy_parsedate") + utils.add_survey_call( + ".".join([typ.__module__, typ.__name__, attr]) + ":legacy_parsedate" + ) val = object.__getattribute__(self, attr) if val is None and not self._loaded: - fields = getattr(type(self), '__fields') + fields = getattr(type(self), "__fields") if attr in fields: self.reload() return object.__getattribute__(self, attr) @@ -181,7 +186,7 @@ def __repr__(self): def _repr(self): name = self._name() if name: - return '<%s %s>' % (type(self).__name__, name) + return "<%s %s>" % (type(self).__name__, name) else: raise ValueError @@ -246,7 +251,7 @@ def __getitem__(self, item): if not item: raise ValueError("Empty string not supported") return self._get(item) - raise ValueError('Unsupported getitem value: %s' % item) + raise ValueError("Unsupported getitem value: %s" % item) @del_cache def __delitem__(self, key): @@ -264,7 +269,7 @@ def __setstate__(self, state): class Iterable(Container): - __slots__ = '_iter', + __slots__ = ("_iter",) def __init__(self, **kwargs): super(Iterable, self).__init__(**kwargs) diff --git a/odps/models/function.py b/odps/models/function.py index ef805226..0245322f 100644 --- a/odps/models/function.py +++ b/odps/models/function.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,45 +14,46 @@ # See the License for the specific language governing permissions and # limitations under the License. +from .. import serializers, utils from .core import LazyLoad from .resource import Resource -from .. import serializers, utils class Function(LazyLoad): """ Function can be used in UDF when user writes a SQL. 
""" - __slots__ = '_resources_objects', '_owner_changed' - _root = 'Function' + __slots__ = "_resources_objects", "_owner_changed" - name = serializers.XMLNodeField('Alias') - _owner = serializers.XMLNodeField('Owner') - creation_time = serializers.XMLNodeField('CreationTime', parse_callback=utils.parse_rfc822) - class_type = serializers.XMLNodeField('ClassType') - _resources = serializers.XMLNodesField('Resources', 'ResourceName') - is_sql_function = serializers.XMLNodeField('IsSqlFunction') - is_embedded_function = serializers.XMLNodeField('IsEmbeddedFunction') - program_language = serializers.XMLNodeField('ProgramLanguage') - code = serializers.XMLNodeField('Code') - file_name = serializers.XMLNodeField('FileName') + _root = "Function" + + name = serializers.XMLNodeField("Alias") + _owner = serializers.XMLNodeField("Owner") + creation_time = serializers.XMLNodeField( + "CreationTime", parse_callback=utils.parse_rfc822 + ) + class_type = serializers.XMLNodeField("ClassType") + _resources = serializers.XMLNodesField("Resources", "ResourceName") + is_sql_function = serializers.XMLNodeField("IsSqlFunction") + is_embedded_function = serializers.XMLNodeField("IsEmbeddedFunction") + program_language = serializers.XMLNodeField("ProgramLanguage") + code = serializers.XMLNodeField("Code") + file_name = serializers.XMLNodeField("FileName") def __init__(self, **kwargs): self._resources_objects = None self._owner_changed = False - resources = kwargs.pop('resources', None) - if 'owner' in kwargs: - kwargs['_owner'] = kwargs.pop('owner') + resources = kwargs.pop("resources", None) + if "owner" in kwargs: + kwargs["_owner"] = kwargs.pop("owner") super(Function, self).__init__(**kwargs) if resources is not None: self.resources = resources def reload(self): - resp = self._client.get( - self.resource(), curr_schema=self._get_schema_name() - ) + resp = self._client.get(self.resource(), curr_schema=self._get_schema_name()) self.parse(self._client, resp, obj=self) @property @@ -113,11 +114,9 @@ def update_owner(self, new_owner): params = {} schema_name = self._get_schema_name() if schema_name: - params['curr_schema'] = schema_name + params["curr_schema"] = schema_name - headers = { - 'x-odps-owner': new_owner - } + headers = {"x-odps-owner": new_owner} self._client.put( self.resource(), None, action="updateowner", params=params, headers=headers ) diff --git a/odps/models/functions.py b/odps/models/functions.py index a7cac339..bd1d423a 100644 --- a/odps/models/functions.py +++ b/odps/models/functions.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,20 +14,19 @@ # See the License for the specific language governing permissions and # limitations under the License. +from .. import errors, serializers +from ..compat import six from .core import Iterable from .function import Function -from .. 
import serializers, errors -from ..compat import six class Functions(Iterable): - - marker = serializers.XMLNodeField('Marker') - max_items = serializers.XMLNodeField('MaxItems') - functions = serializers.XMLNodesReferencesField(Function, 'Function') + marker = serializers.XMLNodeField("Marker") + max_items = serializers.XMLNodeField("MaxItems") + functions = serializers.XMLNodesReferencesField(Function, "Function") def _name(self): - return 'registration/functions' + return "registration/functions" def _get(self, name): return Function(client=self._client, parent=self, name=name) @@ -53,23 +52,23 @@ def iterate(self, name=None, owner=None, **kw): params = kw.copy() params["expectmarker"] = "true" if name is not None: - params['name'] = name + params["name"] = name if owner is not None: - params['owner'] = owner + params["owner"] = owner schema_name = self._get_schema_name() if schema_name is not None: - params['curr_schema'] = schema_name + params["curr_schema"] = schema_name def _it(): - last_marker = params.get('marker') - if 'marker' in params and (last_marker is None or len(last_marker) == 0): + last_marker = params.get("marker") + if "marker" in params and (last_marker is None or len(last_marker) == 0): return url = self.resource() resp = self._client.get(url, params=params) f = Functions.parse(self._client, resp, obj=self) - params['marker'] = f.marker + params["marker"] = f.marker return f.functions @@ -87,7 +86,7 @@ def create(self, obj=None, **kwargs): if function._client is None: function._client = self._client - headers = {'Content-Type': 'application/xml'} + headers = {"Content-Type": "application/xml"} data = function.serialize() self._client.post( @@ -98,12 +97,14 @@ def create(self, obj=None, **kwargs): return function def update(self, func): - new_func = Function(parent=self, client=self._client, - name=func.name, class_type=func.class_type, - resources=func.resources) - headers = { - 'Content-Type': 'application/xml' - } + new_func = Function( + parent=self, + client=self._client, + name=func.name, + class_type=func.class_type, + resources=func.resources, + ) + headers = {"Content-Type": "application/xml"} data = new_func.serialize() self._client.put( diff --git a/odps/models/instance.py b/odps/models/instance.py index eaa3274f..5fd62353 100644 --- a/odps/models/instance.py +++ b/odps/models/instance.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ # limitations under the License. from __future__ import print_function + import base64 import functools import json @@ -28,23 +29,27 @@ import requests -from .. import serializers, utils, errors, compat, readers, options +from .. 
import compat, errors, options, readers, serializers, utils from ..accounts import BearerTokenAccount from ..compat import Enum, six from ..lib.monotonic import monotonic +from ..lib.tblib import pickling_support from ..utils import to_str -from .core import LazyLoad, XMLRemoteModel, JSONRemoteModel +from .core import JSONRemoteModel, LazyLoad, XMLRemoteModel from .job import Job -from .readers import TunnelRecordReader, TunnelArrowReader -from .worker import WorkerDetail2, LOG_TYPES_MAPPING +from .readers import TunnelArrowReader, TunnelRecordReader +from .tasks import SQLTask +from .worker import LOG_TYPES_MAPPING, WorkerDetail2 logger = logging.getLogger(__name__) +pickling_support.install() _RESULT_LIMIT_HELPER_MSG = ( - 'See https://pyodps.readthedocs.io/zh_CN/latest/base-sql.html#read-sql-exec-result ' - 'for more information about limits on instance results.' + "See https://pyodps.readthedocs.io/zh_CN/latest/base-sql.html#read-sql-exec-result " + "for more information about limits on instance results." ) +_STATUS_QUERY_TIMEOUT = 5 * 60 # timeout when getting status def _with_status_api_lock(func): @@ -63,9 +68,16 @@ def schema(self): @staticmethod def _read_instance_split( - conn, download_id, start, count, idx, - rest_client=None, project=None, instance_id=None, tunnel_endpoint=None, - columns=None + conn, + download_id, + start, + count, + idx, + rest_client=None, + project=None, + instance_id=None, + tunnel_endpoint=None, + columns=None, ): # read part data from ..tunnel import InstanceTunnel @@ -74,15 +86,28 @@ def _read_instance_split( instance_tunnel = InstanceTunnel( client=rest_client, project=project, endpoint=tunnel_endpoint ) - session = instance_tunnel.create_download_session( - instance=instance_id, download_id=download_id + session = utils.call_with_retry( + instance_tunnel.create_download_session, + instance=instance_id, + download_id=download_id, ) - with session.open_record_reader(start, count, columns=columns) as reader: - conn.send((idx, reader.to_pandas(), True)) + + def _data_to_pandas(): + with session.open_record_reader( + start, count, columns=columns + ) as reader: + return reader.to_pandas() + + data = utils.call_with_retry(_data_to_pandas) + conn.send((idx, data, True)) except: - conn.send((idx, sys.exc_info(), False)) + try: + conn.send((idx, sys.exc_info(), False)) + except: + logger.exception("Failed to write in process %d", idx) + raise - def _get_process_split_reader(self, columns=None): + def _get_process_split_reader(self, columns=None, append_partitions=None): # noqa rest_client = self._parent._client project = self._parent.project.name tunnel_endpoint = self._parent.project._tunnel_endpoint @@ -99,17 +124,17 @@ def _get_process_split_reader(self, columns=None): class InstanceRecordReader(SpawnedInstanceReaderMixin, TunnelRecordReader): - def __init__(self, instance, download_session): + def __init__(self, instance, download_session, columns=None): super(InstanceRecordReader, self).__init__( - instance, download_session + instance, download_session, columns=columns ) self._schema = download_session.schema class InstanceArrowReader(SpawnedInstanceReaderMixin, TunnelArrowReader): - def __init__(self, instance, download_session): + def __init__(self, instance, download_session, columns=None): super(InstanceArrowReader, self).__init__( - instance, download_session + instance, download_session, columns=columns ) self._schema = download_session.schema @@ -139,19 +164,19 @@ class Instance(LazyLoad): __slots__ = ( "_task_results", "_is_sync", - 
"_instance_tunnel", "_id_thread_local", "_status_api_lock", "_logview_address", "_logview_address_time", "_logview_logged", + "_job_source", ) - _download_id = utils.thread_local_attribute('_id_thread_local', lambda: None) + _download_id = utils.thread_local_attribute("_id_thread_local", lambda: None) def __init__(self, **kwargs): - if 'task_results' in kwargs: - kwargs['_task_results'] = kwargs.pop('task_results') + if "task_results" in kwargs: + kwargs["_task_results"] = kwargs.pop("task_results") super(Instance, self).__init__(**kwargs) try: @@ -170,37 +195,35 @@ def __init__(self, **kwargs): self._logview_address = None self._logview_address_time = None self._logview_logged = False + self._job_source = None @property def id(self): return self.name class Status(Enum): - RUNNING = 'Running' - SUSPENDED = 'Suspended' - TERMINATED = 'Terminated' + RUNNING = "Running" + SUSPENDED = "Suspended" + TERMINATED = "Terminated" class InstanceStatus(XMLRemoteModel): - _root = 'Instance' + _root = "Instance" - status = serializers.XMLNodeField('Status') + status = serializers.XMLNodeField("Status") class InstanceResult(XMLRemoteModel): - class TaskResult(XMLRemoteModel): - class Result(XMLRemoteModel): - - transform = serializers.XMLNodeAttributeField(attr='Transform') - format = serializers.XMLNodeAttributeField(attr='Format') - text = serializers.XMLNodeField('.', default='') + transform = serializers.XMLNodeAttributeField(attr="Transform") + format = serializers.XMLNodeAttributeField(attr="Format") + text = serializers.XMLNodeField(".", default="") def __str__(self): if six.PY2: text = utils.to_binary(self.text) else: text = self.text - if self.transform is not None and self.transform == 'Base64': + if self.transform is not None and self.transform == "Base64": try: return utils.to_str(base64.b64decode(text)) except TypeError: @@ -209,18 +232,18 @@ def __str__(self): def __bytes__(self): text = utils.to_binary(self.text) - if self.transform is not None and self.transform == 'Base64': + if self.transform is not None and self.transform == "Base64": try: return utils.to_binary(base64.b64decode(text)) except TypeError: return text return text - type = serializers.XMLNodeAttributeField(attr='Type') - name = serializers.XMLNodeField('Name') - result = serializers.XMLNodeReferenceField(Result, 'Result') + type = serializers.XMLNodeAttributeField(attr="Type") + name = serializers.XMLNodeField("Name") + result = serializers.XMLNodeReferenceField(Result, "Result") - task_results = serializers.XMLNodesReferencesField(TaskResult, 'Tasks', 'Task') + task_results = serializers.XMLNodesReferencesField(TaskResult, "Tasks", "Task") class Task(XMLRemoteModel): """ @@ -229,21 +252,28 @@ class Task(XMLRemoteModel): It has a name, a task type, the start to end time, and a running status. 
""" - name = serializers.XMLNodeField('Name') - type = serializers.XMLNodeAttributeField(attr='Type') - start_time = serializers.XMLNodeField('StartTime', parse_callback=utils.parse_rfc822) - end_time = serializers.XMLNodeField('EndTime', parse_callback=utils.parse_rfc822) + name = serializers.XMLNodeField("Name") + type = serializers.XMLNodeAttributeField(attr="Type") + start_time = serializers.XMLNodeField( + "StartTime", parse_callback=utils.parse_rfc822 + ) + end_time = serializers.XMLNodeField( + "EndTime", parse_callback=utils.parse_rfc822 + ) status = serializers.XMLNodeField( - 'Status', parse_callback=lambda s: Instance.Task.TaskStatus(s.upper())) - histories = serializers.XMLNodesReferencesField('Instance.Task', 'Histories', 'History') + "Status", parse_callback=lambda s: Instance.Task.TaskStatus(s.upper()) + ) + histories = serializers.XMLNodesReferencesField( + "Instance.Task", "Histories", "History" + ) class TaskStatus(Enum): - WAITING = 'WAITING' - RUNNING = 'RUNNING' - SUCCESS = 'SUCCESS' - FAILED = 'FAILED' - SUSPENDED = 'SUSPENDED' - CANCELLED = 'CANCELLED' + WAITING = "WAITING" + RUNNING = "RUNNING" + SUCCESS = "SUCCESS" + FAILED = "FAILED" + SUSPENDED = "SUSPENDED" + CANCELLED = "CANCELLED" class TaskProgress(XMLRemoteModel): """ @@ -259,45 +289,62 @@ class TaskProgress(XMLRemoteModel): """ class StageProgress(XMLRemoteModel): - - name = serializers.XMLNodeAttributeField(attr='ID') - backup_workers = serializers.XMLNodeField('BackupWorkers', parse_callback=int) - terminated_workers = serializers.XMLNodeField('TerminatedWorkers', parse_callback=int) - running_workers = serializers.XMLNodeField('RunningWorkers', parse_callback=int) - total_workers = serializers.XMLNodeField('TotalWorkers', parse_callback=int) - input_records = serializers.XMLNodeField('InputRecords', parse_callback=int) - output_records = serializers.XMLNodeField('OutputRecords', parse_callback=int) - finished_percentage = serializers.XMLNodeField('FinishedPercentage', parse_callback=int) - - stages = serializers.XMLNodesReferencesField(StageProgress, 'Stage') + name = serializers.XMLNodeAttributeField(attr="ID") + backup_workers = serializers.XMLNodeField( + "BackupWorkers", parse_callback=int + ) + terminated_workers = serializers.XMLNodeField( + "TerminatedWorkers", parse_callback=int + ) + running_workers = serializers.XMLNodeField( + "RunningWorkers", parse_callback=int + ) + total_workers = serializers.XMLNodeField( + "TotalWorkers", parse_callback=int + ) + input_records = serializers.XMLNodeField( + "InputRecords", parse_callback=int + ) + output_records = serializers.XMLNodeField( + "OutputRecords", parse_callback=int + ) + finished_percentage = serializers.XMLNodeField( + "FinishedPercentage", parse_callback=int + ) + + stages = serializers.XMLNodesReferencesField(StageProgress, "Stage") def get_stage_progress_formatted_string(self): buf = six.StringIO() - buf.write(datetime.now().strftime('%Y-%m-%d %H:%M:%S')) - buf.write(' ') + buf.write(datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + buf.write(" ") for stage in self.stages: - buf.write('{0}:{1}/{2}/{3}{4}[{5}%]\t'.format( - stage.name, - stage.running_workers, - stage.terminated_workers, - stage.total_workers, - '(+%s backups)' % stage.backup_workers if stage.backup_workers > 0 else '', - stage.finished_percentage - )) + buf.write( + "{0}:{1}/{2}/{3}{4}[{5}%]\t".format( + stage.name, + stage.running_workers, + stage.terminated_workers, + stage.total_workers, + "(+%s backups)" % stage.backup_workers + if stage.backup_workers > 0 + else "", + 
stage.finished_percentage, + ) + ) return buf.getvalue() class TaskInfo(serializers.XMLSerializableModel): - _root = 'Instance' - __slots__ = 'key', 'value' + _root = "Instance" + __slots__ = "key", "value" - key = serializers.XMLNodeField('Key') - value = serializers.XMLNodeField('Value') + key = serializers.XMLNodeField("Key") + value = serializers.XMLNodeField("Value") class TaskCost(object): - __slots__ = 'cpu_cost', 'memory_cost', 'input_size' + __slots__ = "cpu_cost", "memory_cost", "input_size" def __init__(self, cpu_cost=None, memory_cost=None, input_size=None): self.cpu_cost = cpu_cost @@ -305,7 +352,7 @@ def __init__(self, cpu_cost=None, memory_cost=None, input_size=None): self.input_size = input_size class SQLCost(object): - __slots__ = 'udf_num', 'complexity', 'input_size' + __slots__ = "udf_num", "complexity", "input_size" def __init__(self, udf_num=None, complexity=None, input_size=None): self.udf_num = udf_num @@ -315,51 +362,49 @@ def __init__(self, udf_num=None, complexity=None, input_size=None): class DownloadSessionCreationError(errors.InternalServerError): pass - name = serializers.XMLNodeField('Name') - owner = serializers.XMLNodeField('Owner') - start_time = serializers.XMLNodeField('StartTime', parse_callback=utils.parse_rfc822) - end_time = serializers.XMLNodeField('EndTime', parse_callback=utils.parse_rfc822) - _status = serializers.XMLNodeField('Status', parse_callback=lambda s: Instance.Status(s)) - _tasks = serializers.XMLNodesReferencesField(Task, 'Tasks', 'Task') - class TaskSummary(dict): def __init__(self, *args, **kwargs): super(Instance.TaskSummary, self).__init__(*args, **kwargs) self.summary_text, self.json_summary = None, None class AnonymousSubmitInstance(XMLRemoteModel): - _root = 'Instance' - job = serializers.XMLNodeReferenceField(Job, 'Job') + _root = "Instance" + job = serializers.XMLNodeReferenceField(Job, "Job") class InstanceQueueingInfo(JSONRemoteModel): - __slots__ = '_instance', + __slots__ = ("_instance",) class Status(Enum): - RUNNING = 'Running' - SUSPENDED = 'Suspended' - TERMINATED = 'Terminated' - UNKNOWN = 'Unknown' + RUNNING = "Running" + SUSPENDED = "Suspended" + TERMINATED = "Terminated" + UNKNOWN = "Unknown" _properties = serializers.JSONRawField() # hold the raw dict - instance_id = serializers.JSONNodeField('instanceId') - priority = serializers.JSONNodeField('instancePriority') - progress = serializers.JSONNodeField('instanceProcess') - job_name = serializers.JSONNodeField('jobName') - project = serializers.JSONNodeField('projectName') - skynet_id = serializers.JSONNodeField('skynetId') - start_time = serializers.JSONNodeField('startTime', parse_callback=utils.strptime_with_tz) - task_type = serializers.JSONNodeField('taskType') - task_name = serializers.JSONNodeField('taskName') - user_account = serializers.JSONNodeField('userAccount') - status = serializers.JSONNodeField('status', parse_callback=Status) + instance_id = serializers.JSONNodeField("instanceId") + priority = serializers.JSONNodeField("instancePriority") + progress = serializers.JSONNodeField("instanceProcess") + job_name = serializers.JSONNodeField("jobName") + project = serializers.JSONNodeField("projectName") + skynet_id = serializers.JSONNodeField("skynetId") + start_time = serializers.JSONNodeField( + "startTime", parse_callback=utils.strptime_with_tz + ) + task_type = serializers.JSONNodeField("taskType") + task_name = serializers.JSONNodeField("taskName") + user_account = serializers.JSONNodeField("userAccount") + status = 
serializers.JSONNodeField("status", parse_callback=Status) @property def instance(self): - if hasattr(self, '_instance') and self._instance: + if hasattr(self, "_instance") and self._instance: return self._instance from .projects import Projects - self._instance = Projects(client=self._client)[self.project].instances[self.instance_id] + + self._instance = Projects(client=self._client)[self.project].instances[ + self.instance_id + ] return self._instance def __getattr__(self, item): @@ -370,14 +415,30 @@ def __getattr__(self, item): return super(Instance.InstanceQueueingInfo, self).__getattr__(item) - def reload(self): - resp = self._client.get(self.resource()) + name = serializers.XMLNodeField("Name") + owner = serializers.XMLNodeField("Owner") + start_time = serializers.XMLNodeField( + "StartTime", parse_callback=utils.parse_rfc822 + ) + end_time = serializers.XMLNodeField("EndTime", parse_callback=utils.parse_rfc822) + _status = serializers.XMLNodeField( + "Status", parse_callback=lambda s: Instance.Status(s) + ) + _tasks = serializers.XMLNodesReferencesField(Task, "Tasks", "Task") + + def reload(self, blocking=False): + actions = [] + if blocking: + actions.append("instancestatus") + resp = self._client.get(self.resource(), actions=actions) - self.owner = resp.headers.get('x-odps-owner') - self.start_time = utils.parse_rfc822(resp.headers.get('x-odps-start-time')) - end_time_header = 'x-odps-end-time' - if end_time_header in resp.headers and \ - len(resp.headers[end_time_header].strip()) > 0: + self.owner = resp.headers.get("x-odps-owner") + self.start_time = utils.parse_rfc822(resp.headers.get("x-odps-start-time")) + end_time_header = "x-odps-end-time" + if ( + end_time_header in resp.headers + and len(resp.headers[end_time_header].strip()) > 0 + ): self.end_time = utils.parse_rfc822(resp.headers.get(end_time_header)) self.parse(self._client, resp, obj=self) @@ -390,24 +451,37 @@ def stop(self): :return: None """ - instance_status = Instance.InstanceStatus(status='Terminated') + instance_status = Instance.InstanceStatus(status="Terminated") xml_content = instance_status.serialize() - headers = {'Content-Type': 'application/xml'} + headers = {"Content-Type": "application/xml"} self._client.put(self.resource(), xml_content, headers=headers) + @staticmethod + def _call_with_retry(func, retry=False, retry_timeout=None): + retry_kw = { + "retry_times": options.retry_times if retry else 0, + "exc_type": (errors.InternalServerError, errors.RequestTimeTooSkewed), + } + if retry and retry_timeout is not None: + # use retry timeout instead of retry count + retry_kw.update({"retry_times": None, "retry_timeout": retry_timeout}) + return utils.call_with_retry(func, **retry_kw) + @_with_status_api_lock - def get_task_results_without_format(self, timeout=None): + def get_task_results_without_format(self, timeout=None, retry=True): if self._is_sync: return self._task_results - resp = self._client.get(self.resource(), action="result", timeout=timeout) + def _get_resp(): + return self._client.get(self.resource(), action="result", timeout=timeout) + resp = self._call_with_retry(_get_resp, retry=retry, retry_timeout=timeout) instance_result = Instance.InstanceResult.parse(self._client, resp) return OrderedDict([(r.name, r.result) for r in instance_result.task_results]) @_with_status_api_lock - def get_task_results(self, timeout=None): + def get_task_results(self, timeout=None, retry=True): """ Get all the task results. 
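The hunks above route status and result requests through a new Instance._call_with_retry helper, which defers to utils.call_with_retry and switches between a bounded attempt count (options.retry_times) and a wall-clock retry_timeout. The standalone sketch below is not the odps.utils implementation; it only illustrates that count-or-timeout retry semantics, and the names call_with_retry, delay and flaky are illustrative placeholders.

import time


def call_with_retry(func, retry_times=3, retry_timeout=None,
                    exc_type=(Exception,), delay=0.1):
    # Retry func() on the given exception types. Retries are bounded either
    # by an attempt count (retry_times) or, when retry_times is None, by a
    # wall-clock budget in seconds (retry_timeout).
    start = time.monotonic()
    attempt = 0
    while True:
        try:
            return func()
        except exc_type:
            attempt += 1
            if retry_times is not None:
                if attempt > retry_times:
                    raise
            elif retry_timeout is None or time.monotonic() - start >= retry_timeout:
                raise
            time.sleep(delay)


calls = {"n": 0}


def flaky():
    # Fails twice, then succeeds, standing in for a transient server error.
    calls["n"] += 1
    if calls["n"] < 3:
        raise IOError("transient failure")
    return "ok"


print(call_with_retry(flaky, retry_times=5, exc_type=(IOError,)))  # -> ok

Passing retry_times=None together with a retry_timeout reproduces the timeout-bounded branch that get_task_results_without_format uses above.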
@@ -415,14 +489,25 @@ def get_task_results(self, timeout=None): :rtype: dict """ - results = self.get_task_results_without_format(timeout=timeout) + results = self.get_task_results_without_format(timeout=timeout, retry=retry) if options.tunnel.string_as_binary: - return OrderedDict([(k, bytes(result)) for k, result in six.iteritems(results)]) + return OrderedDict( + [(k, bytes(result)) for k, result in six.iteritems(results)] + ) else: - return OrderedDict([(k, str(result)) for k, result in six.iteritems(results)]) + return OrderedDict( + [(k, str(result)) for k, result in six.iteritems(results)] + ) + + def _get_default_task_name(self): + job = self._get_job() + if len(job.tasks) != 1: + msg = "No tasks" if len(job.tasks) == 0 else "Multiple tasks" + raise errors.ODPSError("%s in instance." % msg) + return job.tasks[0].name @_with_status_api_lock - def get_task_result(self, task_name, timeout=None): + def get_task_result(self, task_name=None, timeout=None, retry=True): """ Get a single task result. @@ -430,10 +515,11 @@ def get_task_result(self, task_name, timeout=None): :return: task result :rtype: str """ - return self.get_task_results(timeout=timeout).get(task_name) + task_name = task_name or self._get_default_task_name() + return self.get_task_results(timeout=timeout, retry=retry).get(task_name) @_with_status_api_lock - def get_task_summary(self, task_name): + def get_task_summary(self, task_name=None): """ Get a task's summary, mostly used for MapReduce. @@ -441,37 +527,40 @@ def get_task_summary(self, task_name): :return: summary as a dict parsed from JSON :rtype: dict """ - - params = {'taskname': task_name} + task_name = task_name or self._get_default_task_name() + params = {"taskname": task_name} resp = self._client.get( - self.resource(), action='instancesummary', params=params + self.resource(), action="instancesummary", params=params ) - map_reduce = resp.json().get('Instance') + map_reduce = resp.json().get("Instance") if map_reduce: - json_summary = map_reduce.get('JsonSummary') + json_summary = map_reduce.get("JsonSummary") if json_summary: summary = Instance.TaskSummary(json.loads(json_summary)) - summary.summary_text = map_reduce.get('Summary') + summary.summary_text = map_reduce.get("Summary") summary.json_summary = json_summary return summary @_with_status_api_lock - def get_task_statuses(self): + def get_task_statuses(self, retry=True, timeout=None): """ Get all tasks' statuses :return: a dict which key is the task name and value is the :class:`odps.models.Instance.Task` object :rtype: dict """ - resp = self._client.get(self.resource(), action="taskstatus") - self.parse(self._client, resp, obj=self) + def _get_resp(): + return self._client.get(self.resource(), action="taskstatus") + + resp = self._call_with_retry(_get_resp, retry=retry, retry_timeout=timeout) + self.parse(self._client, resp, obj=self) return dict([(task.name, task) for task in self._tasks]) @_with_status_api_lock - def get_task_names(self): + def get_task_names(self, retry=True, timeout=None): """ Get names of all tasks @@ -479,9 +568,9 @@ def get_task_names(self): :rtype: list """ - return compat.lkeys(self.get_task_statuses()) + return compat.lkeys(self.get_task_statuses(retry=retry, timeout=timeout)) - def get_task_cost(self, task_name): + def get_task_cost(self, task_name=None): """ Get task cost @@ -499,54 +588,78 @@ def get_task_cost(self, task_name): >>> cost.input_size 0 """ + task_name = task_name or self._get_default_task_name() summary = self.get_task_summary(task_name) if summary is None: 
return None - if 'Cost' in summary: - task_cost = summary['Cost'] + if "Cost" in summary: + task_cost = summary["Cost"] - cpu_cost = task_cost.get('CPU') - memory = task_cost.get('Memory') - input_size = task_cost.get('Input') + cpu_cost = task_cost.get("CPU") + memory = task_cost.get("Memory") + input_size = task_cost.get("Input") return Instance.TaskCost(cpu_cost, memory, input_size) - def get_task_info(self, task_name, key): + def _raise_empty_task_info(self, resp): + raise errors.EmptyTaskInfoError( + "Empty response. Task server maybe dead.", + code=resp.status_code, + instance_id=self.id, + endpoint=self._client.endpoint, + request_id=resp.headers.get("x-odps-request-id"), + tag="ODPS", + ) + + def get_task_info(self, task_name, key, raise_empty=False): """ Get task related information. :param task_name: name of the task :param key: key of the information item + :param raise_empty: if True, will raise error when response is empty :return: a string of the task information """ actions = ["info"] - params = OrderedDict([('taskname', task_name), ('key', key)]) + params = OrderedDict([("taskname", task_name), ("key", key)]) resp = self._client.get(self.resource(), actions=actions, params=params) - return resp.content.decode() + resp_data = resp.content.decode() + if raise_empty and not resp_data: + self._raise_empty_task_info(resp) + return resp_data - def put_task_info(self, task_name, key, value, check_location=False): + def put_task_info( + self, task_name, key, value, check_location=False, raise_empty=False + ): """ Put information into a task. :param task_name: name of the task :param key: key of the information item :param value: value of the information item + :param check_location: raises if Location header is missing + :param raise_empty: if True, will raise error when response is empty """ actions = ["info"] - params = {'taskname': task_name} - headers = {'Content-Type': 'application/xml'} + params = {"taskname": task_name} + headers = {"Content-Type": "application/xml"} body = self.TaskInfo(key=key, value=value).serialize() - resp = self._client.put(self.resource(), actions=actions, params=params, headers=headers, data=body) + resp = self._client.put( + self.resource(), actions=actions, params=params, headers=headers, data=body + ) - location = resp.headers.get('Location') + location = resp.headers.get("Location") if check_location and (location is None or len(location) == 0): - raise errors.ODPSError('Invalid response, Location header required.') - return resp.content.decode() + raise errors.ODPSError("Invalid response, Location header required.") + resp_data = resp.content.decode() + if raise_empty and not resp_data: + self._raise_empty_task_info(resp) + return resp_data - def get_task_quota(self, task_name): + def get_task_quota(self, task_name=None): """ Get queueing info of the task. Note that time between two calls should larger than 30 seconds, otherwise empty dict is returned. 
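The raise_empty flag added to get_task_info and put_task_info above turns an empty response body from the task server into an explicit EmptyTaskInfoError instead of an empty string. A hedged usage sketch follows; the credentials, instance id, task name and key are placeholders, and catching the broad ODPSError assumes EmptyTaskInfoError derives from it.

from odps import ODPS, errors

# Placeholders; substitute real credentials and identifiers.
o = ODPS("<access-id>", "<secret-key>", project="<project>",
         endpoint="<endpoint>")
inst = o.get_instance("<instance-id>")

try:
    # With raise_empty=True an empty body raises instead of silently
    # returning "".
    value = inst.get_task_info("task_1", "some_key", raise_empty=True)
    print(value)
except errors.ODPSError as ex:
    print("task info unavailable:", ex)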
@@ -554,8 +667,9 @@ def get_task_quota(self, task_name): :param task_name: name of the task :return: quota info in dict format """ - actions = ['instancequota'] - params = {'taskname': task_name} + task_name = task_name or self._get_default_task_name() + actions = ["instancequota"] + params = {"taskname": task_name} resp = self._client.get(self.resource(), actions=actions, params=params) return json.loads(resp.text) @@ -571,48 +685,50 @@ def get_sql_task_cost(self): """ resp = self.get_task_result(self.get_task_names()[0]) cost = json.loads(resp) - sql_cost = cost['Cost']['SQL'] + sql_cost = cost["Cost"]["SQL"] - udf_num = sql_cost.get('UDF') - complexity = sql_cost.get('Complexity') - input_size = sql_cost.get('Input') + udf_num = sql_cost.get("UDF") + complexity = sql_cost.get("Complexity") + input_size = sql_cost.get("Input") return Instance.SQLCost(udf_num, complexity, input_size) + def _get_status(self, blocking=False): + if self._status != Instance.Status.TERMINATED: + self.reload(blocking) + return self._status + @property @_with_status_api_lock def status(self): - if self._status != Instance.Status.TERMINATED: - self.reload() + return self._get_status() - return self._status - - def is_terminated(self, retry=False): + def is_terminated(self, retry=True, blocking=False, retry_timeout=None): """ If this instance has finished or not. :return: True if finished else False :rtype: bool """ - return utils.call_with_retry( - lambda: self.status == Instance.Status.TERMINATED, - retry_times=options.retry_times if retry else 0, - exc_type=(errors.InternalServerError, errors.RequestTimeTooSkewed), + return self._call_with_retry( + lambda: self._get_status(blocking) == Instance.Status.TERMINATED, + retry=retry, + retry_timeout=retry_timeout, ) - def is_running(self, retry=False): + def is_running(self, retry=True, blocking=False, retry_timeout=None): """ If this instance is still running. :return: True if still running else False :rtype: bool """ - return utils.call_with_retry( - lambda: self.status == Instance.Status.RUNNING, - retry_times=options.retry_times if retry else 0, - exc_type=(errors.InternalServerError, errors.RequestTimeTooSkewed), + return self._call_with_retry( + lambda: self._get_status(blocking) == Instance.Status.RUNNING, + retry=retry, + retry_timeout=retry_timeout, ) - def is_successful(self, retry=False): + def is_successful(self, retry=True, retry_timeout=None): """ If the instance runs successfully. @@ -630,10 +746,8 @@ def _get_successful(): for task in statuses.values() ) - return utils.call_with_retry( - _get_successful, - retry_times=options.retry_times if retry else 0, - exc_type=(errors.InternalServerError, errors.RequestTimeTooSkewed), + return self._call_with_retry( + _get_successful, retry=retry, retry_timeout=retry_timeout ) @property @@ -646,7 +760,9 @@ def get_all_task_progresses(self): for task_name in self.get_task_names() } - def wait_for_completion(self, interval=1, timeout=None, max_interval=None): + def wait_for_completion( + self, interval=1, timeout=None, max_interval=None, blocking=True + ): """ Wait for the instance to complete, and neglect the consequence. @@ -654,21 +770,27 @@ def wait_for_completion(self, interval=1, timeout=None, max_interval=None): :param max_interval: if specified, next check interval will be multiplied by 2 till max_interval is reached. :param timeout: time + :param blocking: whether to block waiting at server side. Note that this option does + not affect client behavior. 
:return: None """ - start_time = monotonic() - progress_time = monotonic() + start_time = check_time = progress_time = monotonic() last_progress = 0 - while not self.is_terminated(retry=True): + while not self.is_terminated( + retry=True, blocking=blocking, retry_timeout=_STATUS_QUERY_TIMEOUT + ): try: - time.sleep(interval) + sleep_interval_left = interval - (monotonic() - check_time) + if sleep_interval_left > 0: + time.sleep(sleep_interval_left) check_time = monotonic() if max_interval is not None: interval = min(interval * 2, max_interval) if timeout is not None and check_time - start_time > timeout: raise errors.WaitTimeoutError( - "Wait completion of instance %s timed out" % self.id, instance_id=self.id + "Wait completion of instance %s timed out" % self.id, + instance_id=self.id, ) if logger.getEffectiveLevel() <= logging.INFO: @@ -679,14 +801,21 @@ def wait_for_completion(self, interval=1, timeout=None, max_interval=None): for progress in task_progresses.values() for stage in progress.stages ) - if check_time - start_time >= options.progress_time_interval and ( - total_progress - last_progress >= options.progress_percentage_gap - or check_time - progress_time >= options.progress_time_interval + if ( + check_time - start_time >= options.progress_time_interval + and ( + total_progress - last_progress + >= options.progress_percentage_gap + or check_time - progress_time + >= options.progress_time_interval + ) ): if not self._logview_logged: self._logview_logged = True logger.info( - "Instance ID: %s\n Log view: %s", self.id, self.get_logview_address() + "Instance ID: %s\n Log view: %s", + self.id, + self.get_logview_address(), ) output_parts = [str(self.id)] + [ @@ -703,7 +832,9 @@ def wait_for_completion(self, interval=1, timeout=None, max_interval=None): except KeyboardInterrupt: break - def wait_for_success(self, interval=1, timeout=None, max_interval=None): + def wait_for_success( + self, interval=1, timeout=None, max_interval=None, blocking=True + ): """ Wait for instance to complete, and check if the instance is successful. @@ -711,11 +842,18 @@ def wait_for_success(self, interval=1, timeout=None, max_interval=None): :param max_interval: if specified, next check interval will be multiplied by 2 till max_interval is reached. :param timeout: time + :param blocking: whether to block waiting at server side. Note that this option does + not affect client behavior. 
:return: None :raise: :class:`odps.errors.ODPSError` if the instance failed """ - self.wait_for_completion(interval=interval, max_interval=max_interval, timeout=timeout) + self.wait_for_completion( + interval=interval, + max_interval=max_interval, + timeout=timeout, + blocking=blocking, + ) if not self.is_successful(retry=True): for task_name, task in six.iteritems(self.get_task_statuses()): @@ -723,13 +861,15 @@ def wait_for_success(self, interval=1, timeout=None, max_interval=None): if task.status == Instance.Task.TaskStatus.FAILED: exc = errors.parse_instance_error(self.get_task_result(task_name)) elif task.status != Instance.Task.TaskStatus.SUCCESS: - exc = errors.ODPSError('%s, status=%s' % (task_name, task.status.value)) + exc = errors.ODPSError( + "%s, status=%s" % (task_name, task.status.value) + ) if exc: exc.instance_id = self.id raise exc @_with_status_api_lock - def get_task_progress(self, task_name): + def get_task_progress(self, task_name=None): """ Get task's current progress @@ -737,14 +877,14 @@ def get_task_progress(self, task_name): :return: the task's progress :rtype: :class:`odps.models.Instance.Task.TaskProgress` """ - - params = {'instanceprogress': task_name, 'taskname': task_name} + task_name = task_name or self._get_default_task_name() + params = {"instanceprogress": task_name, "taskname": task_name} resp = self._client.get(self.resource(), params=params) return Instance.Task.TaskProgress.parse(self._client, resp) @_with_status_api_lock - def get_task_detail(self, task_name): + def get_task_detail(self, task_name=None): """ Get task's detail @@ -752,18 +892,20 @@ def get_task_detail(self, task_name): :return: the task's detail :rtype: list or dict according to the JSON """ + def _get_detail(): from ..compat import json # fix object_pairs_hook parameter for Py2.6 - params = {'taskname': task_name} + params = {"taskname": task_name} resp = self._client.get( - self.resource(), action='instancedetail', params=params + self.resource(), action="instancedetail", params=params ) return json.loads( resp.content.decode() if six.PY3 else resp.content, - object_pairs_hook=OrderedDict + object_pairs_hook=OrderedDict, ) + task_name = task_name or self._get_default_task_name() result = _get_detail() if not result: # todo: this is a workaround for the bug that get_task_detail returns nothing. @@ -772,7 +914,7 @@ def _get_detail(): return result @_with_status_api_lock - def get_task_detail2(self, task_name): + def get_task_detail2(self, task_name=None, **kw): """ Get task's detail v2 @@ -780,9 +922,12 @@ def get_task_detail2(self, task_name): :return: the task's detail :rtype: list or dict according to the JSON """ - params = {'taskname': task_name} + task_name = task_name or self._get_default_task_name() + params = {"taskname": task_name} + if "subquery_id" in kw: + params["subquery_id"] = str(kw.pop("subquery_id")) - resp = self._client.get(self.resource(), action='detail', params=params) + resp = self._client.get(self.resource(), action="detail", params=params) res = resp.content.decode() if six.PY3 else resp.content try: return json.loads(res, object_pairs_hook=OrderedDict) @@ -799,12 +944,14 @@ def get_task_workers(self, task_name=None, json_obj=None): .. 
seealso:: :class:`odps.models.Worker` """ - if task_name is None and json_obj is None: - raise ValueError('Either task_name or json_obj should be provided') + if json_obj is None: + task_name = task_name or self._get_default_task_name() if json_obj is None: json_obj = self.get_task_detail2(task_name) - return WorkerDetail2.extract_from_json(json_obj, client=self._client, parent=self) + return WorkerDetail2.extract_from_json( + json_obj, client=self._client, parent=self + ) @_with_status_api_lock def get_worker_log(self, log_id, log_type, size=0): @@ -816,18 +963,23 @@ def get_worker_log(self, log_id, log_type, size=0): :param size: length of the log to retrieve :return: log content """ - params = OrderedDict([('log', ''), ('id', log_id)]) + params = OrderedDict([("log", ""), ("id", log_id)]) if log_type is not None: log_type = log_type.lower() if log_type not in LOG_TYPES_MAPPING: - raise ValueError('log_type should choose a value in ' + - ' '.join(six.iterkeys(LOG_TYPES_MAPPING))) - params['logtype'] = LOG_TYPES_MAPPING[log_type] + raise ValueError( + "log_type should choose a value in " + + " ".join(six.iterkeys(LOG_TYPES_MAPPING)) + ) + params["logtype"] = LOG_TYPES_MAPPING[log_type] if size > 0: - params['size'] = str(size) + params["size"] = str(size) resp = self._client.get(self.resource(), params=params) return resp.text - get_worker_log.__doc__ = get_worker_log.__doc__.format(log_types=', '.join(sorted(six.iterkeys(LOG_TYPES_MAPPING)))) + + get_worker_log.__doc__ = get_worker_log.__doc__.format( + log_types=", ".join(sorted(six.iterkeys(LOG_TYPES_MAPPING))) + ) @_with_status_api_lock def get_logview_address(self, hours=None): @@ -850,12 +1002,15 @@ def get_logview_address(self, hours=None): else: hours = hours or options.logview_hours policy = { - 'Statement': [{ - 'Action': ['odps:Read'], - 'Effect': 'Allow', - 'Resource': 'acs:odps:*:projects/%s/instances/%s' % (project.name, self.id) - }], - 'Version': '1', + "Statement": [ + { + "Action": ["odps:Read"], + "Effect": "Allow", + "Resource": "acs:odps:*:projects/%s/instances/%s" + % (project.name, self.id), + } + ], + "Version": "1", } token = self.project.generate_auth_token(policy, "bearer", hours) @@ -878,11 +1033,12 @@ def __str__(self): return self.id def _get_job(self): - url = self.resource() - resp = self._client.get(url, action='source') + if not self._job_source: + url = self.resource() + resp = self._client.get(url, action="source") - job = Job.parse(self._client, resp, parent=self) - return job + self._job_source = Job.parse(self._client, resp, parent=self) + return self._job_source def get_tasks(self): return self.tasks @@ -897,50 +1053,78 @@ def priority(self): job = self._get_job() return job.priority - def _get_queueing_info(self): + def _get_queueing_info(self, **kw): + params = {} + if "subquery_id" in kw: + params["subquery_id"] = str(kw.pop("subquery_id")) + url = self.resource() - resp = self._client.get(url, action='cached') - return Instance.InstanceQueueingInfo.parse( - self._client, resp, parent=self.project.instance_queueing_infos), resp + resp = self._client.get(url, action="cached", params=params) + return ( + Instance.InstanceQueueingInfo.parse( + self._client, resp, parent=self.project.instance_queueing_infos + ), + resp, + ) def get_queueing_info(self): info, _ = self._get_queueing_info() return info - def _create_instance_tunnel(self, endpoint=None): - if self._instance_tunnel is not None: - return self._instance_tunnel - - from ..tunnel import InstanceTunnel - - self._instance_tunnel = 
InstanceTunnel( - client=self._client, project=self.project, - endpoint=endpoint or self.project._tunnel_endpoint - ) - return self._instance_tunnel + def get_sql_query(self): + task = [t for t in self.tasks if isinstance(t, SQLTask)] + if not task: + raise errors.ODPSError("Instance %s does not contain a SQLTask.", self.id) + if len(task) > 1: # pragma: no cover + raise errors.ODPSError("Multiple SQLTasks exist in instance %s.", self.id) + return task[0].query - @utils.survey - def _open_result_reader(self, schema=None, task_name=None, timeout=None, **_): + def _check_get_task_name(self, task_type, task_name=None, err_head=None): if not self.is_successful(retry=True): raise errors.ODPSError( - 'Cannot open reader, instance(%s) may fail or has not finished yet' % self.id) + "%s, instance(%s) may fail or has not finished yet" + % (err_head, self.id) + ) - sql_tasks = dict([(name, task) for name, task in six.iteritems(self.get_task_statuses()) - if task.type.lower() == 'sql']) - if len(sql_tasks) > 1: + task_type = task_type.lower() + filtered_tasks = { + name: task + for name, task in six.iteritems(self.get_task_statuses()) + if task.type.lower() == task_type + } + if len(filtered_tasks) > 1: if task_name is None: raise errors.ODPSError( - 'Cannot open reader, job has more than one sql tasks, please specify one') - elif task_name not in sql_tasks: + "%s, job has more than one %s tasks, please specify one" + % (err_head, task_type) + ) + elif task_name not in filtered_tasks: raise errors.ODPSError( - 'Cannot open reader, unknown task name: %s' % task_name) - elif len(sql_tasks) == 1: - task_name = list(sql_tasks)[0] + "%s, unknown task name: %s" % (err_head, task_name) + ) + return task_name + elif len(filtered_tasks) == 1: + return list(filtered_tasks)[0] else: - raise errors.ODPSError('Cannot open reader, job has no sql task') + raise errors.ODPSError("%s, job has no %s task" % (err_head, task_type)) + + def _create_instance_tunnel(self, endpoint=None, quota_name=None): + from ..tunnel import InstanceTunnel + + return InstanceTunnel( + client=self._client, + project=self.project, + quota_name=quota_name, + endpoint=endpoint or self.project._tunnel_endpoint, + ) + @utils.survey + def _open_result_reader(self, schema=None, task_name=None, timeout=None, **kw): + task_name = self._check_get_task_name( + "sql", task_name=task_name, err_head="Cannot open reader" + ) result = self.get_task_result(task_name, timeout=timeout) - reader = readers.CsvRecordReader(schema, result) + reader = readers.CsvRecordReader(schema, result, **kw) if options.result_reader_create_callback: options.result_reader_create_callback(reader) return reader @@ -948,17 +1132,23 @@ def _open_result_reader(self, schema=None, task_name=None, timeout=None, **_): def _open_tunnel_reader(self, **kw): from ..tunnel.instancetunnel import InstanceDownloadSession - reopen = kw.pop('reopen', False) - endpoint = kw.pop('endpoint', None) + reopen = kw.pop("reopen", False) + endpoint = kw.pop("endpoint", None) + quota_name = kw.pop("quota_name", None) arrow = kw.pop("arrow", False) - tunnel = self._create_instance_tunnel(endpoint=endpoint) + columns = kw.pop("columns", None) + + tunnel = self._create_instance_tunnel(endpoint=endpoint, quota_name=quota_name) download_id = self._download_id if not reopen else None try: download_session = tunnel.create_download_session( instance=self, download_id=download_id, **kw ) - if download_id and download_session.status != InstanceDownloadSession.Status.Normal: + if ( + download_id + and 
download_session.status != InstanceDownloadSession.Status.Normal + ): download_session = tunnel.create_download_session(instance=self, **kw) except errors.InternalServerError: e, tb = sys.exc_info()[1:] @@ -968,9 +1158,9 @@ def _open_tunnel_reader(self, **kw): self._download_id = download_session.id if arrow: - return InstanceArrowReader(self, download_session) + return InstanceArrowReader(self, download_session, columns=columns) else: - return InstanceRecordReader(self, download_session) + return InstanceRecordReader(self, download_session, columns=columns) def open_reader(self, *args, **kwargs): """ @@ -1004,31 +1194,37 @@ def open_reader(self, *args, **kwargs): >>> for record in reader[0: count]: >>> # read all data, actually better to split into reading for many times """ - use_tunnel = kwargs.get('use_tunnel', kwargs.get('tunnel')) + use_tunnel = kwargs.get("use_tunnel", kwargs.get("tunnel")) auto_fallback_result = use_tunnel is None timeout = kwargs.pop("timeout", None) if use_tunnel is None: use_tunnel = options.tunnel.use_instance_tunnel if use_tunnel: - timeout = timeout if timeout is not None else options.tunnel.legacy_fallback_timeout + timeout = ( + timeout + if timeout is not None + else options.tunnel.legacy_fallback_timeout + ) kwargs["timeout"] = timeout result_fallback_errors = ( - errors.InvalidProjectTable, errors.InvalidArgument, errors.NoSuchProject + errors.InvalidProjectTable, + errors.InvalidArgument, + errors.NoSuchProject, ) if use_tunnel: # for compatibility - if 'limit_enabled' in kwargs: - kwargs['limit'] = kwargs['limit_enabled'] - del kwargs['limit_enabled'] + if "limit_enabled" in kwargs: + kwargs["limit"] = kwargs["limit_enabled"] + del kwargs["limit_enabled"] - if 'limit' not in kwargs: - kwargs['limit'] = options.tunnel.limit_instance_tunnel + if "limit" not in kwargs: + kwargs["limit"] = options.tunnel.limit_instance_tunnel auto_fallback_protection = False - if kwargs['limit'] is None: - kwargs['limit'] = False + if kwargs["limit"] is None: + kwargs["limit"] = False auto_fallback_protection = True try: @@ -1037,21 +1233,27 @@ def open_reader(self, *args, **kwargs): # service version too low to support instance tunnel. if not auto_fallback_result: raise - if not kwargs.get('limit'): - warnings.warn('Instance tunnel not supported, will fallback to ' - 'conventional ways. 10000 records will be limited. ' - + _RESULT_LIMIT_HELPER_MSG) + if not kwargs.get("limit"): + warnings.warn( + "Instance tunnel not supported, will fallback to " + "conventional ways. 10000 records will be limited. " + + _RESULT_LIMIT_HELPER_MSG + ) except requests.Timeout: # tunnel creation timed out, which might be caused by too many files # on the service. if not auto_fallback_result: raise - if not kwargs.get('limit'): + if not kwargs.get("limit"): warnings.warn( - 'Instance tunnel timed out, will fallback to conventional ways. ' - '10000 records will be limited. You may try merging small files ' - 'on your source table. ' + _RESULT_LIMIT_HELPER_MSG) - except (Instance.DownloadSessionCreationError, errors.InstanceTypeNotSupported): + "Instance tunnel timed out, will fallback to conventional ways. " + "10000 records will be limited. You may try merging small files " + "on your source table. " + _RESULT_LIMIT_HELPER_MSG + ) + except ( + Instance.DownloadSessionCreationError, + errors.InstanceTypeNotSupported, + ): # this is for DDL sql instances such as `show partitions` which raises # InternalServerError when creating download sessions. 
if not auto_fallback_result: @@ -1060,10 +1262,115 @@ def open_reader(self, *args, **kwargs): # project is protected if not auto_fallback_protection: raise - if not kwargs.get('limit'): - warnings.warn('Project under protection, 10000 records will be limited.' - + _RESULT_LIMIT_HELPER_MSG) - kwargs['limit'] = True + if not kwargs.get("limit"): + warnings.warn( + "Project under protection, 10000 records will be limited." + + _RESULT_LIMIT_HELPER_MSG + ) + kwargs["limit"] = True return self.open_reader(*args, **kwargs) return self._open_result_reader(*args, **kwargs) + + def _iter_reader_with_pandas(self, iter_func, **kw): + try: + with self.open_reader(**kw) as reader: + for batch in iter_func(reader): + yield batch + except (errors.ChecksumError, errors.MethodNotAllowed): + # arrow tunnel not implemented or not supported + kw.pop("arrow", None) + with self.open_reader(**kw) as reader: + for batch in iter_func(reader): + yield batch + + def to_pandas( + self, + columns=None, + limit=None, + start=None, + count=None, + n_process=1, + quota_name=None, + tags=None, + **kwargs + ): + """ + Read instance data into pandas DataFrame. The limit argument follows definition + of `open_reader` API. + + :param list columns: columns to read + :param bool limit: if True, enable the limitation + :param int start: start row index from 0 + :param int count: data count to read + :param int n_process: number of processes to accelerate reading + :param str quota_name: name of tunnel quota to use + """ + try: + import pyarrow as pa + except ImportError: + pa = None + + kw = dict( + limit=limit, + columns=columns, + arrow=pa is not None, + quota_name=quota_name, + tags=tags, + **kwargs + ) + if limit is None: + kw.pop("limit") + + def _it(reader): + yield reader.to_pandas(start=start, count=count, n_process=n_process) + + return next(self._iter_reader_with_pandas(_it, **kw)) + + def iter_pandas( + self, + columns=None, + limit=None, + batch_size=None, + start=None, + count=None, + quota_name=None, + tags=None, + **kwargs + ): + """ + Iterate table data in blocks as pandas DataFrame. The limit argument + follows definition of `open_reader` API. + + :param list columns: columns to read + :param bool limit: if True, enable the limitation + :param int batch_size: size of DataFrame batch to read + :param int start: start row index from 0 + :param int count: data count to read + :param str quota_name: name of tunnel quota to use + """ + try: + import pyarrow as pa + except ImportError: + pa = None + + batch_size = batch_size or options.tunnel.read_row_batch_size + kw = dict( + limit=limit, + columns=columns, + arrow=pa is not None, + quota_name=quota_name, + tags=tags, + **kwargs + ) + if limit is None: + kw.pop("limit") + + def _it(reader): + for batch in reader.iter_pandas( + batch_size, start=start, count=count, columns=columns + ): + yield batch + + for batch in self._iter_reader_with_pandas(_it, **kw): + yield batch diff --git a/odps/models/instances.py b/odps/models/instances.py index 3564433a..be586874 100644 --- a/odps/models/instances.py +++ b/odps/models/instances.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,13 +18,13 @@ from collections import OrderedDict from datetime import datetime +from .. 
import errors, serializers, utils +from ..compat import six +from ..config import options from .core import Iterable from .instance import Instance -from .session import SessionInstance from .job import Job -from .. import serializers, errors, utils -from ..compat import six -from ..config import options +from .session import SessionInstance class BaseInstances(Iterable): @@ -48,17 +48,26 @@ def __contains__(self, item): def __iter__(self): return self.iterate() - def iterate(self, start_time=None, end_time=None, status=None, only_owner=None, - max_items=None, job_name=None, quota_index=None, **kw): - if 'from_time' in kw: - start_time = kw['from_time'] + def iterate( + self, + start_time=None, + end_time=None, + status=None, + only_owner=None, + max_items=None, + job_name=None, + quota_index=None, + **kw + ): + if "from_time" in kw: + start_time = kw["from_time"] if isinstance(status, six.string_types): status = Instance.Status(status.capitalize()) params = dict() if status is not None: - params['status'] = status.value + params["status"] = status.value if start_time is not None or end_time is not None: daterange = six.StringIO() if start_time is not None: @@ -66,33 +75,32 @@ def iterate(self, start_time=None, end_time=None, status=None, only_owner=None, daterange.write(str(utils.to_timestamp(start_time))) else: daterange.write(str(int(start_time))) - daterange.write(':') + daterange.write(":") if end_time is not None: if isinstance(end_time, datetime): daterange.write(str(utils.to_timestamp(end_time))) else: daterange.write(str(int(end_time))) - params['daterange'] = daterange.getvalue() + params["daterange"] = daterange.getvalue() if only_owner is not None: - params['onlyowner'] = 'yes' if only_owner else 'no' + params["onlyowner"] = "yes" if only_owner else "no" if max_items is not None: - params['maxitems'] = max_items + params["maxitems"] = max_items if job_name is not None: - params['jobname'] = job_name + params["jobname"] = job_name if quota_index is not None: - params['quotaIndex'] = quota_index + params["quotaIndex"] = quota_index def _it(): - last_marker = params.get('marker') - if 'marker' in params and \ - (last_marker is None or len(last_marker) == 0): + last_marker = params.get("marker") + if "marker" in params and (last_marker is None or len(last_marker) == 0): return url = self.resource() resp = self._client.get(url, params=params) inst = Instances.parse(self._client, resp, obj=self) - params['marker'] = inst.marker + params["marker"] = inst.marker return inst.instances @@ -104,18 +112,22 @@ def _it(): yield instance @classmethod - def _create_job(cls, job=None, task=None, priority=None, running_cluster=None, uuid_=None): + def _create_job( + cls, job=None, task=None, priority=None, running_cluster=None, uuid_=None + ): job = job or Job() if priority is not None: if priority < 0: - raise errors.ODPSClientError('Priority must more than or equal to zero.') + raise errors.ODPSClientError( + "Priority must more than or equal to zero." 
+ ) job.priority = priority if running_cluster is not None: job.running_cluster = running_cluster if task is not None: job.add_task(task) if job.tasks is None or len(job.tasks) == 0: - raise ValueError('Job tasks are required') + raise ValueError("Job tasks are required") guid = uuid_ or str(uuid.uuid4()) for t in job.tasks: @@ -127,21 +139,31 @@ def _fill_task_properties(cls, task, uuid_=None): if task.properties is None: task.properties = OrderedDict() if options.biz_id: - task.properties['biz_id'] = str(options.biz_id) + task.properties["biz_id"] = str(options.biz_id) guid = uuid_ or str(uuid.uuid4()) - if task.properties.get('uuid') is None: - task.set_property('uuid', guid) + if task.properties.get("uuid") is None: + task.set_property("uuid", guid) if task.name is None: - raise errors.ODPSClientError('Task name is required') + raise errors.ODPSClientError("Task name is required") @classmethod def _get_submit_instance_content(cls, job): return Instance.AnonymousSubmitInstance(job=job).serialize() - def create(self, xml=None, job=None, task=None, priority=None, running_cluster=None, - headers=None, create_callback=None, encoding=None, session_project=None, - session_name=None): + def create( + self, + xml=None, + job=None, + task=None, + priority=None, + running_cluster=None, + headers=None, + create_callback=None, + encoding=None, + session_project=None, + session_name=None, + ): if xml is None: job = self._create_job( job=job, task=task, priority=priority, running_cluster=running_cluster @@ -150,19 +172,19 @@ def create(self, xml=None, job=None, task=None, priority=None, running_cluster=N xml = self._get_submit_instance_content(job) headers = headers or dict() - headers['Content-Type'] = 'application/xml' + headers["Content-Type"] = "application/xml" url = self.resource() resp = self._client.post(url, xml, headers=headers) - location = resp.headers.get('Location') + location = resp.headers.get("Location") if location is None or len(location) == 0: raise errors.ODPSClientError( - 'Invalid response, Location header required. As it is a rare ' - 'condition, please check your network policies first.', - request_id=resp.headers.get('x-odps-request-id'), + "Invalid response, Location header required. 
As it is a rare " + "condition, please check your network policies first.", + request_id=resp.headers.get("x-odps-request-id"), ) - instance_id = location.rsplit('/', 1)[1] + instance_id = location.rsplit("/", 1)[1] create_callback = create_callback or options.instance_create_callback if create_callback is not None: @@ -178,60 +200,64 @@ def create(self, xml=None, job=None, task=None, priority=None, running_cluster=N results = None if session_project: - instance = SessionInstance(session_project=session_project, - session_task_name=task.name, - session_name=session_name, - name=instance_id, task_results=results, - parent=self, client=self._client) + instance = SessionInstance( + session_project=session_project, + session_task_name=task.name, + session_name=session_name, + name=instance_id, + task_results=results, + parent=self, + client=self._client, + ) else: - instance = Instance(name=instance_id, task_results=results, - parent=self, client=self._client) + instance = Instance( + name=instance_id, task_results=results, parent=self, client=self._client + ) return instance class Instances(BaseInstances): - - marker = serializers.XMLNodeField('Marker') - max_items = serializers.XMLNodeField('MaxItems') - instances = serializers.XMLNodesReferencesField(Instance, 'Instance') + marker = serializers.XMLNodeField("Marker") + max_items = serializers.XMLNodeField("MaxItems") + instances = serializers.XMLNodesReferencesField(Instance, "Instance") class CachedInstances(BaseInstances): - class _CachedInstances(serializers.JSONSerializableModel): - instance_queueing_infos = \ - serializers.JSONNodesReferencesField(Instance.InstanceQueueingInfo) + instance_queueing_infos = serializers.JSONNodesReferencesField( + Instance.InstanceQueueingInfo + ) - marker = serializers.XMLNodeField('Marker') - max_items = serializers.XMLNodeField('MaxItems') - instances = serializers.XMLNodeReferenceField(_CachedInstances, 'Content') + marker = serializers.XMLNodeField("Marker") + max_items = serializers.XMLNodeField("MaxItems") + instances = serializers.XMLNodeReferenceField(_CachedInstances, "Content") - def iterate(self, status=None, only_owner=None, - max_items=None, quota_index=None, **kw): + def iterate( + self, status=None, only_owner=None, max_items=None, quota_index=None, **kw + ): if isinstance(status, six.string_types): status = Instance.Status(status.capitalize()) params = dict() if status is not None: - params['status'] = status.value + params["status"] = status.value if only_owner is not None: - params['onlyowner'] = 'yes' if only_owner else 'no' + params["onlyowner"] = "yes" if only_owner else "no" if max_items is not None: - params['maxitems'] = max_items + params["maxitems"] = max_items if quota_index is not None: - params['quotaIndex'] = quota_index + params["quotaIndex"] = quota_index def _it(): - last_marker = params.get('marker') - if 'marker' in params and \ - (last_marker is None or len(last_marker) == 0): + last_marker = params.get("marker") + if "marker" in params and (last_marker is None or len(last_marker) == 0): return url = self.resource() resp = self._client.get(url, params=params) inst = CachedInstances.parse(self._client, resp, obj=self) - params['marker'] = inst.marker + params["marker"] = inst.marker return inst.instances.instance_queueing_infos diff --git a/odps/models/job.py b/odps/models/job.py index b54fc1c9..44131637 100644 --- a/odps/models/job.py +++ b/odps/models/job.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. 
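# Usage sketch for the Instance.to_pandas / Instance.iter_pandas helpers added
# in the instance.py hunk above. Assumes `o` is an initialized odps.ODPS entry
# object; the SQL statement is purely illustrative and `handle` is a
# placeholder for user code.
inst = o.execute_sql("SELECT * FROM my_table")

# Fetch the whole result set as a single DataFrame; the limit argument follows
# open_reader semantics.
df = inst.to_pandas(limit=True)

# Or stream the result in DataFrame batches to bound memory usage.
for batch in inst.iter_pandas(batch_size=10000):
    handle(batch)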
+# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,29 +14,32 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .core import XMLRemoteModel from .. import serializers, utils +from .core import XMLRemoteModel from .tasks import Task class Job(XMLRemoteModel): # define __slots__ to keep the property sequence when serializing into a xml - __slots__ = 'name', 'comment', 'priority', 'running_cluster', 'tasks', 'run_mode' + __slots__ = "name", "comment", "priority", "running_cluster", "tasks", "run_mode" - _root = 'Job' + _root = "Job" - name = serializers.XMLNodeField('Name') + name = serializers.XMLNodeField("Name") comment = serializers.XMLNodeField("Comment") - owner = serializers.XMLNodeField('Owner') - creation_time = serializers.XMLNodeField('CreationTime', - parse_callback=utils.parse_rfc822) - last_modified_time = serializers.XMLNodeField('LastModifiedTime', - parse_callback=utils.parse_rfc822) - priority = serializers.XMLNodeField('Priority', - parse_callback=int, serialize_callback=int, default=9) - running_cluster = serializers.XMLNodeField('RunningCluster') - run_mode = serializers.XMLNodeField('DAG', 'RunMode', default='Sequence') - tasks = serializers.XMLNodesReferencesField(Task, 'Tasks', '*') + owner = serializers.XMLNodeField("Owner") + creation_time = serializers.XMLNodeField( + "CreationTime", parse_callback=utils.parse_rfc822 + ) + last_modified_time = serializers.XMLNodeField( + "LastModifiedTime", parse_callback=utils.parse_rfc822 + ) + priority = serializers.XMLNodeField( + "Priority", parse_callback=int, serialize_callback=int, default=9 + ) + running_cluster = serializers.XMLNodeField("RunningCluster") + run_mode = serializers.XMLNodeField("DAG", "RunMode", default="Sequence") + tasks = serializers.XMLNodesReferencesField(Task, "Tasks", "*") def add_task(self, task): if self.tasks is None: diff --git a/odps/models/ml/__init__.py b/odps/models/ml/__init__.py index 83cf20e1..e345abed 100644 --- a/odps/models/ml/__init__.py +++ b/odps/models/ml/__init__.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,5 +14,5 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .offlinemodels import OfflineModels from .offlinemodel import OfflineModel +from .offlinemodels import OfflineModels diff --git a/odps/models/ml/offlinemodel.py b/odps/models/ml/offlinemodel.py index f2c78666..77200afc 100644 --- a/odps/models/ml/offlinemodel.py +++ b/odps/models/ml/offlinemodel.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,25 +14,25 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ..core import LazyLoad from ... 
import serializers, utils from ...compat import urlparse +from ..core import LazyLoad class OfflineModelInfo(serializers.XMLSerializableModel): - _root = 'Offlinemodel' + _root = "Offlinemodel" - name = serializers.XMLNodeField('Name', default='') - model_path = serializers.XMLNodeField('ModelPath', default='') - role_arn = serializers.XMLNodeField('Rolearn') - type = serializers.XMLNodeField('Type') - version = serializers.XMLNodeField('Version') - processor = serializers.XMLNodeField('Processor') - configuration = serializers.XMLNodeField('Configuration') - src_project = serializers.XMLNodeField('SrcProject') - src_model = serializers.XMLNodeField('SrcModel') - dest_project = serializers.XMLNodeField('DestProject') - dest_model = serializers.XMLNodeField('DestModel') + name = serializers.XMLNodeField("Name", default="") + model_path = serializers.XMLNodeField("ModelPath", default="") + role_arn = serializers.XMLNodeField("Rolearn") + type = serializers.XMLNodeField("Type") + version = serializers.XMLNodeField("Version") + processor = serializers.XMLNodeField("Processor") + configuration = serializers.XMLNodeField("Configuration") + src_project = serializers.XMLNodeField("SrcProject") + src_model = serializers.XMLNodeField("SrcModel") + dest_project = serializers.XMLNodeField("DestProject") + dest_model = serializers.XMLNodeField("DestModel") class OfflineModel(LazyLoad): @@ -40,12 +40,14 @@ class OfflineModel(LazyLoad): Representing an ODPS offline model. """ - name = serializers.XMLNodeField('Name') - owner = serializers.XMLNodeField('Owner') + name = serializers.XMLNodeField("Name") + owner = serializers.XMLNodeField("Owner") creation_time = serializers.XMLNodeField( - 'CreationTime', parse_callback=utils.parse_rfc822) + "CreationTime", parse_callback=utils.parse_rfc822 + ) last_modified_time = serializers.XMLNodeField( - 'LastModifiedTime', parse_callback=utils.parse_rfc822) + "LastModifiedTime", parse_callback=utils.parse_rfc822 + ) def reload(self): resp = self._client.get(self.resource()) @@ -57,7 +59,7 @@ def get_model(self): via this method might be incomplete due to size limitations. """ url = self.resource() - resp = self._client.get(url, action='data') + resp = self._client.get(url, action="data") return resp.text @@ -73,13 +75,17 @@ def copy(self, new_name, new_project=None, async_=False): url = self.parent.resource() new_project = new_project or self.project.name - info = OfflineModelInfo(src_model=self.name, src_project=self.project.name, - dest_model=new_name, dest_project=new_project) - headers = {'Content-Type': 'application/xml'} + info = OfflineModelInfo( + src_model=self.name, + src_project=self.project.name, + dest_model=new_name, + dest_project=new_project, + ) + headers = {"Content-Type": "application/xml"} resp = self._client.post(url, info.serialize(), headers=headers) - inst_url = resp.headers['Location'].rstrip('/') - inst_id = urlparse(inst_url).path.rsplit('/', 1)[-1] + inst_url = resp.headers["Location"].rstrip("/") + inst_id = urlparse(inst_url).path.rsplit("/", 1)[-1] inst = self.project.instances[inst_id] if not async_: diff --git a/odps/models/ml/offlinemodels.py b/odps/models/ml/offlinemodels.py index 66f1d3af..bbe13fcf 100644 --- a/odps/models/ml/offlinemodels.py +++ b/odps/models/ml/offlinemodels.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. 
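# Usage sketch for OfflineModel.copy as reformatted above. Assumes `model` is
# an OfflineModel object already fetched from its project; the target names are
# illustrative. The call submits a copy task as an ODPS instance and, unless
# async_=True is passed, waits for that instance to finish.
model.copy("my_model_copy", new_project="target_project")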
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,17 +14,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ..core import Iterable -from ... import serializers, errors +from ... import errors, serializers from ...compat import six +from ..core import Iterable from .offlinemodel import OfflineModel class OfflineModels(Iterable): - - marker = serializers.XMLNodeField('Marker') - max_items = serializers.XMLNodeField('MaxItems', parse_callback=int) - offline_models = serializers.XMLNodesReferencesField(OfflineModel, 'OfflineModel') + marker = serializers.XMLNodeField("Marker") + max_items = serializers.XMLNodeField("MaxItems", parse_callback=int) + offline_models = serializers.XMLNodesReferencesField(OfflineModel, "OfflineModel") @property def project(self): @@ -57,23 +56,22 @@ def iterate(self, name=None, owner=None): :param owner: :return: """ - params = {'expectmarker': 'true'} + params = {"expectmarker": "true"} if name is not None: - params['name'] = name + params["name"] = name if owner is not None: - params['owner'] = owner + params["owner"] = owner def _it(): - last_marker = params.get('marker') - if 'marker' in params and \ - (last_marker is None or len(last_marker) == 0): + last_marker = params.get("marker") + if "marker" in params and (last_marker is None or len(last_marker) == 0): return url = self.resource() resp = self._client.get(url, params=params) t = OfflineModels.parse(self._client, resp, obj=self) - params['marker'] = t.marker + params["marker"] = t.marker return t.offline_models @@ -95,4 +93,4 @@ def delete(self, name): url = offline_model.resource() - self._client.delete(url) \ No newline at end of file + self._client.delete(url) diff --git a/odps/models/partition.py b/odps/models/partition.py index 12a23dec..97c3daec 100644 --- a/odps/models/partition.py +++ b/odps/models/partition.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,7 +18,7 @@ from datetime import datetime from .. import serializers, types, utils -from .core import LazyLoad, XMLRemoteModel, JSONRemoteModel +from .core import JSONRemoteModel, LazyLoad, XMLRemoteModel from .storage_tier import StorageTierInfo @@ -32,45 +32,67 @@ class Partition(LazyLoad): to provide the ability to read records from a partition. The behavior of these methods are the same as those in Table class except that there are no 'partition' params. 
""" - __slots__ = 'spec', 'creation_time', 'last_meta_modified_time', \ - 'last_data_modified_time', 'size', '_is_extend_info_loaded', \ - 'is_archived', 'is_exstore', 'lifecycle', 'physical_size', \ - 'file_num', 'reserved' - class Column(XMLRemoteModel): + __slots__ = ( + "spec", + "creation_time", + "last_meta_modified_time", + "last_data_modified_time", + "size", + "_is_extend_info_loaded", + "is_archived", + "is_exstore", + "lifecycle", + "physical_size", + "file_num", + "reserved", + ) - name = serializers.XMLNodeAttributeField(attr='Name') - value = serializers.XMLNodeAttributeField(attr='Value') + class Column(XMLRemoteModel): + name = serializers.XMLNodeAttributeField(attr="Name") + value = serializers.XMLNodeAttributeField(attr="Value") class PartitionMeta(JSONRemoteModel): - creation_time = serializers.JSONNodeField( - 'createTime', parse_callback=datetime.fromtimestamp, set_to_parent=True) + "createTime", parse_callback=datetime.fromtimestamp, set_to_parent=True + ) last_meta_modified_time = serializers.JSONNodeField( - 'lastDDLTime', parse_callback=datetime.fromtimestamp, set_to_parent=True) + "lastDDLTime", parse_callback=datetime.fromtimestamp, set_to_parent=True + ) last_data_modified_time = serializers.JSONNodeField( - 'lastModifiedTime', parse_callback=datetime.fromtimestamp, set_to_parent=True) + "lastModifiedTime", + parse_callback=datetime.fromtimestamp, + set_to_parent=True, + ) size = serializers.JSONNodeField( - 'partitionSize', parse_callback=int, set_to_parent=True) + "partitionSize", parse_callback=int, set_to_parent=True + ) class PartitionExtendedMeta(PartitionMeta): - is_archived = serializers.JSONNodeField( - 'IsArchived', parse_callback=bool, set_to_parent=True) + "IsArchived", parse_callback=bool, set_to_parent=True + ) is_exstore = serializers.JSONNodeField( - 'IsExstore', parse_callback=bool, set_to_parent=True) + "IsExstore", parse_callback=bool, set_to_parent=True + ) lifecycle = serializers.JSONNodeField( - 'LifeCycle', parse_callback=int, set_to_parent=True) + "LifeCycle", parse_callback=int, set_to_parent=True + ) physical_size = serializers.JSONNodeField( - 'PhysicalSize', parse_callback=int, set_to_parent=True) + "PhysicalSize", parse_callback=int, set_to_parent=True + ) file_num = serializers.JSONNodeField( - 'FileNum', parse_callback=int, set_to_parent=True) + "FileNum", parse_callback=int, set_to_parent=True + ) reserved = serializers.JSONNodeField( - 'Reserved', type='json', set_to_parent=True) + "Reserved", type="json", set_to_parent=True + ) - columns = serializers.XMLNodesReferencesField(Column, 'Column') - _schema = serializers.XMLNodeReferenceField(PartitionMeta, 'Schema') - _extended_schema = serializers.XMLNodeReferenceField(PartitionExtendedMeta, 'Schema') + columns = serializers.XMLNodesReferencesField(Column, "Column") + _schema = serializers.XMLNodeReferenceField(PartitionMeta, "Schema") + _extended_schema = serializers.XMLNodeReferenceField( + PartitionExtendedMeta, "Schema" + ) def __init__(self, **kwargs): self._is_extend_info_loaded = False @@ -81,12 +103,21 @@ def __str__(self): return str(self.partition_spec) def __repr__(self): - return '' % ( - str(self.table.project.name), str(self.table.name), str(self.partition_spec)) + return "" % ( + str(self.table.project.name), + str(self.table.name), + str(self.partition_spec), + ) def __getattribute__(self, attr): - if attr in ('is_archived', 'is_exstore', 'lifecycle', - 'physical_size', 'file_num', 'reserved'): + if attr in ( + "is_archived", + "is_exstore", + "lifecycle", + 
"physical_size", + "file_num", + "reserved", + ): if not self._is_extend_info_loaded: self.reload_extend_info() @@ -94,7 +125,7 @@ def __getattribute__(self, attr): val = object.__getattribute__(self, attr) if val is None and not self._loaded: - if attr in getattr(Partition.PartitionMeta, '__fields'): + if attr in getattr(Partition.PartitionMeta, "__fields"): self.reload() return object.__getattribute__(self, attr) @@ -125,14 +156,14 @@ def last_modified_time(self): DeprecationWarning, stacklevel=3, ) - utils.add_survey_call(".".join( - [type(self).__module__, type(self).__name__, "last_modified_time"] - )) + utils.add_survey_call( + ".".join([type(self).__module__, type(self).__name__, "last_modified_time"]) + ) return self.last_data_modified_time @property def partition_spec(self): - return self.get_partition_spec(self._getattr('columns'), self._getattr('spec')) + return self.get_partition_spec(self._getattr("columns"), self._getattr("spec")) @property def name(self): @@ -152,7 +183,7 @@ def storage_tier_info(self): def reload(self): url = self.resource() - params = {'partition': str(self.partition_spec)} + params = {"partition": str(self.partition_spec)} resp = self._client.get(url, params=params, curr_schema=self._get_schema_name()) self.parse(self._client, resp, obj=self) @@ -161,9 +192,9 @@ def reload(self): def reload_extend_info(self): url = self.resource() - params = {'partition': str(self.partition_spec)} + params = {"partition": str(self.partition_spec)} resp = self._client.get( - url, action='extended', params=params, curr_schema=self._get_schema_name() + url, action="extended", params=params, curr_schema=self._get_schema_name() ) self.parse(self._client, resp, obj=self) @@ -230,6 +261,77 @@ def open_reader(self, **kw): def open_writer(self, blocks=None, **kw): return self.table.open_writer(self.partition_spec, blocks=blocks, **kw) + def to_pandas( + self, + columns=None, + start=None, + count=None, + n_process=1, + quota_name=None, + append_partitions=None, + tags=None, + **kwargs + ): + """ + Read partition data into pandas DataFrame + + :param list columns: columns to read + :param int start: start row index from 0 + :param int count: data count to read + :param int n_process: number of processes to accelerate reading + :param str quota_name: name of tunnel quota to use + :param bool append_partitions: if True, partition values will be + appended to the output + """ + return self.table.to_pandas( + partition=self.partition_spec, + columns=columns, + arrow=True, + quota_name=quota_name, + tags=tags, + n_process=n_process, + start=start, + count=count, + append_partitions=append_partitions, + **kwargs + ) + + def iter_pandas( + self, + columns=None, + batch_size=None, + start=None, + count=None, + quota_name=None, + append_partitions=None, + tags=None, + **kwargs + ): + """ + Read partition data into pandas DataFrame + + :param list columns: columns to read + :param int batch_size: size of DataFrame batch to read + :param int start: start row index from 0 + :param int count: data count to read + :param str quota_name: name of tunnel quota to use + :param bool append_partitions: if True, partition values will be + appended to the output + """ + for batch in self.table.iter_pandas( + partition=self.partition_spec, + columns=columns, + batch_size=batch_size, + arrow=True, + quota_name=quota_name, + tags=tags, + append_partitions=append_partitions, + start=start, + count=count, + **kwargs + ): + yield batch + @utils.with_wait_argument def truncate(self, async_=False): return 
self.table.truncate(self.partition_spec, async_=async_) diff --git a/odps/models/partitions.py b/odps/models/partitions.py index fafa3508..95f489b8 100644 --- a/odps/models/partitions.py +++ b/odps/models/partitions.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,9 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from collections import defaultdict, OrderedDict +from collections import OrderedDict, defaultdict -from .. import serializers, errors, types +from .. import errors, serializers, types from ..compat import six from ..utils import with_wait_argument from .core import Iterable @@ -24,16 +24,18 @@ class PartitionSpecCondition(object): - _predicates = OrderedDict([ - ("==", lambda a, b: a == b), - (">=", lambda a, b: a >= b), - ("<=", lambda a, b: a <= b), - ("<>", lambda a, b: a != b), - ("!=", lambda a, b: a != b), - (">", lambda a, b: a > b), - ("<", lambda a, b: a < b), - ("=", lambda a, b: a == b), - ]) + _predicates = OrderedDict( + [ + ("==", lambda a, b: a == b), + (">=", lambda a, b: a >= b), + ("<=", lambda a, b: a <= b), + ("<>", lambda a, b: a != b), + ("!=", lambda a, b: a != b), + (">", lambda a, b: a > b), + ("<", lambda a, b: a < b), + ("=", lambda a, b: a == b), + ] + ) def __init__(self, part_fields, condition=None): self._part_to_conditions = defaultdict(list) @@ -49,7 +51,7 @@ def __init__(self, part_fields, condition=None): if len(parts) != 2: raise ValueError("Invalid partition condition %r" % split) part = parts[0].strip() - val = parts[1].strip().replace('"', '').replace("'", '') + val = parts[1].strip().replace('"', "").replace("'", "") if part not in field_set: raise ValueError("Invalid partition field %r" % part) @@ -79,9 +81,9 @@ def match(self, spec): class Partitions(Iterable): - marker = serializers.XMLNodeField('Marker') - max_items = serializers.XMLNodeField('MaxItems') - partitions = serializers.XMLNodesReferencesField(Partition, 'Partition') + marker = serializers.XMLNodeField("Marker") + max_items = serializers.XMLNodeField("MaxItems") + partitions = serializers.XMLNodesReferencesField(Partition, "Partition") def _name(self): return @@ -132,27 +134,26 @@ def iterate_partitions(self, spec=None, reverse=False): ) spec = condition.partition_spec - actions = ['partitions'] - params = {'expectmarker': 'true'} + actions = ["partitions"] + params = {"expectmarker": "true"} if reverse: - actions.append('reverse') + actions.append("reverse") if spec is not None and not spec.is_empty: - params['partition'] = str(spec) + params["partition"] = str(spec) schema_name = self._get_schema_name() if schema_name: - params['curr_schema'] = schema_name + params["curr_schema"] = schema_name def _it(): - last_marker = params.get('marker') - if 'marker' in params and \ - (last_marker is None or len(last_marker) == 0): + last_marker = params.get("marker") + if "marker" in params and (last_marker is None or len(last_marker) == 0): return url = self.resource() resp = self._client.get(url, actions=actions, params=params) t = self.parse(self._client, resp, obj=self) - params['marker'] = t.marker + params["marker"] = t.marker return t.partitions @@ -176,7 +177,7 @@ def get_max_partition(self, spec=None, skip_empty=True, reverse=False): for exist_pt, user_pt_name 
in zip(table_parts, spec.kv): if exist_pt.name != user_pt_name: - table_pt_str = ",".join(pt.name for pt in table_parts[:len(spec)]) + table_pt_str = ",".join(pt.name for pt in table_parts[: len(spec)]) prefix_pt_str = ",".join(spec.kv.keys()) raise ValueError( "Partition prefix %s not agree with table partitions %s", @@ -193,7 +194,9 @@ def get_max_partition(self, spec=None, skip_empty=True, reverse=False): elif not skip_empty: return max(part_values, key=lambda tp: tp[1])[0] else: - reversed_table_parts = sorted(part_values, key=lambda tp: tp[1], reverse=not reverse) + reversed_table_parts = sorted( + part_values, key=lambda tp: tp[1], reverse=not reverse + ) return next( ( part @@ -211,20 +214,23 @@ def create(self, partition_spec, if_not_exists=False, async_=False, hints=None): partition_spec = self._get_partition_spec(partition_spec) buf = six.StringIO() - buf.write('ALTER TABLE %s ADD ' % self.parent.full_table_name) + buf.write("ALTER TABLE %s ADD " % self.parent.full_table_name) if if_not_exists: - buf.write('IF NOT EXISTS ') + buf.write("IF NOT EXISTS ") - buf.write('PARTITION (%s);' % partition_spec) + buf.write("PARTITION (%s);" % partition_spec) from .tasks import SQLTask - task = SQLTask(name='SQLAddPartitionTask', query=buf.getvalue()) + + task = SQLTask(name="SQLAddPartitionTask", query=buf.getvalue()) hints = hints or {} schema_name = self._get_schema_name() if schema_name is not None: hints["odps.sql.allow.namespace.schema"] = "true" hints["odps.namespace.schema"] = "true" + if self.project.odps.quota_name: + hints["odps.task.wlm.quota"] = self.project.odps.quota_name task.update_sql_settings(hints) instance = self.project.parent[self._client.project].instances.create(task=task) @@ -242,22 +248,25 @@ def delete(self, partition_spec, if_exists=False, async_=False, hints=None): partition_spec = self._get_partition_spec(partition_spec) buf = six.StringIO() - buf.write('ALTER TABLE %s DROP ' % self.parent.full_table_name) + buf.write("ALTER TABLE %s DROP " % self.parent.full_table_name) if if_exists: - buf.write('IF EXISTS ') + buf.write("IF EXISTS ") - buf.write('PARTITION (%s);' % partition_spec) + buf.write("PARTITION (%s);" % partition_spec) from .tasks import SQLTask - task = SQLTask(name='SQLDropPartitionTask', query=buf.getvalue()) + + task = SQLTask(name="SQLDropPartitionTask", query=buf.getvalue()) hints = hints or {} - hints['odps.sql.submit.mode'] = '' + hints["odps.sql.submit.mode"] = "" schema_name = self._get_schema_name() if schema_name is not None: hints["odps.sql.allow.namespace.schema"] = "true" hints["odps.namespace.schema"] = "true" + if self.project.odps.quota_name: + hints["odps.task.wlm.quota"] = self.project.odps.quota_name task.update_sql_settings(hints) instance = self.project.parent[self._client.project].instances.create(task=task) diff --git a/odps/models/project.py b/odps/models/project.py index 6e3679ec..6889d319 100644 --- a/odps/models/project.py +++ b/odps/models/project.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -20,20 +20,20 @@ import weakref from .. 
import serializers, utils -from ..compat import six, Enum +from ..compat import Enum, six from ..errors import SecurityQueryError from .core import LazyLoad, XMLRemoteModel from .functions import Functions -from .instances import Instances, CachedInstances +from .instances import CachedInstances, Instances from .ml import OfflineModels from .resources import Resources from .schemas import Schemas +from .security.roles import Roles +from .security.users import User, Users +from .storage_tier import StorageTierInfo from .tables import Tables from .volumes import Volumes from .xflows import XFlows -from .security.users import Users, User -from .security.roles import Roles -from .storage_tier import StorageTierInfo class Project(LazyLoad): @@ -62,6 +62,7 @@ class Project(LazyLoad): "_policy_cache", "_logview_host", "_default_schema", + "_quota_name", "_tunnel_endpoint", "_all_props_loaded", "_extended_props_loaded", @@ -69,31 +70,34 @@ class Project(LazyLoad): ) class Cluster(XMLRemoteModel): - - name = serializers.XMLNodeField('Name') - quota_id = serializers.XMLNodeField('QuotaID') + name = serializers.XMLNodeField("Name") + quota_id = serializers.XMLNodeField("QuotaID") @classmethod def deserial(cls, content, obj=None, **kw): ret = super(Project.Cluster, cls).deserial(content, obj=obj, **kw) - if not getattr(ret, 'name', None) or not getattr(ret, 'quota_id', None): - raise ValueError('Missing arguments: name or quotaID') + if not getattr(ret, "name", None) or not getattr(ret, "quota_id", None): + raise ValueError("Missing arguments: name or quotaID") return ret class ExtendedProperties(XMLRemoteModel): _extended_properties = serializers.XMLNodePropertiesField( - 'ExtendedProperties', 'Property', key_tag='Name', value_tag='Value', set_to_parent=True + "ExtendedProperties", + "Property", + key_tag="Name", + value_tag="Value", + set_to_parent=True, ) class AuthQueryRequest(serializers.XMLSerializableModel): - _root = 'Authorization' - query = serializers.XMLNodeField('Query') - use_json = serializers.XMLNodeField('ResponseInJsonFormat', type='bool') - settings = serializers.XMLNodeField('Settings') + _root = "Authorization" + query = serializers.XMLNodeField("Query") + use_json = serializers.XMLNodeField("ResponseInJsonFormat", type="bool") + settings = serializers.XMLNodeField("Settings") class AuthQueryResponse(serializers.XMLSerializableModel): - _root = 'Authorization' - result = serializers.XMLNodeField('Result') + _root = "Authorization" + result = serializers.XMLNodeField("Result") class AuthQueryStatus(Enum): TERMINATED = "TERMINATED" @@ -127,9 +131,9 @@ def from_str(cls, s): class AuthQueryStatusResponse(serializers.XMLSerializableModel): _root = "AuthorizationQuery" - result = serializers.XMLNodeField('Result') + result = serializers.XMLNodeField("Result") status = serializers.XMLNodeField( - 'Status', parse_callback=lambda s: Project.AuthQueryStatus(s.upper()) + "Status", parse_callback=lambda s: Project.AuthQueryStatus(s.upper()) ) class AuthQueryInstance(object): @@ -165,30 +169,30 @@ def is_successful(self): status = self.query_status() return status.status == Project.AuthQueryStatus.TERMINATED - name = serializers.XMLNodeField('Name') - type = serializers.XMLNodeField('Type', parse_callback=ProjectType.from_str) - comment = serializers.XMLNodeField('Comment') - owner = serializers.XMLNodeField('Owner') - super_administrator = serializers.XMLNodeField('SuperAdministrator') + name = serializers.XMLNodeField("Name") + type = serializers.XMLNodeField("Type", 
parse_callback=ProjectType.from_str) + comment = serializers.XMLNodeField("Comment") + owner = serializers.XMLNodeField("Owner") + super_administrator = serializers.XMLNodeField("SuperAdministrator") creation_time = serializers.XMLNodeField( - 'CreationTime', parse_callback=utils.parse_rfc822 + "CreationTime", parse_callback=utils.parse_rfc822 ) last_modified_time = serializers.XMLNodeField( - 'LastModifiedTime', parse_callback=utils.parse_rfc822 + "LastModifiedTime", parse_callback=utils.parse_rfc822 ) - project_group_name = serializers.XMLNodeField('ProjectGroupName') + project_group_name = serializers.XMLNodeField("ProjectGroupName") properties = serializers.XMLNodePropertiesField( - 'Properties', 'Property', key_tag='Name', value_tag='Value' + "Properties", "Property", key_tag="Name", value_tag="Value" ) _extended_properties = serializers.XMLNodePropertiesField( - 'ExtendedProperties', 'Property', key_tag='Name', value_tag='Value' + "ExtendedProperties", "Property", key_tag="Name", value_tag="Value" ) - _state = serializers.XMLNodeField('State') - clusters = serializers.XMLNodesReferencesField(Cluster, 'Clusters', 'Cluster') - region_id = serializers.XMLNodeField('Region') - tenant_id = serializers.XMLNodeField('TenantId') - default_quota_nickname = serializers.XMLNodeField('DefaultQuotaNickname') - default_quota_region = serializers.XMLNodeField('DefaultQuotaRegion') + _state = serializers.XMLNodeField("State") + clusters = serializers.XMLNodesReferencesField(Cluster, "Clusters", "Cluster") + region_id = serializers.XMLNodeField("Region") + tenant_id = serializers.XMLNodeField("TenantId") + default_quota_nickname = serializers.XMLNodeField("DefaultQuotaNickname") + default_quota_region = serializers.XMLNodeField("DefaultQuotaRegion") def __init__(self, *args, **kwargs): self._tunnel_endpoint = None @@ -208,9 +212,9 @@ def reload(self, all_props=False): self.parse(self._client, resp, obj=self) - self.owner = resp.headers['x-odps-owner'] - self.creation_time = utils.parse_rfc822(resp.headers['x-odps-creation-time']) - self.last_modified_time = utils.parse_rfc822(resp.headers['Last-Modified']) + self.owner = resp.headers["x-odps-owner"] + self.creation_time = utils.parse_rfc822(resp.headers["x-odps-creation-time"]) + self.last_modified_time = utils.parse_rfc822(resp.headers["Last-Modified"]) self._loaded = True self._all_props_loaded = all_props @@ -221,7 +225,7 @@ def extended_properties(self): return self._getattr("_extended_properties") url = self.resource() - resp = self._client.get(url, action='extended') + resp = self._client.get(url, action="extended") Project.ExtendedProperties.parse(self._client, resp, parent=self) self._extended_props_loaded = True @@ -234,11 +238,13 @@ def schemas(self): @property def state(self): warnings.warn( - 'Project.state is deprecated and will be replaced by Project.status.', + "Project.state is deprecated and will be replaced by Project.status.", DeprecationWarning, stacklevel=3, ) - utils.add_survey_call(".".join([type(self).__module__, type(self).__name__, "state"])) + utils.add_survey_call( + ".".join([type(self).__module__, type(self).__name__, "state"]) + ) return self._state @property @@ -303,11 +309,12 @@ def roles(self): @property def security_options(self): from .security import SecurityConfiguration + return SecurityConfiguration(client=self._client, parent=self) @property def system_info(self): - resp = self._client.get(self.resource() + '/system') + resp = self._client.get(self.resource() + "/system") return 
json.loads(resp.content.decode() if six.PY3 else resp.content) @property @@ -337,7 +344,7 @@ def get_odps_ref(obj): schema=client.schema, tunnel_endpoint=self._tunnel_endpoint, logview_host=self._logview_host, - app_account=getattr(client, 'app_account', None), + app_account=getattr(client, "app_account", None), overwrite_global=False, ) self._odps_ref = weakref.ref(odps) @@ -356,7 +363,7 @@ def _set_tunnel_defaults(self): @property def policy(self): if self._getattr("_policy_cache") is None: - params = dict(policy='') + params = dict(policy="") resp = self._client.get(self.resource(), params=params) self._policy_cache = resp.content.decode() if six.PY3 else resp.content if self._getattr("_policy_cache"): @@ -369,34 +376,43 @@ def policy(self, value): if isinstance(value, (dict, list)): value = json.dumps(value) elif value is None: - value = '' + value = "" self._policy_cache = value - params = dict(policy='') + params = dict(policy="") self._client.put(self.resource(), data=value, params=params) @property def current_user(self): user_cache = type(self)._user_cache - user_key = self._client.account.access_id + '##' + self.name + user_key = self._client.account.access_id + "##" + self.name if user_key not in user_cache: - user = self.run_security_query('whoami') - user_cache[user_key] = User(_client=self._client, parent=self.users, - id=user['ID'], display_name=user['DisplayName']) + user = self.run_security_query("whoami") + user_cache[user_key] = User( + _client=self._client, + parent=self.users, + id=user["ID"], + display_name=user["DisplayName"], + ) return user_cache[user_key] def auth_resource(self, client=None): return self.resource(client) + "/authorization" - def run_security_query(self, query, schema=None, token=None, hints=None, output_json=True): + def run_security_query( + self, query, schema=None, token=None, hints=None, output_json=True + ): url = self.auth_resource() - headers = {'Content-Type': 'application/xml'} + headers = {"Content-Type": "application/xml"} if token: - headers['odps-x-supervision-token'] = token + headers["odps-x-supervision-token"] = token if schema is not None or self.odps.is_schema_namespace_enabled(): hints = hints or {} hints["odps.namespace.schema"] = "true" - hints["odps.default.schema"] = hints.get("odps.default.schema") or schema or "default" + hints["odps.sql.allow.namespace.schema"] = "true" + hints["odps.default.schema"] = ( + hints.get("odps.default.schema") or schema or "default" + ) req_obj = self.AuthQueryRequest( query=query, use_json=True, settings=json.dumps(hints) if hints else None @@ -414,16 +430,13 @@ def generate_auth_token(self, policy, type, expire_hours): raise SecurityQueryError("Unsupported token type " + type) url = self.auth_resource() - headers = {'Content-Type': 'application/json'} + headers = {"Content-Type": "application/json"} - policy_dict = { - 'expires_in_hours': expire_hours, - 'policy': policy - } + policy_dict = {"expires_in_hours": expire_hours, "policy": policy} data = json.dumps(policy_dict) query_resp = self._client.post( - url, data, action='sign_bearer_token', headers=headers + url, data, action="sign_bearer_token", headers=headers ) resp = self.AuthQueryResponse.parse(query_resp) return resp.result diff --git a/odps/models/projects.py b/odps/models/projects.py index 4f527886..9068ebba 100644 --- a/odps/models/projects.py +++ b/odps/models/projects.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. 
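# Usage sketch for the Partition.to_pandas / Partition.iter_pandas helpers added
# in the partition.py hunk earlier in this patch. Assumes `o` is an initialized
# odps.ODPS entry object; the table name and partition spec are illustrative.
part = o.get_table("my_table").get_partition("pt=20240101")

# Whole partition as one DataFrame; append_partitions=True keeps the partition
# columns in the output.
df = part.to_pandas(append_partitions=True)

# Or iterate in DataFrame batches.
for batch in part.iter_pandas(batch_size=10000):
    handle(batch)  # `handle` is a placeholder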
+# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,18 +14,18 @@ # See the License for the specific language governing permissions and # limitations under the License. +from .. import errors, serializers +from ..compat import six from .core import Container from .project import Project -from .. import serializers, errors -from ..compat import six class Projects(Container): - __slots__ = "_odps_ref", + __slots__ = ("_odps_ref",) - marker = serializers.XMLNodeField('Marker') - max_items = serializers.XMLNodeField('MaxItems') - projects = serializers.XMLNodesReferencesField(Project, 'Project') + marker = serializers.XMLNodeField("Marker") + max_items = serializers.XMLNodeField("MaxItems") + projects = serializers.XMLNodesReferencesField(Project, "Project") def __init__(self, *args, **kwargs): self._odps_ref = None @@ -52,38 +52,43 @@ def __iter__(self): return self.iterate() def iterate( - self, owner=None, user=None, group=None, max_items=None, name=None, - region_id=None, tenant_id=None, quota_nick_name=None, quota_type=None, + self, + owner=None, + user=None, + group=None, + max_items=None, + name=None, + region_id=None, + tenant_id=None, + quota_nick_name=None, + quota_type=None, quota_name=None, ): params = { - 'expectmarker': 'true', - 'name': name, - 'owner': owner, - 'user': user, - 'group': group, - 'maxitems': max_items, - 'region': region_id, - 'tenant': tenant_id, - 'quotanickname': quota_nick_name, - 'quota_name': quota_name, - 'quota_type': quota_type, + "expectmarker": "true", + "name": name, + "owner": owner, + "user": user, + "group": group, + "maxitems": max_items, + "region": region_id, + "tenant": tenant_id, + "quotanickname": quota_nick_name, + "quota_name": quota_name, + "quota_type": quota_type, } - params = dict( - (k, v) for k, v in params.items() if v is not None - ) + params = dict((k, v) for k, v in params.items() if v is not None) def _it(): - last_marker = params.get('marker') - if 'marker' in params and \ - (last_marker is None or len(last_marker) == 0): + last_marker = params.get("marker") + if "marker" in params and (last_marker is None or len(last_marker) == 0): return url = self.resource() resp = self._client.get(url, params=params) t = Projects.parse(self._client, resp, obj=self) - params['marker'] = t.marker + params["marker"] = t.marker return t.projects diff --git a/odps/models/quota.py b/odps/models/quota.py new file mode 100644 index 00000000..ea6feb78 --- /dev/null +++ b/odps/models/quota.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 1999-2024 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .. import serializers +from ..compat import Enum +from .core import LazyLoad + + +class Quota(LazyLoad): + """ + Quota provides information about computational resources. 
+ """ + + VERSION = "wlm" + + _root = "Quota" + + class Strategy(Enum): + NoPreempt = "NoPreempt" + Preempt = "Preempt" + + class SchedulerType(Enum): + Fifo = "Fifo" + Fair = "Fair" + + class Status(Enum): + ON = "ON" + OFF = "OFF" + INITIALIZING = "INITIALIZING" + ABNORMAL = "ABNORMAL" + + class ResourceSystemType(Enum): + FUXI_OFFLINE = "FUXI_OFFLINE" + FUXI_ONLINE = "FUXI_ONLINE" + FUXI_STANDALONE = "FUXI_STANDALONE" + FUXI_VW = "FUXI_VW" + + class BillingPolicy(serializers.JSONSerializableModel): + class BillingMethod(Enum): + payasyougo = "payasyougo" + subscription = "subscription" + + billing_method = serializers.JSONNodeField( + "billingMethod", parse_callback=serializers.none_or(BillingMethod) + ) + specification = serializers.JSONNodeField("OdpsSpecCode") + order_id = serializers.JSONNodeField("orderId") + + cluster = serializers.XMLNodeField("Cluster") + name = serializers.XMLNodeField("Name") + id = serializers.XMLNodeField("ID") + is_enabled = serializers.XMLNodeField("IsEnabled", type="bool") + resource_system_type = serializers.XMLNodeField( + "ResourceSystemType", parse_callback=serializers.none_or(ResourceSystemType) + ) + session_service_name = serializers.XMLNodeField("SessionServiceName") + creation_time = serializers.XMLNodeField("CreateTimeMs", type="timestamp_ms") + last_modified_time = serializers.XMLNodeField( + "LastModifiedTimeMs", type="timestamp_ms" + ) + + cpu = serializers.XMLNodeField("CPU", type="int") + min_cpu = serializers.XMLNodeField("MinCPU", type="int") + elastic_cpu_max = serializers.XMLNodeField("ElasticCPUMax", type="int") + elastic_cpu_min = serializers.XMLNodeField("ElasticCPUMin", type="int") + adhoc_cpu = serializers.XMLNodeField("AdhocCPU", type="int") + cpu_usage = serializers.XMLNodeField("CPUUsage", type="float") + adhoc_cpu_usage = serializers.XMLNodeField("AdhocCPUUsage", type="float") + cpu_ready_ratio = serializers.XMLNodeField("CPUReadyRatio", type="float") + + memory = serializers.XMLNodeField("Memory", type="int") + min_memory = serializers.XMLNodeField("MinMemory", type="int") + elastic_memory_max = serializers.XMLNodeField("ElasticMemoryMax", type="int") + elastic_memory_min = serializers.XMLNodeField("ElasticMemoryMin", type="int") + adhoc_memory = serializers.XMLNodeField("AdhocMemory", type="int") + memory_usage = serializers.XMLNodeField("MemoryUsage", type="float") + adhoc_memory_usage = serializers.XMLNodeField("AdhocMemoryUsage", type="float") + memory_ready_ratio = serializers.XMLNodeField("MemoryReadyRatio", type="float") + + gpu = serializers.XMLNodeField("GPU", type="int") + min_gpu = serializers.XMLNodeField("MinGPU", type="int") + elastic_gpu_max = serializers.XMLNodeField("ElasticGPUMax", type="int") + elastic_gpu_min = serializers.XMLNodeField("ElasticGPUMin", type="int") + adhoc_gpu = serializers.XMLNodeField("AdhocGPU", type="int") + + strategy = serializers.XMLNodeField( + "Strategy", parse_callback=serializers.none_or(Strategy) + ) + scheduler_type = serializers.XMLNodeField("SchedulerType") + is_parent_group = serializers.XMLNodeField("IsParGroup", type="bool") + parent_id = serializers.XMLNodeField("ParGroupId") + parent_name = serializers.XMLNodeField("ParentName") + user_defined_tags = serializers.XMLNodePropertiesField( + "UserDefinedTag", "entry", key_attr="key", value_tag="value" + ) + virtual_cluster_config = serializers.XMLNodeField( + "VirtualClusterConfig", type="json" + ) + tenant_id = serializers.XMLNodeField("TenantId") + status = serializers.XMLNodeField( + "Status", 
parse_callback=serializers.none_or(Status) + ) + nickname = serializers.XMLNodeField("Nickname") + parent_nickname = serializers.XMLNodeField("ParentNickname") + creator_id = serializers.XMLNodeField("CreatorId") + region_id = serializers.XMLNodeField("Region") + billing_policy = serializers.XMLNodeReferenceField(BillingPolicy, "BillingPolicy") + need_auth = serializers.XMLNodeField("NeedAuth", type="bool") + is_pure_link = serializers.XMLNodeField("IsPureLink", type="bool") + quota_version = serializers.XMLNodeField("QuotaVersion") + is_meta_only = serializers.XMLNodeField("IsMetaOnly", type="bool") + properties = serializers.XMLNodeField("Properties", type="json") + + def _name(self): + return self._getattr("nickname") + + def reload(self): + params = { + "project": self._client.project, + "version": self.VERSION, + } + try: + if self._getattr("region_id"): + params["region"] = self.region_id + except AttributeError: + pass + resp = self._client.get(self.resource(), params=params) + self.parse(self._client, resp, obj=self) + self._loaded = True diff --git a/odps/models/quotas.py b/odps/models/quotas.py new file mode 100644 index 00000000..673ff5de --- /dev/null +++ b/odps/models/quotas.py @@ -0,0 +1,79 @@ +# Copyright 1999-2024 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .. import errors, serializers +from ..compat import six +from .core import Iterable +from .quota import Quota + + +class Quotas(Iterable): + marker = serializers.XMLNodeField("Marker") + max_items = serializers.XMLNodeField("MaxItems") + quotas = serializers.XMLNodesReferencesField(Quota, "Quota") + + def _name(self): + return "quotas" + + def _get(self, nickname): + return Quota(client=self._client, parent=self, nickname=nickname) + + def __contains__(self, item): + if isinstance(item, six.string_types): + quota = self._get(item) + elif isinstance(item, Quota): + quota = item + else: + return False + + try: + quota.reload() + return True + except errors.NoSuchObject: + return False + + def __iter__(self): + return self.iterate() + + def iterate(self, region_id=None, **kw): + params = kw.copy() + params.update( + { + "expectmarker": "true", + "version": Quota.VERSION, + "project": self._client.project, + } + ) + if region_id is not None: + params["region"] = region_id + + def _it(): + last_marker = params.get("marker") + if "marker" in params and (last_marker is None or len(last_marker) == 0): + return + + url = self.resource() + resp = self._client.get(url, params=params) + + f = Quotas.parse(self._client, resp, obj=self) + params["marker"] = f.marker + + return f.quotas + + while True: + quotas = _it() + if quotas is None: + break + for quota in quotas: + yield quota diff --git a/odps/models/readers.py b/odps/models/readers.py index c8116a07..cf3d7ec7 100644 --- a/odps/models/readers.py +++ b/odps/models/readers.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. 
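# Usage sketch for the new Quota / Quotas models introduced above. The Quotas
# container is normally reached from the ODPS entry object `o`; the attribute
# name `quotas` below is an assumption modeled on the other containers in this
# patch, not something this diff shows directly.
for quota in o.quotas.iterate(region_id="cn-hangzhou"):  # region_id is optional
    print(quota.nickname, quota.status, quota.cpu, quota.memory)

# Membership tests resolve a quota by nickname and reload it from the service.
if "my_quota_nickname" in o.quotas:
    ...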
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ from ..compat import six from ..config import options +from ..errors import ODPSClientError from ..lib.tblib import pickling_support from ..readers import AbstractRecordReader from ..utils import call_with_retry @@ -22,19 +23,26 @@ class TunnelReaderMixin(object): - def _to_pandas_with_processes(self, start=None, count=None, columns=None, n_process=1): - import pandas as pd + @property + def count(self): + raise NotImplementedError + + def _to_pandas_with_processes( + self, start=None, count=None, columns=None, append_partitions=None, n_process=1 + ): import multiprocessing from multiprocessing import Pipe + import pandas as pd + session_id = self._download_session.id start = start or 0 count = count or self._download_session.count count = min(count, self._download_session.count - start) try: - _mp_context = multiprocessing.get_context('fork') + _mp_context = multiprocessing.get_context("fork") except ValueError: - _mp_context = multiprocessing.get_context('spawn') + _mp_context = multiprocessing.get_context("spawn") except AttributeError: # for py27 compatibility _mp_context = multiprocessing @@ -46,64 +54,65 @@ def _to_pandas_with_processes(self, start=None, count=None, columns=None, n_proc parent_conn, child_conn = Pipe() p = _mp_context.Process( - target=self._get_process_split_reader(columns=columns), + target=self._get_process_split_reader( + columns=columns, append_partitions=append_partitions + ), args=(child_conn, session_id, start, split_count, i), ) p.start() start += split_count conns.append(parent_conn) - results = [c.recv() for c in conns] + try: + results = [c.recv() for c in conns] + except EOFError: + six.raise_from( + ODPSClientError( + "Read process ended unexpectedly. Try finding errors outputed above." 
+ ), + None, + ) splits = sorted(results, key=lambda x: x[0]) if any(not d[2] for d in splits): exc_info = next(d[1] for d in splits if not d[2]) six.reraise(*exc_info) return pd.concat([d[1] for d in splits]).reset_index(drop=True) - def _get_process_split_reader(self, columns=None): + def _get_process_split_reader(self, columns=None, append_partitions=None): raise NotImplementedError def _open_and_iter_reader( - self, start, record_count, step=None, compress=False, columns=None, counter=None + self, + start, + record_count, + step=None, + compress=False, + columns=None, + append_partitions=None, + counter=None, ): raise NotImplementedError - def _retry_iter_reader( - self, start, record_count, step=None, compress=False, columns=None + def iter_pandas( + self, batch_size=None, start=None, count=None, columns=None, **kwargs ): - end = start + record_count - retry_num = 0 - while start < end: - is_yield_err = False - counter = [0] - try: - for rec in self._open_and_iter_reader( - start, record_count, step, compress=compress, columns=columns, counter=counter - ): - try: - # next record successfully obtained, reset retry counter - retry_num = 0 - yield rec - except BaseException: - # avoid catching errors caused in yield block - is_yield_err = True - raise - break - except: - retry_num += 1 - if is_yield_err or retry_num > options.retry_times: - raise - finally: - start += counter[0] - record_count -= counter[0] + batch_size = batch_size or options.tunnel.read_row_batch_size + start = start or 0 + count = count or self.count + for st in range(start, start + count, batch_size): + cur_batch_size = min(batch_size, count - (st - start)) + yield self.to_pandas( + start=st, count=cur_batch_size, columns=columns, **kwargs + ) class TunnelRecordReader(TunnelReaderMixin, AbstractRecordReader): - def __init__(self, parent, download_session, columns=None): + def __init__(self, parent, download_session, columns=None, append_partitions=None): self._it = iter(self) self._parent = parent self._download_session = download_session self._column_names = columns + self._append_partitions = append_partitions @property def download_id(self): @@ -126,49 +135,108 @@ def __next__(self): next = __next__ - def _iter(self, start=None, end=None, step=None, compress=False, columns=None): + def _iter( + self, + start=None, + end=None, + step=None, + compress=False, + columns=None, + append_partitions=None, + ): count = self._calc_count(start, end, step) return self.read( - start=start, count=count, step=step, compress=compress, columns=columns + start=start, + count=count, + step=step, + compress=compress, + columns=columns, + append_partitions=append_partitions, ) def _open_and_iter_reader( - self, start, record_count, step=None, compress=False, columns=None, counter=None + self, + start, + record_count, + step=None, + compress=False, + columns=None, + append_partitions=None, + counter=None, ): counter = counter or [0] with call_with_retry( self._download_session.open_record_reader, - start, record_count, compress=compress, columns=columns + start, + record_count, + compress=compress, + columns=columns, + append_partitions=append_partitions, ) as reader: for record in reader[::step]: counter[0] += step yield record - def read(self, start=None, count=None, step=None, - compress=False, columns=None): + def read( + self, + start=None, + count=None, + step=None, + compress=False, + append_partitions=None, + columns=None, + ): start = start or 0 step = step or 1 max_rec_count = self.count - start - rec_count = 
min(max_rec_count, count * step) if count is not None else max_rec_count + rec_count = ( + min(max_rec_count, count * step) if count is not None else max_rec_count + ) columns = columns or self._column_names + append_partitions = ( + append_partitions + if append_partitions is not None + else self._append_partitions + ) if rec_count == 0: return - for record in self._retry_iter_reader( - start, rec_count, step=step, compress=compress, columns=columns + for record in self._open_and_iter_reader( + start, + rec_count, + step=step, + compress=compress, + append_partitions=append_partitions, + columns=columns, ): yield record - def to_pandas(self, start=None, count=None, columns=None, n_process=1): + def to_pandas( + self, start=None, count=None, columns=None, append_partitions=None, n_process=1 + ): columns = columns or self._column_names + append_partitions = ( + append_partitions + if append_partitions is not None + else self._append_partitions + ) + if not append_partitions and columns is None: + columns = [c.name for c in self.schema.simple_columns] if n_process == 1 or self._download_session.count == 0: return super(TunnelRecordReader, self).to_pandas( - start=start, count=count, columns=columns + start=start, + count=count, + columns=columns, + append_partitions=append_partitions, ) else: return self._to_pandas_with_processes( - start=start, count=count, columns=columns, n_process=n_process + start=start, + count=count, + columns=columns, + append_partitions=append_partitions, + n_process=n_process, ) def __enter__(self): @@ -179,11 +247,12 @@ def __exit__(self, exc_type, exc_val, exc_tb): class TunnelArrowReader(TunnelReaderMixin): - def __init__(self, parent, download_session, columns=None): + def __init__(self, parent, download_session, columns=None, append_partitions=False): self._it = iter(self) self._parent = parent self._download_session = download_session self._column_names = columns + self._append_partitions = append_partitions @property def download_id(self): @@ -207,12 +276,23 @@ def __next__(self): next = __next__ def _open_and_iter_reader( - self, start, record_count, step=None, compress=False, columns=None, counter=None + self, + start, + record_count, + step=None, + compress=False, + columns=None, + append_partitions=None, + counter=None, ): counter = counter or [0] with call_with_retry( self._download_session.open_arrow_reader, - start, record_count, compress=compress, columns=columns + start, + record_count, + compress=compress, + columns=columns, + append_partitions=append_partitions, ) as reader: while True: batch = reader.read_next_batch() @@ -222,24 +302,45 @@ def _open_and_iter_reader( else: break - def read(self, start=None, count=None, compress=False, columns=None): + def read( + self, + start=None, + count=None, + compress=False, + columns=None, + append_partitions=None, + ): start = start or 0 max_rec_count = self.count - start rec_count = min(max_rec_count, count) if count is not None else max_rec_count columns = columns or self._column_names + append_partitions = ( + append_partitions + if append_partitions is not None + else self._append_partitions + ) if rec_count == 0: return - for batch in self._retry_iter_reader( - start, rec_count, compress=compress, columns=columns + for batch in self._open_and_iter_reader( + start, + rec_count, + compress=compress, + columns=columns, + append_partitions=append_partitions, ): yield batch - def read_all(self, start=None, count=None, columns=None): + def read_all(self, start=None, count=None, columns=None, 
append_partitions=None): start = start or 0 count = count if count is not None else self.count - start columns = columns or self._column_names + append_partitions = ( + append_partitions + if append_partitions is not None + else self._append_partitions + ) if count == 0: from ..tunnel.io.types import odps_schema_to_arrow_schema @@ -248,14 +349,21 @@ def read_all(self, start=None, count=None, columns=None): return arrow_schema.empty_table() with self._download_session.open_arrow_reader( - start, count, columns=columns + start, count, columns=columns, append_partitions=append_partitions ) as reader: return reader.read() - def to_pandas(self, start=None, count=None, columns=None, n_process=1): + def to_pandas( + self, start=None, count=None, columns=None, append_partitions=None, n_process=1 + ): start = start or 0 count = count if count is not None else self.count - start columns = columns or self._column_names + append_partitions = ( + append_partitions + if append_partitions is not None + else self._append_partitions + ) if n_process == 1: if count == 0: @@ -265,12 +373,16 @@ def to_pandas(self, start=None, count=None, columns=None, n_process=1): return arrow_schema.empty_table().to_pandas() with self._download_session.open_arrow_reader( - start, count, columns=columns + start, count, columns=columns, append_partitions=append_partitions ) as reader: return reader.to_pandas() else: return self._to_pandas_with_processes( - start=start, count=count, columns=columns, n_process=n_process + start=start, + count=count, + columns=columns, + append_partitions=append_partitions, + n_process=n_process, ) def __enter__(self): diff --git a/odps/models/record.py b/odps/models/record.py index 56abd2a0..57fe8113 100644 --- a/odps/models/record.py +++ b/odps/models/record.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,12 +16,16 @@ from .. import types as _types from ..config import options + try: if not options.force_py: from ..src.types_c import BaseRecord - Record = _types.RecordMeta('Record', (_types.RecordReprMixin, BaseRecord), - {'__doc__': _types.Record.__doc__}) + Record = _types.RecordMeta( + "Record", + (_types.RecordReprMixin, BaseRecord), + {"__doc__": _types.Record.__doc__}, + ) else: Record = _types.Record except (ImportError, AttributeError): diff --git a/odps/models/resource.py b/odps/models/resource.py index c834f528..2661e1bf 100644 --- a/odps/models/resource.py +++ b/odps/models/resource.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,14 +18,14 @@ import sys from collections import namedtuple -from .. import serializers, utils, types, errors, compat +from .. 
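# Usage sketch for the reader changes above: a minimal, hypothetical example of
# the new `append_partitions` flag and the batched `iter_pandas` helper added to
# TunnelReaderMixin. Table, partition and credential placeholders are illustrative.
from odps import ODPS

o = ODPS("<access-id>", "<secret-key>", project="my_project", endpoint="<endpoint>")

with o.get_table("sales").open_reader(partition="ds=20241001") as reader:
    # keep partition columns in the resulting DataFrame
    df = reader.to_pandas(append_partitions=True)

    # read the same data as a stream of DataFrame batches; if batch_size is
    # omitted it falls back to options.tunnel.read_row_batch_size
    for batch in reader.iter_pandas(batch_size=10000):
        print(len(batch))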
import compat, errors, serializers, types, utils from ..compat import Enum, six from ..config import options -from .core import LazyLoad from .cache import cache, cache_parent +from .core import LazyLoad -_RESOURCE_SPLITTER = '/resources/' -_SCHEMA_SPLITTER = '/schemas/' +_RESOURCE_SPLITTER = "/resources/" +_SCHEMA_SPLITTER = "/schemas/" if sys.version_info[0] < 3: _StringIOType = type(compat.StringIO()) @@ -45,31 +45,35 @@ class Resource(LazyLoad): :class:`odps.models.TableResource` """ - __slots__ = ( - 'content_md5', 'is_temp_resource', 'volume_path', '_type_indicator' - ) + __slots__ = ("content_md5", "is_temp_resource", "volume_path", "_type_indicator") class Type(Enum): - FILE = 'FILE' - JAR = 'JAR' - PY = 'PY' - ARCHIVE = 'ARCHIVE' - TABLE = 'TABLE' - VOLUMEFILE = 'VOLUMEFILE' - VOLUMEARCHIVE = 'VOLUMEARCHIVE' - UNKOWN = 'UNKOWN' - - _type_indicator = 'type' - - name = serializers.XMLNodeField('Name') - owner = serializers.XMLNodeField('Owner') - comment = serializers.XMLNodeField('Comment') - type = serializers.XMLNodeField('ResourceType', parse_callback=lambda t: Resource.Type(t.upper())) - creation_time = serializers.XMLNodeField('CreationTime', parse_callback=utils.parse_rfc822) - last_modified_time = serializers.XMLNodeField('LastModifiedTime', parse_callback=utils.parse_rfc822) - last_updator = serializers.XMLNodeField('LastUpdator') - size = serializers.XMLNodeField('ResourceSize', parse_callback=int) - source_table_name = serializers.XMLNodeField('TableName') + FILE = "FILE" + JAR = "JAR" + PY = "PY" + ARCHIVE = "ARCHIVE" + TABLE = "TABLE" + VOLUMEFILE = "VOLUMEFILE" + VOLUMEARCHIVE = "VOLUMEARCHIVE" + UNKOWN = "UNKOWN" + + _type_indicator = "type" + + name = serializers.XMLNodeField("Name") + owner = serializers.XMLNodeField("Owner") + comment = serializers.XMLNodeField("Comment") + type = serializers.XMLNodeField( + "ResourceType", parse_callback=lambda t: Resource.Type(t.upper()) + ) + creation_time = serializers.XMLNodeField( + "CreationTime", parse_callback=utils.parse_rfc822 + ) + last_modified_time = serializers.XMLNodeField( + "LastModifiedTime", parse_callback=utils.parse_rfc822 + ) + last_updator = serializers.XMLNodeField("LastUpdator") + size = serializers.XMLNodeField("ResourceSize", parse_callback=int) + source_table_name = serializers.XMLNodeField("TableName") @classmethod def _get_cls(cls, typo): @@ -81,19 +85,19 @@ def _get_cls(cls, typo): clz = lambda name: globals()[name] if typo == Resource.Type.FILE: - return clz('FileResource') + return clz("FileResource") elif typo == Resource.Type.JAR: - return clz('JarResource') + return clz("JarResource") elif typo == Resource.Type.PY: - return clz('PyResource') + return clz("PyResource") elif typo == Resource.Type.ARCHIVE: - return clz('ArchiveResource') + return clz("ArchiveResource") elif typo == Resource.Type.TABLE: - return clz('TableResource') + return clz("TableResource") elif typo == Resource.Type.VOLUMEARCHIVE: - return clz('VolumeArchiveResource') + return clz("VolumeArchiveResource") elif typo == Resource.Type.VOLUMEFILE: - return clz('VolumeFileResource') + return clz("VolumeFileResource") else: return cls @@ -102,23 +106,23 @@ def create(self, overwrite=False, **kw): @staticmethod def _filter_cache(_, **kwargs): - return kwargs.get('type') is not None and kwargs['type'] != Resource.Type.UNKOWN + return kwargs.get("type") is not None and kwargs["type"] != Resource.Type.UNKOWN @cache def __new__(cls, *args, **kwargs): - typo = kwargs.get('type') + typo = kwargs.get("type") if typo is not None or (cls != 
Resource and issubclass(cls, Resource)): return object.__new__(cls._get_cls(typo)) - kwargs['type'] = Resource.Type.UNKOWN + kwargs["type"] = Resource.Type.UNKOWN obj = Resource(**kwargs) obj.reload() return Resource(**obj.extract()) def __init__(self, **kwargs): - typo = kwargs.get('type') + typo = kwargs.get("type") if isinstance(typo, six.string_types): - kwargs['type'] = Resource.Type(typo.upper()) + kwargs["type"] = Resource.Type(typo.upper()) super(Resource, self).__init__(**kwargs) @classmethod @@ -126,7 +130,13 @@ def build_full_resource_name(cls, name, project_name, schema_name=None): if project_name is None: return name elif schema_name is not None: - return project_name + _SCHEMA_SPLITTER + schema_name + _RESOURCE_SPLITTER + name + return ( + project_name + + _SCHEMA_SPLITTER + + schema_name + + _RESOURCE_SPLITTER + + name + ) else: return project_name + _RESOURCE_SPLITTER + name @@ -139,7 +149,9 @@ def split_resource_name(cls, name): if _SCHEMA_SPLITTER not in project_schema_name: project_name, schema_name = project_schema_name, None else: - project_name, schema_name = project_schema_name.split(_SCHEMA_SPLITTER, 1) + project_name, schema_name = project_schema_name.split( + _SCHEMA_SPLITTER, 1 + ) return project_name, schema_name, name @property @@ -151,31 +163,31 @@ def reload(self): params = {} schema_name = self._get_schema_name() if schema_name is not None: - params['curr_schema'] = schema_name + params["curr_schema"] = schema_name url = self.resource() - resp = self._client.get(url, action='meta', params=params) + resp = self._client.get(url, action="meta", params=params) - self.owner = resp.headers.get('x-odps-owner') - resource_type = resp.headers.get('x-odps-resource-type') + self.owner = resp.headers.get("x-odps-owner") + resource_type = resp.headers.get("x-odps-resource-type") self.type = Resource.Type(resource_type.upper()) - self.comment = resp.headers.get('x-odps-comment') - self.last_updator = resp.headers.get('x-odps-updator') + self.comment = resp.headers.get("x-odps-comment") + self.last_updator = resp.headers.get("x-odps-updator") - size = resp.headers.get('x-odps-resource-size') + size = resp.headers.get("x-odps-resource-size") self.size = None if size is None else int(size) self.creation_time = utils.parse_rfc822( - resp.headers.get('x-odps-creation-time')) - self.last_modified_time = utils.parse_rfc822( - resp.headers.get('Last-Modified')) + resp.headers.get("x-odps-creation-time") + ) + self.last_modified_time = utils.parse_rfc822(resp.headers.get("Last-Modified")) is_temp_resource_header = resp.headers.get("x-odps-resource-istemp") or "" self.is_temp_resource = is_temp_resource_header.lower() == "true" - self.source_table_name = resp.headers.get('x-odps-copy-table-source') - self.volume_path = resp.headers.get('x-odps-copy-file-source') - self.content_md5 = resp.headers.get('Content-MD5') + self.source_table_name = resp.headers.get("x-odps-copy-table-source") + self.volume_path = resp.headers.get("x-odps-copy-file-source") + self.content_md5 = resp.headers.get("Content-MD5") self._loaded = True @@ -183,12 +195,12 @@ def _reload_size(self): params = {} schema_name = self._get_schema_name() if schema_name is not None: - params['curr_schema'] = schema_name + params["curr_schema"] = schema_name url = self.resource() - resp = self._client.get(url, action='meta', params=params) + resp = self._client.get(url, action="meta", params=params) - size = resp.headers.get('x-odps-resource-size') + size = resp.headers.get("x-odps-resource-size") self.size = None if size 
is None else int(size) def update(self, **kw): @@ -206,25 +218,27 @@ class FileResource(Resource): Use ``open`` method to open this resource as a file-like object. """ - __slots__ = ('_fp', 'is_part_resource', 'merge_total_bytes') + __slots__ = ("_fp", "is_part_resource", "merge_total_bytes") class Mode(Enum): - READ = 'r' - WRITE = 'w' - APPEND = 'a' - READWRITE = 'r+' - TRUNCEREADWRITE = 'w+' - APPENDREADWRITE = 'a+' + READ = "r" + WRITE = "w" + APPEND = "a" + READWRITE = "r+" + TRUNCEREADWRITE = "w+" + APPENDREADWRITE = "a+" def create(self, overwrite=False, **kw): - file_obj = kw.pop('file_obj', kw.pop('fileobj', None)) + file_obj = kw.pop("file_obj", kw.pop("fileobj", None)) is_part_resource = self._getattr("is_part_resource") - is_merge_resource = self._getattr('merge_total_bytes') is not None + is_merge_resource = self._getattr("merge_total_bytes") is not None if file_obj is None: - raise ValueError('parameter `file_obj` cannot be None, either string or file-like object') + raise ValueError( + "parameter `file_obj` cannot be None, either string or file-like object" + ) if isinstance(file_obj, six.text_type): - file_obj = file_obj.encode('utf-8') + file_obj = file_obj.encode("utf-8") if ( options.upload_resource_in_chunks @@ -236,25 +250,27 @@ def create(self, overwrite=False, **kw): return self if self.name is None or len(self.name.strip()) == 0: - raise errors.ODPSError('File Resource Name should not empty.') + raise errors.ODPSError("File Resource Name should not empty.") method = self._client.post if not overwrite else self._client.put url = self.parent.resource() if not overwrite else self.resource() headers = { - 'Content-Type': 'application/octet-stream', - 'Content-Disposition': 'attachment;filename=%s' % self.name, - 'x-odps-resource-type': self.type.value.lower(), - 'x-odps-resource-name': self.name, + "Content-Type": "application/octet-stream", + "Content-Disposition": "attachment;filename=%s" % self.name, + "x-odps-resource-type": self.type.value.lower(), + "x-odps-resource-name": self.name, } params = {} - if self._getattr('comment') is not None: - headers['x-odps-comment'] = self.comment - if self._getattr('is_temp_resource'): - headers['x-odps-resource-istemp'] = 'true' if self.is_temp_resource else 'false' + if self._getattr("comment") is not None: + headers["x-odps-comment"] = self.comment + if self._getattr("is_temp_resource"): + headers["x-odps-resource-istemp"] = ( + "true" if self.is_temp_resource else "false" + ) if is_merge_resource: - headers['x-odps-resource-merge-total-bytes'] = str(self.merge_total_bytes) + headers["x-odps-resource-merge-total-bytes"] = str(self.merge_total_bytes) params["rOpMerge"] = "true" if is_part_resource: params["rIsPart"] = "true" @@ -267,7 +283,11 @@ def create(self, overwrite=False, **kw): self.size = len(content) method( - url, content, headers=headers, params=params, curr_schema=self._get_schema_name() + url, + content, + headers=headers, + params=params, + curr_schema=self._get_schema_name(), ) if overwrite: @@ -326,7 +346,7 @@ def opened(self): def mode(self): return self._fp.mode - def open(self, mode='r', encoding='utf-8', stream=False, overwrite=None): + def open(self, mode="r", encoding="utf-8", stream=False, overwrite=None): """ The argument ``mode`` stands for the open mode for this file resource. It can be binary mode if the 'b' is inside. 
For instance, @@ -379,9 +399,9 @@ def open(self, mode='r', encoding='utf-8', stream=False, overwrite=None): def _check_read(self): if not self.opened: - raise IOError('I/O operation on non-open resource') + raise IOError("I/O operation on non-open resource") if self.mode in (FileResource.Mode.WRITE, FileResource.Mode.APPEND): - raise IOError('Resource not open for reading') + raise IOError("Resource not open for reading") def read(self, size=-1): """ @@ -427,9 +447,9 @@ def readlines(self, sizehint=-1): def _check_write(self): if not self.opened: - raise IOError('I/O operation on non-open resource') + raise IOError("I/O operation on non-open resource") if self.mode == FileResource.Mode.READ: - raise IOError('Resource not open for writing') + raise IOError("Resource not open for writing") def write(self, content): """ @@ -565,10 +585,10 @@ class TableResource(Resource): _TableSource = namedtuple("_TableSource", "project schema table partition") def __init__(self, **kw): - project_name = kw.pop('project_name', None) - schema_name = kw.pop('schema_name', None) - table_name = kw.pop('table_name', None) - partition_spec = kw.pop('partition', None) + project_name = kw.pop("project_name", None) + schema_name = kw.pop("schema_name", None) + table_name = kw.pop("table_name", None) + partition_spec = kw.pop("partition", None) super(TableResource, self).__init__(**kw) @@ -582,31 +602,38 @@ def __init__(self, **kw): def create(self, overwrite=False, **kw): if self.name is None or len(self.name.strip()) == 0: - raise errors.ODPSError('Table Resource Name should not be empty.') + raise errors.ODPSError("Table Resource Name should not be empty.") method = self._client.post if not overwrite else self._client.put url = self.parent.resource() if not overwrite else self.resource() headers = { - 'Content-Type': 'text/plain', - 'x-odps-resource-type': self.type.value.lower(), - 'x-odps-resource-name': self.name, - 'x-odps-copy-table-source': self.source_table_name, + "Content-Type": "text/plain", + "x-odps-resource-type": self.type.value.lower(), + "x-odps-resource-name": self.name, + "x-odps-copy-table-source": self.source_table_name, } - if self._getattr('comment') is not None: - headers['x-odps-comment'] = self._getattr('comment') + if self._getattr("comment") is not None: + headers["x-odps-comment"] = self._getattr("comment") - method(url, '', headers=headers, curr_schema=self._get_schema_name()) + method(url, "", headers=headers, curr_schema=self._get_schema_name()) if overwrite: del self.parent[self.name] return self.parent[self.name] return self - def _init(self, create=False, table_project_name=None, table_schema_name=None, table_name=None, **kw): + def _init( + self, + create=False, + table_project_name=None, + table_schema_name=None, + table_name=None, + **kw + ): table_project_name = table_project_name or kw.get("project_name") - if table_name is not None and '.' in table_name: - parts = table_name.split('.') + if table_name is not None and "." 
in table_name: + parts = table_name.split(".") if len(parts) == 2: assert table_schema_name is None table_project_name, table_name = parts @@ -629,46 +656,58 @@ def _init(self, create=False, table_project_name=None, table_schema_name=None, t old_table_name = table_source.table old_partition = table_source.partition else: - old_table_project_name, old_schema_name, old_table_name, old_partition = [None] * 4 + ( + old_table_project_name, + old_schema_name, + old_table_name, + old_partition, + ) = [None] * 4 except AttributeError: - old_table_project_name, old_schema_name, old_table_name, old_partition = [None] * 4 + old_table_project_name, old_schema_name, old_table_name, old_partition = [ + None + ] * 4 - table_project_name = table_project_name or old_table_project_name or self.project.name - table_schema_name = table_schema_name or old_schema_name or self._get_schema_name() + table_project_name = ( + table_project_name or old_table_project_name or self.project.name + ) + table_schema_name = ( + table_schema_name or old_schema_name or self._get_schema_name() + ) table_name = table_name or old_table_name - partition = kw.get('partition', old_partition) + partition = kw.get("partition", old_partition) if table_name is not None: if table_schema_name: - self.source_table_name = '.'.join((table_project_name, table_schema_name, table_name)) + self.source_table_name = ".".join( + (table_project_name, table_schema_name, table_name) + ) else: - self.source_table_name = '.'.join((table_project_name, table_name)) + self.source_table_name = ".".join((table_project_name, table_name)) if partition is not None: if not isinstance(partition, types.PartitionSpec): partition_spec = types.PartitionSpec(partition) else: partition_spec = partition - self.source_table_name = ( - '%s partition(%s)' % ( - self.source_table_name.split(' partition(')[0], partition_spec - ) + self.source_table_name = "%s partition(%s)" % ( + self.source_table_name.split(" partition(")[0], + partition_spec, ) def _get_table_source(self): if self.source_table_name is None: - raise AttributeError('source_table_name not defined.') + raise AttributeError("source_table_name not defined.") - splits = self.source_table_name.split(' partition(') + splits = self.source_table_name.split(" partition(") if len(splits) < 2: partition = None else: - partition = splits[1].split(')', 1)[0].strip() + partition = splits[1].split(")", 1)[0].strip() src = splits[0] - if '.' not in src: - raise ValueError('Malformed source table name: %s' % src) - table_parts = src.split('.') + if "." not in src: + raise ValueError("Malformed source table name: %s" % src) + table_parts = src.split(".") if len(table_parts) == 2: schema_name = None project_name, table_name = table_parts @@ -694,11 +733,11 @@ def get_source_table_partition(self): if self.source_table_name is None: return - splits = self.source_table_name.split(' partition(') + splits = self.source_table_name.split(" partition(") if len(splits) < 2: return - partition = splits[1].split(')', 1)[0].strip() + partition = splits[1].split(")", 1)[0].strip() return types.PartitionSpec(partition) @property @@ -744,7 +783,12 @@ def open_writer(self, **kwargs): ) def update( - self, table_project_name=None, table_schema_name=None, table_name=None, *args, **kw + self, + table_project_name=None, + table_schema_name=None, + table_name=None, + *args, + **kw ): """ Update this resource. 
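# Usage sketch for the TableResource handling above: creating a table resource
# that points at a partition of a source table and repointing it via update().
# All project, table and partition names are invented for illustration.
from odps import ODPS

o = ODPS("<access-id>", "<secret-key>", project="my_project", endpoint="<endpoint>")

res = o.create_resource(
    "sales_res", "table", table_name="my_project.sales", partition="ds=20241001"
)
print(res.get_source_table_partition())  # PartitionSpec parsed from source_table_name

# point the resource at another partition of the same table
res.update(table_name="sales", partition="ds=20241002")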
@@ -755,7 +799,7 @@ def update( :return: self """ if len(args) > 0: - kw['partition'] = args[0] + kw["partition"] = args[0] self._init( table_project_name=table_project_name, table_schema_name=table_schema_name, @@ -770,21 +814,21 @@ def update( class VolumeResource(Resource): def create(self, overwrite=False, **kw): if self.name is None or len(self.name.strip()) == 0: - raise errors.ODPSError('Volume Resource Name should not be empty.') + raise errors.ODPSError("Volume Resource Name should not be empty.") method = self._client.post if not overwrite else self._client.put url = self.parent.resource() if not overwrite else self.resource() headers = { - 'Content-Type': 'text/plain', - 'x-odps-resource-type': self.type.value.lower(), - 'x-odps-resource-name': self.name, - 'x-odps-copy-file-source': self.volume_path, + "Content-Type": "text/plain", + "x-odps-resource-type": self.type.value.lower(), + "x-odps-resource-name": self.name, + "x-odps-copy-file-source": self.volume_path, } - if self._getattr('comment') is not None: - headers['x-odps-comment'] = self._getattr('comment') + if self._getattr("comment") is not None: + headers["x-odps-comment"] = self._getattr("comment") - method(url, '', headers=headers, curr_schema=self._get_schema_name()) + method(url, "", headers=headers, curr_schema=self._get_schema_name()) if overwrite: del self.parent[self.name] @@ -800,13 +844,13 @@ class VolumeFileResource(VolumeResource): def __init__(self, **kw): okw = kw.copy() - okw.pop('volume_file', None) + okw.pop("volume_file", None) super(VolumeFileResource, self).__init__(**okw) self.type = Resource.Type.VOLUMEFILE def create(self, overwrite=False, **kw): - if 'volume_file' in kw: - vf = kw.pop('volume_file') + if "volume_file" in kw: + vf = kw.pop("volume_file") self.volume_path = vf.path return super(VolumeFileResource, self).create(overwrite, **kw) diff --git a/odps/models/resourcefile.py b/odps/models/resourcefile.py index 9bcc6e17..e69ad131 100644 --- a/odps/models/resourcefile.py +++ b/odps/models/resourcefile.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
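# Usage sketch for the file-resource open modes handled by ResourceFile below;
# resource name and content are illustrative, and `file_obj` may also be passed
# as `fileobj`, matching the keyword handling in FileResource.create above.
from odps import ODPS

o = ODPS("<access-id>", "<secret-key>", project="my_project", endpoint="<endpoint>")

o.create_resource("notes.txt", "file", file_obj="hello, world", comment="demo")

with o.open_resource("notes.txt", mode="r", encoding="utf-8") as fp:  # text read
    print(fp.read())

with o.open_resource("notes.txt", mode="ab") as fp:  # binary append
    fp.write(b"\nanother line")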
@@ -25,14 +25,20 @@ class ResourceFile(object): __slots__ = ( - 'resource', 'mode', '_opened', 'size', '_open_binary', '_encoding', '_overwrite' + "resource", + "mode", + "_opened", + "size", + "_open_binary", + "_encoding", + "_overwrite", ) - def __init__(self, resource, mode='r', encoding='utf-8', overwrite=None): + def __init__(self, resource, mode="r", encoding="utf-8", overwrite=None): self.resource = resource - self._open_binary = 'b' in mode - mode = mode.replace('b', '') + self._open_binary = "b" in mode + mode = mode.replace("b", "") self.mode = FileResource.Mode(mode) self._encoding = encoding @@ -120,7 +126,7 @@ def __exit__(self, *_): class LocalResourceFile(ResourceFile): __slots__ = "_fp", "_need_commit" - def __init__(self, resource, mode='r', encoding='utf-8', overwrite=None): + def __init__(self, resource, mode="r", encoding="utf-8", overwrite=None): super(LocalResourceFile, self).__init__( resource, mode=mode, encoding=encoding, overwrite=overwrite ) @@ -151,7 +157,7 @@ def readlines(self, sizehint=-1): def _check_size(self): if self.size > RESOURCE_SIZE_MAX: raise IOError( - "Single resource's max size is %sM" % (RESOURCE_SIZE_MAX / (1024 ** 2)) + "Single resource's max size is %sM" % (RESOURCE_SIZE_MAX / (1024**2)) ) def write(self, content): @@ -238,12 +244,17 @@ def _next(self): class StreamResourceFile(ResourceFile): __slots__ = ( - "_md5_digest", "_buffer", "_buffered_size", "_resource_parts", - "_resource_counter", "_chunk_size", "_is_source_exhausted", + "_md5_digest", + "_buffer", + "_buffered_size", + "_resource_parts", + "_resource_counter", + "_chunk_size", + "_is_source_exhausted", "_source_offset", ) - def __init__(self, resource, mode='r', encoding='utf-8', overwrite=None): + def __init__(self, resource, mode="r", encoding="utf-8", overwrite=None): mode = mode.replace("+", "") super(StreamResourceFile, self).__init__( @@ -308,7 +319,7 @@ def read(self, size=-1): return buf.getvalue() def _is_line_terminated(self, line): - terminator = b'\n' if self._open_binary else os.linesep + terminator = b"\n" if self._open_binary else os.linesep return line.endswith(terminator) def readline(self, size=-1): @@ -423,7 +434,11 @@ def flush(self): if value.tell() > 0: res = self.resource.parent.create( - name=self._build_part_resource_name(), type="file", temp=True, part=True, fileobj=value + name=self._build_part_resource_name(), + type="file", + temp=True, + part=True, + fileobj=value, ) self._resource_parts.append(res) if self._open_binary: diff --git a/odps/models/resources.py b/odps/models/resources.py index fef67a76..60e5de6b 100644 --- a/odps/models/resources.py +++ b/odps/models/resources.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,19 +14,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .core import Iterable -from .resource import Resource, FileResource -from .. import serializers, errors +from .. 
import errors, serializers from ..compat import six +from .core import Iterable +from .resource import FileResource, Resource DEFAULT_RESOURCE_CHUNK_SIZE = 64 << 20 class Resources(Iterable): - - marker = serializers.XMLNodeField('Marker') - max_items = serializers.XMLNodeField('MaxItems') - resources = serializers.XMLNodesReferencesField(Resource, 'Resource') + marker = serializers.XMLNodeField("Marker") + max_items = serializers.XMLNodeField("MaxItems") + resources = serializers.XMLNodesReferencesField(Resource, "Resource") def get_typed(self, name, type, **kw): type_cls = Resource._get_cls(type) @@ -69,26 +68,25 @@ def get(self, name, type=None): return self.get_typed(name, type) def iterate(self, name=None, owner=None): - params = {'expectmarker': 'true'} + params = {"expectmarker": "true"} if name is not None: - params['name'] = name + params["name"] = name if owner is not None: - params['owner'] = owner + params["owner"] = owner schema_name = self._get_schema_name() if schema_name is not None: - params['curr_schema'] = schema_name + params["curr_schema"] = schema_name def _it(): - last_marker = params.get('marker') - if 'marker' in params and \ - (last_marker is None or len(last_marker) == 0): + last_marker = params.get("marker") + if "marker" in params and (last_marker is None or len(last_marker) == 0): return url = self.resource() resp = self._client.get(url, params=params) r = Resources.parse(self._client, resp, obj=self) - params['marker'] = r.marker + params["marker"] = r.marker return r.resources @@ -100,21 +98,21 @@ def _it(): yield resource def create(self, obj=None, **kwargs): - if obj is None and 'type' not in kwargs: - raise ValueError('Unknown resource type to create.') + if obj is None and "type" not in kwargs: + raise ValueError("Unknown resource type to create.") - if 'temp' in kwargs: - kwargs['is_temp_resource'] = kwargs.pop('temp') - if 'part' in kwargs: - kwargs['is_part_resource'] = kwargs.pop('part') + if "temp" in kwargs: + kwargs["is_temp_resource"] = kwargs.pop("temp") + if "part" in kwargs: + kwargs["is_part_resource"] = kwargs.pop("part") ctor_kw = kwargs.copy() - ctor_kw.pop('file_obj', None) - ctor_kw.pop('fileobj', None) + ctor_kw.pop("file_obj", None) + ctor_kw.pop("fileobj", None) obj = obj or Resource(parent=self, client=self._client, **ctor_kw) if obj.type == Resource.Type.UNKOWN: - raise ValueError('Unknown resource type to create.') + raise ValueError("Unknown resource type to create.") if obj.parent is None: obj._parent = self if obj._client is None: @@ -142,7 +140,7 @@ def _request(self, name, stream=False, offset=None, read_size=None): res = Resource(name, parent=self, client=self._client) url = res.resource() - headers = {'Content-Type': 'application/octet-stream'} + headers = {"Content-Type": "application/octet-stream"} params = {} if offset is not None: params["rOffset"] = str(offset) @@ -150,7 +148,11 @@ def _request(self, name, stream=False, offset=None, read_size=None): params["rSize"] = str(read_size) resp = self._client.get( - url, headers=headers, params=params, stream=stream, curr_schema=self._get_schema_name() + url, + headers=headers, + params=params, + stream=stream, + curr_schema=self._get_schema_name(), ) return resp @@ -160,7 +162,7 @@ def iter_resource_content(self, name, text_mode=False): return resp.iter_content(decode_unicode=text_mode) def read_resource( - self, name, encoding='utf-8', text_mode=False, offset=None, read_size=None + self, name, encoding="utf-8", text_mode=False, offset=None, read_size=None ): resp = 
self._request(name, offset=offset, read_size=read_size) diff --git a/odps/models/schema.py b/odps/models/schema.py index d7862a39..57173dd5 100644 --- a/odps/models/schema.py +++ b/odps/models/schema.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -39,32 +39,36 @@ class SchemaType(Enum): class SchemaDescription(JSONRemoteModel): - name = serializers.JSONNodeField('name', set_to_parent=True) - owner = serializers.JSONNodeField('owner', set_to_parent=True) - description = serializers.JSONNodeField('description', set_to_parent=True) + name = serializers.JSONNodeField("name", set_to_parent=True) + owner = serializers.JSONNodeField("owner", set_to_parent=True) + description = serializers.JSONNodeField("description", set_to_parent=True) creation_time = serializers.JSONNodeField( - 'createTime', parse_callback=_parse_schema_time, set_to_parent=True + "createTime", parse_callback=_parse_schema_time, set_to_parent=True ) last_modified_time = serializers.JSONNodeField( - 'modifyTime', parse_callback=_parse_schema_time, set_to_parent=True + "modifyTime", parse_callback=_parse_schema_time, set_to_parent=True ) type = serializers.JSONNodeField( - 'type', parse_callback=lambda x: getattr(SchemaType, x.upper()) + "type", parse_callback=lambda x: getattr(SchemaType, x.upper()) ) class Schema(LazyLoad): default_schema_name = "DEFAULT" - _root = 'Schema' + _root = "Schema" - name = serializers.XMLNodeField('Name') - owner = serializers.XMLNodeField('Owner') - description = serializers.XMLNodeField('Description') - creation_time = serializers.XMLNodeField('CreateTime', parse_callback=_parse_schema_time) - last_modified_time = serializers.XMLNodeField('ModifyTime', parse_callback=_parse_schema_time) + name = serializers.XMLNodeField("Name") + owner = serializers.XMLNodeField("Owner") + description = serializers.XMLNodeField("Description") + creation_time = serializers.XMLNodeField( + "CreateTime", parse_callback=_parse_schema_time + ) + last_modified_time = serializers.XMLNodeField( + "ModifyTime", parse_callback=_parse_schema_time + ) type = serializers.XMLNodeField( - 'Type', parse_callback=lambda x: getattr(SchemaType, x.upper()) + "Type", parse_callback=lambda x: getattr(SchemaType, x.upper()) ) def reload(self): @@ -73,9 +77,9 @@ def reload(self): resp = self._client.get(self.resource() + "/schemas/" + self.name) self.parse(self._client, resp, obj=self) - self.owner = resp.headers.get('x-odps-owner') + self.owner = resp.headers.get("x-odps-owner") self.creation_time = parse_rfc822( - resp.headers.get('x-odps-creation-time'), use_legacy_parsedate=False + resp.headers.get("x-odps-creation-time"), use_legacy_parsedate=False ) self.last_modified_time = parse_rfc822( resp.headers.get("Last-Modified"), use_legacy_parsedate=False @@ -91,8 +95,8 @@ def reload(self): @property def create_time(self): warnings.warn( - 'Schema.create_time is deprecated and will be replaced ' - 'by Schema.creation_time.', + "Schema.create_time is deprecated and will be replaced " + "by Schema.creation_time.", DeprecationWarning, stacklevel=3, ) @@ -101,8 +105,8 @@ def create_time(self): @property def modify_time(self): warnings.warn( - 'Schema.modify_time is deprecated and will be replaced ' - 'by Schema.last_modified_time.', + "Schema.modify_time is deprecated and will be replaced " + "by Schema.last_modified_time.", 
DeprecationWarning, stacklevel=3, ) diff --git a/odps/models/schemas.py b/odps/models/schemas.py index 54011ce7..a7498ef0 100644 --- a/odps/models/schemas.py +++ b/odps/models/schemas.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,7 +16,12 @@ from .. import serializers from ..compat import six -from ..errors import InternalServerError, InvalidParameter, MethodNotAllowed, NoSuchObject +from ..errors import ( + InternalServerError, + InvalidParameter, + MethodNotAllowed, + NoSuchObject, +) from ..utils import with_wait_argument from .core import Iterable from .schema import Schema @@ -61,13 +66,14 @@ def iter_wrapper(self, *args, **kwargs): yield item return iter_wrapper if is_iter else wrapper + return decorator class Schemas(Iterable): - marker = serializers.XMLNodeField('Marker') - max_items = serializers.XMLNodeField('MaxItems') - schemas = serializers.XMLNodesReferencesField(Schema, 'Schema') + marker = serializers.XMLNodeField("Marker") + max_items = serializers.XMLNodeField("MaxItems") + schemas = serializers.XMLNodesReferencesField(Schema, "Schema") def __iter__(self): return self.iterate() @@ -86,32 +92,33 @@ def _iterate_legacy(self, name=None, owner=None): "Iterating schemas with name or owner not supported on current service" ) inst = self.parent.odps.execute_sql("SHOW SCHEMAS IN %s" % self.parent.name) - schema_names = inst.get_task_results().get("AnonymousSQLTask").strip().split("\n") + schema_names = ( + inst.get_task_results().get("AnonymousSQLTask").strip().split("\n") + ) for schema_name in schema_names: yield Schema(name=schema_name, parent=self, client=self._client) @with_schema_api_fallback(fallback_fun=_iterate_legacy, is_iter=True) def iterate(self, name=None, owner=None): - params = {'expectmarker': 'true'} + params = {"expectmarker": "true"} if name is not None: - params['name'] = name + params["name"] = name if owner is not None: - params['owner'] = owner + params["owner"] = owner schema_name = self._get_schema_name() if schema_name is not None: - params['curr_schema'] = schema_name + params["curr_schema"] = schema_name def _it(): - last_marker = params.get('marker') - if 'marker' in params and \ - (last_marker is None or len(last_marker) == 0): + last_marker = params.get("marker") + if "marker" in params and (last_marker is None or len(last_marker) == 0): return url = self.resource() + "/schemas" resp = self._client.get(url, params=params) r = Schemas.parse(self._client, resp, obj=self) - params['marker'] = r.marker + params["marker"] = r.marker return r.schemas @@ -150,7 +157,7 @@ def create(self, obj=None, **kwargs): if schema._client is None: schema._client = self._client - headers = {'Content-Type': 'application/xml'} + headers = {"Content-Type": "application/xml"} data = schema.serialize() resource = self.resource() + "/schemas" diff --git a/odps/models/security/config.py b/odps/models/security/config.py index b4cd4d0a..bdf47211 100644 --- a/odps/models/security/config.py +++ b/odps/models/security/config.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
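# Usage sketch for the Schema/Schemas changes above; assumes the project has
# schema support enabled, otherwise iteration falls back to the legacy
# "SHOW SCHEMAS" path in _iterate_legacy. Names are illustrative.
from odps import ODPS

o = ODPS("<access-id>", "<secret-key>", project="my_project", endpoint="<endpoint>")

schema = o.create_schema("dev_schema")
for sc in o.list_schemas():
    print(sc.name, sc.owner, sc.creation_time)
o.delete_schema("dev_schema")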
@@ -19,38 +19,56 @@ class SecurityConfiguration(LazyLoad): - _root = 'SecurityConfiguration' + _root = "SecurityConfiguration" class ProjectProtection(serializers.XMLSerializableModel): - _root = 'ProjectProtection' - protected = serializers.XMLNodeAttributeField(attr='Protected', type='bool') - exception = serializers.XMLNodeField('Exceptions') + _root = "ProjectProtection" + protected = serializers.XMLNodeAttributeField(attr="Protected", type="bool") + exception = serializers.XMLNodeField("Exceptions") - check_permission_using_acl = serializers.XMLNodeField('CheckPermissionUsingAcl', type='bool') - check_permission_using_policy = serializers.XMLNodeField('CheckPermissionUsingPolicy', type='bool') - label_security = serializers.XMLNodeField('LabelSecurity', type='bool') - object_creator_has_access_permission = serializers.XMLNodeField('ObjectCreatorHasAccessPermission', type='bool') - object_creator_has_grant_permission = serializers.XMLNodeField('ObjectCreatorHasGrantPermission', type='bool') - project_protection = serializers.XMLNodeReferenceField(ProjectProtection, 'ProjectProtection') - check_permission_using_acl_v2 = serializers.XMLNodeField('CheckPermissionUsingAclV2', type='bool') - check_permission_using_policy_v2 = serializers.XMLNodeField('CheckPermissionUsingPolicyV2', type='bool') - support_acl = serializers.XMLNodeField('SupportACL', type='bool') - support_policy = serializers.XMLNodeField('SupportPolicy', type='bool') - support_package = serializers.XMLNodeField('SupportPackage', type='bool') - support_acl_v2 = serializers.XMLNodeField('SupportACLV2', type='bool') - support_package_v2 = serializers.XMLNodeField('SupportPackageV2', type='bool') - check_permission_using_package = serializers.XMLNodeField('CheckPermissionUsingPackage', type='bool') - create_package = serializers.XMLNodeField('CreatePackage', type='bool') - create_package_v2 = serializers.XMLNodeField('CreatePackageV2', type='bool') + check_permission_using_acl = serializers.XMLNodeField( + "CheckPermissionUsingAcl", type="bool" + ) + check_permission_using_policy = serializers.XMLNodeField( + "CheckPermissionUsingPolicy", type="bool" + ) + label_security = serializers.XMLNodeField("LabelSecurity", type="bool") + object_creator_has_access_permission = serializers.XMLNodeField( + "ObjectCreatorHasAccessPermission", type="bool" + ) + object_creator_has_grant_permission = serializers.XMLNodeField( + "ObjectCreatorHasGrantPermission", type="bool" + ) + project_protection = serializers.XMLNodeReferenceField( + ProjectProtection, "ProjectProtection" + ) + check_permission_using_acl_v2 = serializers.XMLNodeField( + "CheckPermissionUsingAclV2", type="bool" + ) + check_permission_using_policy_v2 = serializers.XMLNodeField( + "CheckPermissionUsingPolicyV2", type="bool" + ) + support_acl = serializers.XMLNodeField("SupportACL", type="bool") + support_policy = serializers.XMLNodeField("SupportPolicy", type="bool") + support_package = serializers.XMLNodeField("SupportPackage", type="bool") + support_acl_v2 = serializers.XMLNodeField("SupportACLV2", type="bool") + support_package_v2 = serializers.XMLNodeField("SupportPackageV2", type="bool") + check_permission_using_package = serializers.XMLNodeField( + "CheckPermissionUsingPackage", type="bool" + ) + create_package = serializers.XMLNodeField("CreatePackage", type="bool") + create_package_v2 = serializers.XMLNodeField("CreatePackageV2", type="bool") def reload(self): resp = self._client.get( - self.project.resource(), params=dict(security_configuration='') + 
self.project.resource(), params=dict(security_configuration="") ) self.parse(self._client, resp, obj=self) def update(self): content = self.serialize() self._client.put( - self.project.resource(), params=dict(security_configuration=''), data=content + self.project.resource(), + params=dict(security_configuration=""), + data=content, ) diff --git a/odps/models/security/roles.py b/odps/models/security/roles.py index 81540936..5287645e 100644 --- a/odps/models/security/roles.py +++ b/odps/models/security/roles.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,15 +16,15 @@ import json -from ..core import Iterable, LazyLoad +from ... import errors, serializers from ...compat import six -from ... import serializers, errors +from ..core import Iterable, LazyLoad class Role(LazyLoad): - __slots__ = '_policy_cache', - name = serializers.XMLNodeField('Name') - comment = serializers.XMLNodeField('Comment') + __slots__ = ("_policy_cache",) + name = serializers.XMLNodeField("Name") + comment = serializers.XMLNodeField("Comment") def __init__(self, **kw): super(Role, self).__init__(**kw) @@ -38,7 +38,8 @@ def reload(self): @property def users(self): from .users import Users - params = dict(users='') + + params = dict(users="") resp = self._client.get(self.resource(), params=params) users = Users.parse(self._client, resp, parent=self.project) users._iter_local = True @@ -47,7 +48,7 @@ def users(self): @property def policy(self): if self._policy_cache is None: - params = dict(policy='') + params = dict(policy="") resp = self._client.get(self.resource(), params=params) self._policy_cache = resp.content.decode() if six.PY3 else resp.content return json.loads(self._policy_cache) @@ -57,25 +58,27 @@ def policy(self, value): if isinstance(value, (dict, list)): value = json.dumps(value) self._policy_cache = value - params = dict(policy='') + params = dict(policy="") self._client.put(self.resource(), data=value, params=params) def grant_to(self, name): from .users import User + if isinstance(name, User): name = name.display_name - self.project.run_security_query('grant %s to %s' % (self.name, name)) + self.project.run_security_query("grant %s to %s" % (self.name, name)) def revoke_from(self, name): from .users import User + if isinstance(name, User): name = name.display_name - self.project.run_security_query('revoke %s from %s' % (self.name, name)) + self.project.run_security_query("revoke %s from %s" % (self.name, name)) class Roles(Iterable): - __slots__ = '_iter_local', - roles = serializers.XMLNodesReferencesField(Role, 'Role') + __slots__ = ("_iter_local",) + roles = serializers.XMLNodesReferencesField(Role, "Role") def __init__(self, **kw): self._iter_local = False @@ -109,7 +112,7 @@ def project(self): return self.parent def create(self, name): - self.project.run_security_query('create role %s' % name) + self.project.run_security_query("create role %s" % name) return Role(client=self._client, parent=self, name=name) def iterate(self, name=None): @@ -119,7 +122,7 @@ def iterate(self, name=None): if not self._iter_local: params = dict() if name is not None: - params['name'] = name + params["name"] = name url = self.resource() resp = self._client.get(url, params=params) @@ -134,4 +137,4 @@ def delete(self, name): name = name.name del self[name] # delete 
from cache - self.project.run_security_query('drop role %s' % name) + self.project.run_security_query("drop role %s" % name) diff --git a/odps/models/security/users.py b/odps/models/security/users.py index 831b5988..afb36da7 100644 --- a/odps/models/security/users.py +++ b/odps/models/security/users.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,23 +14,25 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ..core import Iterable, LazyLoad +from ... import errors, serializers from ...compat import six -from ... import serializers, errors +from ..core import Iterable, LazyLoad class User(LazyLoad): - id = serializers.XMLNodeField('ID') - display_name = serializers.XMLNodeField('DisplayName') - comment = serializers.XMLNodeField('Comment') + id = serializers.XMLNodeField("ID") + display_name = serializers.XMLNodeField("DisplayName") + comment = serializers.XMLNodeField("Comment") def _name(self): return self.id def reload(self): - if self._getattr('id') is None: - resp = self._client.get(self.parent.resource() + '/' + self._encode(self.display_name), - params=dict(type='displayname')) + if self._getattr("id") is None: + resp = self._client.get( + self.parent.resource() + "/" + self._encode(self.display_name), + params=dict(type="displayname"), + ) self.parse(self._client, resp, obj=self) else: resp = self._client.get(self.resource()) @@ -39,7 +41,8 @@ def reload(self): @property def roles(self): from .roles import Roles - params = dict(roles='', type='displayname') + + params = dict(roles="", type="displayname") resp = self._client.get(self.resource(), params=params) roles = Roles.parse(self._client, resp, parent=self.project) roles._iter_local = True @@ -47,20 +50,22 @@ def roles(self): def grant_role(self, name): from .roles import Role + if isinstance(name, Role): name = name.name - self.project.run_security_query('grant %s to %s' % (name, self.display_name)) + self.project.run_security_query("grant %s to %s" % (name, self.display_name)) def revoke_role(self, name): from .roles import Role + if isinstance(name, Role): name = name.name - self.project.run_security_query('revoke %s from %s' % (name, self.display_name)) + self.project.run_security_query("revoke %s from %s" % (name, self.display_name)) class Users(Iterable): - __slots__ = '_iter_local', - users = serializers.XMLNodesReferencesField(User, 'User') + __slots__ = ("_iter_local",) + users = serializers.XMLNodesReferencesField(User, "User") def __init__(self, **kw): self._iter_local = False @@ -94,7 +99,7 @@ def project(self): return self.parent def create(self, name): - self.project.run_security_query('add user %s' % name) + self.project.run_security_query("add user %s" % name) return User(client=self._client, parent=self, display_name=name) def iterate(self): @@ -115,4 +120,4 @@ def delete(self, name): if isinstance(name, User): name = name.display_name - self.project.run_security_query('remove user %s' % name) + self.project.run_security_query("remove user %s" % name) diff --git a/odps/models/session.py b/odps/models/session.py index 7597edd7..d3478aa6 100644 --- a/odps/models/session.py +++ b/odps/models/session.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group 
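# Usage sketch for the role/user security API above: grant_to/revoke_from end up
# issuing the security queries built in the diff. The account name is a
# placeholder; use a real RAM/Aliyun account in practice.
from odps import ODPS

o = ODPS("<access-id>", "<secret-key>", project="my_project", endpoint="<endpoint>")
project = o.get_project()

role = project.roles.create("dev_role")
user = project.users.create("ALIYUN$someone@example.com")
role.grant_to(user)  # -> "grant dev_role to <display name>"
print([r.name for r in user.roles])
role.revoke_from(user)
project.users.delete(user)
project.roles.delete(role)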
Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,22 +14,28 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy +import glob +import hashlib import itertools import json +import os import re import sys import time import warnings from .. import errors, readers, utils -from ..compat import six, enum +from ..compat import enum, six from ..lib.monotonic import monotonic from ..models import tasks from .instance import Instance, InstanceArrowReader, InstanceRecordReader - DEFAULT_TASK_NAME = "AnonymousSQLRTTask" PUBLIC_SESSION_NAME = "public.default" +_SUBQUERY_ID_PATTERN = re.compile(r"[\d\D]*_query_(\d+)_[\d\D]*") +_SESSION_FILE_PREFIX = "mcqa-session-" +_SESSION_FILE_EXPIRE_TIME = 3600 * 24 class FallbackMode(enum.Enum): @@ -76,7 +82,9 @@ def get_mode_from_exception(self, exc_value): return FallbackMode.OFFLINE elif self.unsupported and isinstance(exc_value, errors.SQAUnsupportedFeature): return FallbackMode.OFFLINE - elif self.upgrading and isinstance(exc_value, (errors.SQAServiceUnavailable, errors.SQAAccessDenied)): + elif self.upgrading and isinstance( + exc_value, (errors.SQAServiceUnavailable, errors.SQAAccessDenied) + ): return FallbackMode.OFFLINE elif self.noresource and isinstance(exc_value, errors.SQAResourceNotEnough): return FallbackMode.OFFLINE @@ -92,7 +100,8 @@ def get_mode_from_exception(self, exc_value): def __repr__(self): policies = [ - s for s in ["generic", "unsupported", "upgrading", "noresource", "timeout"] + s + for s in ["generic", "unsupported", "upgrading", "noresource", "timeout"] if getattr(self, s, None) ] return "" % ",".join(policies) @@ -103,6 +112,7 @@ class SessionTaskStatus(enum.Enum): """ Possible statuses of tasks executing inside a session. """ + Running = 2 Failed = 4 Terminated = 5 @@ -114,7 +124,7 @@ class SessionTaskStatus(enum.Enum): 2: SessionTaskStatus.Running, 4: SessionTaskStatus.Failed, 5: SessionTaskStatus.Terminated, - 6: SessionTaskStatus.Cancelled + 6: SessionTaskStatus.Cancelled, } @@ -126,7 +136,7 @@ def _get_session_failure_info(task_results): try: taskname, result_txt = list(task_results.items())[0] except BaseException: - return '' + return "" return result_txt @@ -137,11 +147,13 @@ class SessionInstance(Instance): Further SQL tasks has to be created using this instance. """ - __slots__ = ('_project', '_task_name', '_session_name') + __slots__ = ("_project", "_task_name", "_session_name") def __init__(self, **kw): - if 'session_task_name' not in kw or 'session_project' not in kw: - raise errors.InvalidArgument("Creating InSessionInstance without enough information.") + if "session_task_name" not in kw or "session_project" not in kw: + raise errors.InvalidArgument( + "Creating InSessionInstance without enough information." 
+ ) self._task_name = kw.pop("session_task_name", "") self._project = kw.pop("session_project", None) self._session_name = kw.pop("session_name", "") @@ -205,10 +217,11 @@ def _check_is_select(sql_statement): splited = utils.split_sql_by_semicolon(sql_statement) except Exception as ex: warnings.warn( - "Cannot split sql statement %s: %s" % (sql_statement, str(ex)), RuntimeWarning + "Cannot split sql statement %s: %s" % (sql_statement, str(ex)), + RuntimeWarning, ) return False - return splited[-1].lower().strip(' \t\r\n(').startswith("select") + return splited[-1].lower().strip(" \t\r\n(").startswith("select") def _create_internal_instance(self, task=None): project_name = self._project.name @@ -220,7 +233,7 @@ def _create_internal_instance(self, task=None): rquery = rquery + ";" query_object = { "query": rquery, - "settings": json.loads(task.properties['settings']) + "settings": json.loads(task.properties["settings"]), } query_json = json.dumps(query_object) @@ -234,7 +247,8 @@ def _create_internal_instance(self, task=None): query_status = query_result["status"] if query_status != "ok": raise errors.ODPSError( - 'Failed to run subquery: [' + query_status + "]: " + query_result["result"] + "Failed to run subquery: [%s]: %s" + % (query_status, query_result["result"]) ) query_subresult = json.loads(query_result["result"]) created_subquery_id = query_subresult["queryId"] @@ -243,18 +257,24 @@ def _create_internal_instance(self, task=None): except KeyError as ex: six.raise_from( errors.ODPSError( - "Invalid Response Format: %s\n Response JSON:%s\n" % (str(ex), resp_content.decode()) - ), None + "Invalid Response Format: %s\n Response JSON:%s\n" + % (str(ex), resp_content.decode()) + ), + None, ) instance = InSessionInstance( - session_project_name=project_name, session_task_name=self._task_name, - name=self.id, session_subquery_id=created_subquery_id, - session_instance=self, parent=self.parent, session_is_select=is_select, + session_project_name=project_name, + session_task_name=self._task_name, + name=self.id, + session_subquery_id=created_subquery_id, + session_instance=self, + parent=self.parent, + session_is_select=is_select, client=self._client, ) return instance - def reload(self): + def reload(self, blocking=False): resp_text = self.get_task_info(self._task_name, "status") session_status = SessionTaskStatus.Unknown try: @@ -311,14 +331,16 @@ def status(self): self._download_session.reload() return self._download_session.status - def read(self, start=None, count=None, step=None, - compress=False, columns=None): + def read( + self, start=None, count=None, step=None, compress=False, columns=None, **kw + ): start = start or 0 step = step or 1 stop = None if count is None else start + step * count with self._download_session.open_record_reader( - 0, 1, compress=compress, columns=columns) as reader: + 0, 1, compress=compress, columns=columns + ) as reader: for record in itertools.islice(reader, start, stop, step): yield record @@ -333,35 +355,47 @@ class InSessionInstanceRecordReader(InSessionTunnelReaderMixin, InstanceRecordRe class InSessionInstance(Instance): """ - This represents the instance created - for SQL tasks that run inside a session. This instance is useful - when fetching results. + This represents the instance created + for SQL tasks that run inside a session. This instance is useful + when fetching results. 
""" __slots__ = ( - '_project_name', '_session_task_name', '_session', '_session_instance', - '_is_select', '_subquery_id', '_report_result', '_report_warning', - '_session_task_status', + "_project_name", + "_session_task_name", + "_session", + "_session_instance", + "_is_select", + "_subquery_id", + "_report_result", + "_report_warning", + "_session_task_status", + "_task_data", ) def __init__(self, **kw): if ( - 'session_task_name' not in kw - or 'session_project_name' not in kw - or 'session_instance' not in kw - or 'session_subquery_id' not in kw + "session_task_name" not in kw + or "session_project_name" not in kw + or "session_instance" not in kw + or "session_subquery_id" not in kw ): - raise errors.InvalidArgument("Creating InSessionInstance without enough information.") + raise errors.InvalidArgument( + "Creating InSessionInstance without enough information." + ) self._session_task_name = kw.pop("session_task_name", "") self._project_name = kw.pop("session_project_name", "") self._session_instance = kw.pop("session_instance", None) self._is_select = kw.pop("session_is_select", False) self._subquery_id = kw.pop("session_subquery_id", -1) - self._report_result = '' - self._report_warning = '' + self._report_result = "" + self._report_warning = "" self._session_task_status = -1 + self._task_data = None if self._subquery_id < 0: - raise errors.InternalServerError("Subquery id not legal: " + str(self._subquery_id)) + raise errors.InternalServerError( + "Subquery id not legal: %s" % self._subquery_id + ) super(InSessionInstance, self).__init__(**kw) @property @@ -379,30 +413,35 @@ def _open_result_reader(self, schema=None, task_name=None, **kwargs): self.reload() if not self.is_successful(retry=True): raise errors.ODPSError( - 'Cannot open reader, instance(%s) may fail or has not finished yet' % self.id) + "Cannot open reader, instance(%s) may fail or has not finished yet" + % self.id + ) return readers.CsvRecordReader(schema, self._report_result) + def _wait_subquery_id_ready(self): + while self._subquery_id == -1 and self._status != Instance.Status.TERMINATED: + self.reload() + if self._subquery_id == -1: + raise errors.InternalServerError("SubQueryId not returned by the server.") + def _open_tunnel_reader(self, **kw): if not self._is_select: raise errors.InstanceTypeNotSupported( "InstanceTunnel cannot be opened at a non-select SQL Task." 
) - while (self._subquery_id == -1) and (self._status != Instance.Status.TERMINATED): - self.reload() - - if self._subquery_id == -1: - raise errors.InternalServerError("SubQueryId not returned by the server.") + self._wait_subquery_id_ready() - kw.pop('reopen', False) + kw.pop("reopen", False) arrow = kw.pop("arrow", False) - endpoint = kw.pop('endpoint', None) - kw['sessional'] = True - kw['session_subquery_id'] = self._subquery_id - if 'session_task_name' not in kw: - kw['session_task_name'] = self._session_task_name + endpoint = kw.pop("endpoint", None) + quota_name = kw.pop("quota_name", None) + kw["sessional"] = True + kw["session_subquery_id"] = self._subquery_id + if "session_task_name" not in kw: + kw["session_task_name"] = self._session_task_name - tunnel = self._create_instance_tunnel(endpoint=endpoint) + tunnel = self._create_instance_tunnel(endpoint=endpoint, quota_name=quota_name) try: download_session = tunnel.create_download_session(instance=self, **kw) @@ -418,8 +457,10 @@ def _open_tunnel_reader(self, **kw): else: return InSessionInstanceRecordReader(self, download_session) - def reload(self): - resp_text = self.get_task_info(self._session_task_name, "result_" + str(self._subquery_id)) + def reload(self, blocking=False): + resp_text = self.get_task_info( + self._session_task_name, "result_%s" % self._subquery_id + ) try: query_result = json.loads(resp_text) query_status = query_result["status"] @@ -427,7 +468,9 @@ def reload(self): self._report_warning = query_result["warnings"] self._session_task_status = _task_status_value_to_enum(query_status) if self._session_task_status in ( - SessionTaskStatus.Terminated, SessionTaskStatus.Failed, SessionTaskStatus.Cancelled + SessionTaskStatus.Terminated, + SessionTaskStatus.Failed, + SessionTaskStatus.Cancelled, ): self._status = Instance.Status.TERMINATED elif self._session_task_status == SessionTaskStatus.Running: @@ -437,10 +480,11 @@ def reload(self): self._subquery_id = int(query_result.get("subQueryId", -1)) except BaseException as ex: raise errors.ODPSError( - "Invalid Response Format: %s\n Response JSON:%s\n" % (str(ex), resp_text) + "Invalid Response Format: %s\n Response JSON:%s\n" + % (str(ex), resp_text) ) - def is_successful(self, retry=False): + def is_successful(self, retry=False, retry_timeout=None): """ If the instance runs successfully. @@ -448,25 +492,37 @@ def is_successful(self, retry=False): :rtype: bool """ - if not self.is_terminated(retry=retry): + if not self.is_terminated(retry=retry, retry_timeout=retry_timeout): return False - if self._session_task_status in (SessionTaskStatus.Failed, SessionTaskStatus.Cancelled): + if self._session_task_status in ( + SessionTaskStatus.Failed, + SessionTaskStatus.Cancelled, + ): return False return True - def wait_for_success(self, interval=1, timeout=None, max_interval=None): + def wait_for_success( + self, interval=1, timeout=None, max_interval=None, blocking=True + ): """ Wait for instance to complete, and check if the instance is successful. :param interval: time interval to check :param max_interval: if specified, next check interval will be multiplied by 2 till max_interval is reached. + :param blocking: whether to block waiting at server side. Note that this option does + not affect client behavior. 
:param timeout: time :return: None :raise: :class:`odps.errors.ODPSError` if the instance failed """ - self.wait_for_completion(interval=interval, max_interval=max_interval, timeout=timeout) + self.wait_for_completion( + interval=interval, + max_interval=max_interval, + timeout=timeout, + blocking=blocking, + ) if not self.is_successful(retry=True): raise errors.parse_instance_error(self._report_result) @@ -494,3 +550,387 @@ def get_printable_result(self): if self.is_terminated() and not self.is_successful(): raise errors.parse_instance_error(self._report_result) return self._report_result + + def _get_sql_task(self): + resp_text_list = [None] + + def _load_task_data(): + resp_text_list[0] = self.get_task_info( + self._session_task_name, "sourcexml_%s" % self._subquery_id + ) + xml_data = json.loads(resp_text_list[0])["result"] + return tasks.SQLTask.parse(None, xml_data) + + if not self._task_data: + self._wait_subquery_id_ready() + try: + self._task_data = utils.call_with_retry(_load_task_data) + except BaseException as ex: + raise errors.ODPSError( + "Invalid Response Format: %s\n Response JSON:%s\n" + % (ex, resp_text_list[0]) + ) + return self._task_data + + def get_sql_query(self): + try: + return self._get_sql_task().query + except errors.ODPSError: + return None + + def _parse_subquery_id(self, job_name): + if not job_name: + return "" + match = _SUBQUERY_ID_PATTERN.match(job_name) + if match: + return match.group(1) + elif self.id in job_name: + return job_name.split(self.id, 1)[1].replace("_", "") + else: + return job_name + + def get_task_detail2(self, task_name=None, **kw): + assert task_name is None or task_name == self._session_task_name + self._wait_subquery_id_ready() + kw["subquery_id"] = "session_query_%d" % self._subquery_id + return super(InSessionInstance, self).get_task_detail2( + task_name=task_name, **kw + ) + + def _get_queueing_info(self, **kw): + self._wait_subquery_id_ready() + kw["subquery_id"] = "session_query_%d" % self._subquery_id + return super(InSessionInstance, self)._get_queueing_info(**kw) + + def get_logview_address(self, hours=None): + self._wait_subquery_id_ready() + subquery_suffix = "&subQuery=%s" % self.subquery_id + return ( + super(InSessionInstance, self).get_logview_address(hours=hours) + + subquery_suffix + ) + + +class SessionMethods(object): + @classmethod + @utils.deprecated( + "You no longer have to manipulate session instances to use MaxCompute QueryAcceleration. " + "Try `run_sql_interactive`." + ) + def attach_session(cls, odps, session_name, taskname=None, hints=None): + """ + Attach to an existing session. + + :param session_name: The session name. + :param taskname: The created sqlrt task name. If not provided, the default value is used. + Mostly doesn't matter, default works. + :return: A SessionInstance you may execute select tasks within. 
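+
+ :Example: a minimal sketch, assuming ``o`` is an initialized ``ODPS`` entry object and a session named ``my_session`` already exists; the session and table names are placeholders.
+
+ >>> session_inst = o.attach_session('my_session')  # placeholder session name
+ >>> inst = session_inst.run_sql('select * from test_table')  # placeholder table name
+ >>> inst.wait_for_success()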
+ """ + return cls._attach_mcqa_session( + odps, session_name, task_name=taskname, hints=hints + ) + + @classmethod + def _attach_mcqa_session(cls, odps, session_name=None, task_name=None, hints=None): + session_name = session_name or PUBLIC_SESSION_NAME + task_name = task_name or DEFAULT_TASK_NAME + + task = tasks.SQLRTTask(name=task_name) + task.update_sql_rt_settings(hints) + task.update_sql_rt_settings( + { + "odps.sql.session.share.id": session_name, + "odps.sql.submit.mode": "script", + } + ) + project = odps.get_project() + return project.instances.create( + task=task, session_project=project, session_name=session_name + ) + + @classmethod + @utils.deprecated( + "You no longer have to manipulate session instances to use MaxCompute QueryAcceleration. " + "Try `run_sql_interactive`." + ) + def default_session(cls, odps): + """ + Attach to the default session of your project. + + :return: A SessionInstance you may execute select tasks within. + """ + return cls._get_default_mcqa_session(odps, wait=False) + + @classmethod + def _get_default_mcqa_session( + cls, odps, session_name=None, hints=None, wait=True, service_startup_timeout=60 + ): + session_name = session_name or PUBLIC_SESSION_NAME + if odps._default_session is None: + odps._default_session = cls._attach_mcqa_session( + odps, session_name, hints=hints + ) + odps._default_session_name = session_name + if wait: + odps._default_session.wait_for_startup( + 0.1, service_startup_timeout, max_interval=1 + ) + return odps._default_session + + @classmethod + @utils.deprecated( + "You no longer have to manipulate session instances to use MaxCompute QueryAcceleration. " + "Try `run_sql_interactive`." + ) + def create_session( + cls, + odps, + session_worker_count, + session_worker_memory, + session_name=None, + worker_spare_span=None, + taskname=None, + hints=None, + ): + """ + Create session. + + :param session_worker_count: How much workers assigned to the session. + :param session_worker_memory: How much memory each worker consumes. + :param session_name: The session name. Not specifying to use its ID as name. + :param worker_spare_span: format "00-24", allocated workers will be reduced during this time. + Not specifying to disable this. + :param taskname: The created sqlrt task name. If not provided, the default value is used. + Mostly doesn't matter, default works. + :param hints: Extra hints provided to the session. Parameters of this method will override + certain hints. + :return: A SessionInstance you may execute select tasks within. 
+ """ + return cls._create_mcqa_session( + odps, + session_worker_count, + session_worker_memory, + session_name, + worker_spare_span, + taskname, + hints, + ) + + @classmethod + def _create_mcqa_session( + cls, + odps, + session_worker_count, + session_worker_memory, + session_name=None, + worker_spare_span=None, + task_name=None, + hints=None, + ): + if not task_name: + task_name = DEFAULT_TASK_NAME + + session_hints = { + "odps.sql.session.worker.count": str(session_worker_count), + "odps.sql.session.worker.memory": str(session_worker_memory), + "odps.sql.submit.mode": "script", + } + if session_name: + session_hints["odps.sql.session.name"] = session_name + if worker_spare_span: + session_hints["odps.sql.session.worker.sparespan"] = worker_spare_span + task = tasks.SQLRTTask(name=task_name) + task.update_sql_rt_settings(hints) + task.update_sql_rt_settings(session_hints) + project = odps.get_project() + return project.instances.create( + task=task, session_project=project, session_name=session_name + ) + + @classmethod + def _get_mcqa_session_file(cls, odps): + try: + dir_name = utils.build_pyodps_dir() + if not os.path.exists(dir_name): + os.makedirs(dir_name) + expire_time = time.time() - _SESSION_FILE_EXPIRE_TIME + for session_file in glob.glob( + os.path.join(dir_name, _SESSION_FILE_PREFIX + "*") + ): + if os.path.getctime(session_file) < expire_time: + try: + os.unlink(session_file) + except OSError: + pass + access_id_digest = hashlib.md5( + utils.to_binary(odps.account.access_id) + ).hexdigest() + return os.path.join(dir_name, _SESSION_FILE_PREFIX + access_id_digest) + except: + return None + + @classmethod + def run_sql_interactive(cls, odps, sql, hints=None, **kwargs): + """ + Run SQL query in interactive mode (a.k.a MaxCompute QueryAcceleration). + Won't fallback to offline mode automatically if query not supported or fails + + :param sql: the sql query. + :param hints: settings for sql query. + :return: instance. 
+ """ + cached_is_running = False + service_name = kwargs.pop("service_name", PUBLIC_SESSION_NAME) + task_name = kwargs.pop("task_name", None) + service_startup_timeout = kwargs.pop("service_startup_timeout", 60) + force_reattach = kwargs.pop("force_reattach", False) + + session_file_name = cls._get_mcqa_session_file(odps) + if ( + odps._default_session is None + and session_file_name + and os.path.exists(session_file_name) + ): + try: + with open(session_file_name, "r") as session_file: + session_info = json.loads(session_file.read()) + instance_obj = odps.get_instance(session_info.pop("id")) + session_project = odps.get_project( + session_info.pop("session_project_name") + ) + odps._default_session_name = session_info["session_name"] + odps._default_session = SessionInstance.from_instance( + instance_obj, session_project=session_project, **session_info + ) + except: + pass + + if odps._default_session is not None: + try: + cached_is_running = odps._default_session.is_running() + except: + pass + if ( + force_reattach + or not cached_is_running + or odps._default_session_name != service_name + ): + # should reattach, for whatever reason (timed out, terminated, never created, + # forced using another session) + odps._default_session = cls._attach_mcqa_session( + odps, service_name, task_name=task_name + ) + odps._default_session.wait_for_startup( + 0.1, service_startup_timeout, max_interval=1 + ) + odps._default_session_name = service_name + + if session_file_name: + try: + with open(session_file_name, "w") as session_file: + session_file.write( + json.dumps(odps._default_session._extract_json_info()) + ) + except: + pass + return odps._default_session.run_sql(sql, hints, **kwargs) + + @classmethod + @utils.deprecated( + "The method `run_sql_interactive_with_fallback` is deprecated. " + "Try `execute_sql_interactive` with fallback=True argument instead." + ) + def run_sql_interactive_with_fallback(cls, odps, sql, hints=None, **kwargs): + return cls.execute_sql_interactive( + odps, sql, hints=hints, fallback="all", wait_fallback=False, **kwargs + ) + + @classmethod + def execute_sql_interactive( + cls, + odps, + sql, + hints=None, + fallback=True, + wait_fallback=True, + offline_quota_name=None, + **kwargs + ): + """ + Run SQL query in interactive mode (a.k.a MaxCompute QueryAcceleration). + If query is not supported or fails, and fallback is True, + will fallback to offline mode automatically + + :param sql: the sql query. + :param hints: settings for sql query. + :param fallback: fallback query to non-interactive mode, True by default. + Both boolean type and policy names separated by commas are acceptable. + :param bool wait_fallback: wait fallback instance to finish, True by default. + :return: instance. 
+ """ + if isinstance(fallback, (six.string_types, set, list, tuple)): + fallback_policy = FallbackPolicy(fallback) + elif fallback is False: + fallback_policy = None + elif fallback is True: + fallback_policy = FallbackPolicy("all") + else: + assert isinstance(fallback, FallbackPolicy) + fallback_policy = fallback + + inst = None + use_tunnel = kwargs.pop("tunnel", True) + fallback_callback = kwargs.pop("fallback_callback", None) + offline_hints = kwargs.pop("offline_hints", None) or {} + try: + inst = cls.run_sql_interactive(odps, sql, hints=hints, **kwargs) + inst.wait_for_success(interval=0.1, max_interval=1) + try: + rd = inst.open_reader(tunnel=use_tunnel, limit=True) + if not rd: + raise errors.ODPSError("Get sql result fail") + except errors.InstanceTypeNotSupported: + # sql is not a select, just skip creating reader + pass + return inst + except BaseException as ex: + if fallback_policy is None: + raise + fallback_mode = fallback_policy.get_mode_from_exception(ex) + if fallback_mode is None: + raise + elif fallback_mode == FallbackMode.INTERACTIVE: + kwargs["force_reattach"] = True + return cls.execute_sql_interactive( + odps, + sql, + hints=hints, + fallback=fallback, + wait_fallback=wait_fallback, + **kwargs + ) + else: + kwargs.pop("service_name", None) + kwargs.pop("force_reattach", None) + kwargs.pop("service_startup_timeout", None) + hints = copy.copy(offline_hints or hints or {}) + hints["odps.task.sql.sqa.enable"] = "false" + + if fallback_callback is not None: + fallback_callback(inst, ex) + + if inst is not None: + hints["odps.sql.session.fallback.instance"] = "%s_%s" % ( + inst.id, + inst.subquery_id, + ) + else: + hints[ + "odps.sql.session.fallback.instance" + ] = "fallback4AttachFailed" + inst = odps.execute_sql( + sql, hints=hints, quota_name=offline_quota_name, **kwargs + ) + if wait_fallback: + inst.wait_for_success() + return inst diff --git a/odps/models/storage_tier.py b/odps/models/storage_tier.py index f9022c70..2d7f4276 100644 --- a/odps/models/storage_tier.py +++ b/odps/models/storage_tier.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -27,7 +27,7 @@ class StorageTier(Enum): class StorageTierInfo(serializers.JSONSerializableModel): - __slots__ = "storage_size", + __slots__ = ("storage_size",) _storage_tier_size_names = { StorageTier.STANDARD: "StandardSize", @@ -42,10 +42,10 @@ class StorageTierInfo(serializers.JSONSerializableModel): } storage_tier = serializers.JSONNodeField( - 'StorageTier', parse_callback=lambda x: StorageTier(x.lower()) if x else None + "StorageTier", parse_callback=lambda x: StorageTier(x.lower()) if x else None ) last_modified_time = serializers.JSONNodeField( - 'StorageLastModifiedTime', + "StorageLastModifiedTime", parse_callback=lambda x: datetime.datetime.fromtimestamp(int(x)), ) @@ -56,7 +56,10 @@ def __init__(self, **kwargs): @classmethod def deserial(cls, content, obj=None, **kw): res = super(StorageTierInfo, cls).deserial(content, obj=obj, **kw) - for key_defs in (cls._storage_tier_size_names, cls._storage_tier_charge_size_names): + for key_defs in ( + cls._storage_tier_size_names, + cls._storage_tier_charge_size_names, + ): for tier, key in key_defs.items(): if key not in content: continue diff --git a/odps/models/table.py b/odps/models/table.py index 17cf3a7a..ce9111ce 100644 --- a/odps/models/table.py +++ b/odps/models/table.py @@ -22,11 +22,13 @@ except (AttributeError, ImportError): pa = None -from .. import types as odps_types, serializers, utils, readers -from ..compat import dir2, six, Enum +from .. import readers, serializers +from .. import types as odps_types +from .. import utils +from ..compat import Enum, dir2, six from ..config import options from .cluster_info import ClusterInfo, ClusterType -from .core import LazyLoad, JSONRemoteModel +from .core import JSONRemoteModel, LazyLoad from .partitions import Partitions from .record import Record from .storage_tier import StorageTier, StorageTierInfo @@ -59,18 +61,19 @@ class TableSchema(odps_types.OdpsSchema, JSONRemoteModel): """ class Shard(JSONRemoteModel): - - hub_lifecycle = serializers.JSONNodeField('HubLifecycle') - shard_num = serializers.JSONNodeField('ShardNum') - distribute_cols = serializers.JSONNodeField('DistributeCols') - sort_cols = serializers.JSONNodeField('SortCols') + hub_lifecycle = serializers.JSONNodeField("HubLifecycle") + shard_num = serializers.JSONNodeField("ShardNum") + distribute_cols = serializers.JSONNodeField("DistributeCols") + sort_cols = serializers.JSONNodeField("SortCols") class TableColumn(odps_types.Column, JSONRemoteModel): - name = serializers.JSONNodeField('name') - type = serializers.JSONNodeField('type', parse_callback=odps_types.validate_data_type) - comment = serializers.JSONNodeField('comment') - label = serializers.JSONNodeField('label') - nullable = serializers.JSONNodeField('isNullable') + name = serializers.JSONNodeField("name") + type = serializers.JSONNodeField( + "type", parse_callback=odps_types.validate_data_type + ) + comment = serializers.JSONNodeField("comment") + label = serializers.JSONNodeField("label") + nullable = serializers.JSONNodeField("isNullable") def __init__(self, **kwargs): kwargs.setdefault("nullable", True) @@ -83,8 +86,8 @@ def __init__(self, **kwargs): TableSchema.TableColumn.__init__(self, **kwargs) def __init__(self, **kwargs): - kwargs['_columns'] = columns = kwargs.pop('columns', None) - kwargs['_partitions'] = partitions = kwargs.pop('partitions', None) + kwargs["_columns"] = columns = kwargs.pop("columns", None) + kwargs["_partitions"] = partitions = kwargs.pop("partitions", None) JSONRemoteModel.__init__(self, **kwargs) 
odps_types.OdpsSchema.__init__(self, columns=columns, partitions=partitions) @@ -92,60 +95,68 @@ def load(self): self.update(self._columns, self._partitions) self.build_snapshot() - comment = serializers.JSONNodeField('comment', set_to_parent=True) - owner = serializers.JSONNodeField('owner', set_to_parent=True) + comment = serializers.JSONNodeField("comment", set_to_parent=True) + owner = serializers.JSONNodeField("owner", set_to_parent=True) creation_time = serializers.JSONNodeField( - 'createTime', parse_callback=datetime.fromtimestamp, - set_to_parent=True) + "createTime", parse_callback=datetime.fromtimestamp, set_to_parent=True + ) last_data_modified_time = serializers.JSONNodeField( - 'lastModifiedTime', parse_callback=datetime.fromtimestamp, - set_to_parent=True) + "lastModifiedTime", parse_callback=datetime.fromtimestamp, set_to_parent=True + ) last_meta_modified_time = serializers.JSONNodeField( - 'lastDDLTime', parse_callback=datetime.fromtimestamp, - set_to_parent=True) + "lastDDLTime", parse_callback=datetime.fromtimestamp, set_to_parent=True + ) is_virtual_view = serializers.JSONNodeField( - 'isVirtualView', parse_callback=bool, set_to_parent=True) + "isVirtualView", parse_callback=bool, set_to_parent=True + ) is_materialized_view = serializers.JSONNodeField( - 'isMaterializedView', parse_callback=bool, set_to_parent=True) + "isMaterializedView", parse_callback=bool, set_to_parent=True + ) is_materialized_view_rewrite_enabled = serializers.JSONNodeField( - 'isMaterializedViewRewriteEnabled', + "isMaterializedViewRewriteEnabled", parse_callback=lambda x: x is not None and str(x).lower() == "true", set_to_parent=True, ) is_materialized_view_outdated = serializers.JSONNodeField( - 'isMaterializedViewOutdated', + "isMaterializedViewOutdated", parse_callback=lambda x: x is not None and str(x).lower() == "true", set_to_parent=True, ) lifecycle = serializers.JSONNodeField( - 'lifecycle', parse_callback=int, set_to_parent=True) - view_text = serializers.JSONNodeField('viewText', set_to_parent=True) - view_expanded_text = serializers.JSONNodeField('viewExpandedText', set_to_parent=True) + "lifecycle", parse_callback=int, set_to_parent=True + ) + view_text = serializers.JSONNodeField("viewText", set_to_parent=True) + view_expanded_text = serializers.JSONNodeField( + "viewExpandedText", set_to_parent=True + ) size = serializers.JSONNodeField("size", parse_callback=int, set_to_parent=True) is_archived = serializers.JSONNodeField( - 'IsArchived', parse_callback=bool, set_to_parent=True) + "IsArchived", parse_callback=bool, set_to_parent=True + ) physical_size = serializers.JSONNodeField( - 'PhysicalSize', parse_callback=int, set_to_parent=True) + "PhysicalSize", parse_callback=int, set_to_parent=True + ) file_num = serializers.JSONNodeField( - 'FileNum', parse_callback=int, set_to_parent=True) + "FileNum", parse_callback=int, set_to_parent=True + ) record_num = serializers.JSONNodeField( - 'recordNum', parse_callback=int, set_to_parent=True) - location = serializers.JSONNodeField( - 'location', set_to_parent=True) - storage_handler = serializers.JSONNodeField( - 'storageHandler', set_to_parent=True) - resources = serializers.JSONNodeField( - 'resources', set_to_parent=True) + "recordNum", parse_callback=int, set_to_parent=True + ) + location = serializers.JSONNodeField("location", set_to_parent=True) + storage_handler = serializers.JSONNodeField("storageHandler", set_to_parent=True) + resources = serializers.JSONNodeField("resources", set_to_parent=True) serde_properties = 
serializers.JSONNodeField( - 'serDeProperties', type='json', set_to_parent=True) - reserved = serializers.JSONNodeField( - 'Reserved', type='json', set_to_parent=True) + "serDeProperties", type="json", set_to_parent=True + ) + reserved = serializers.JSONNodeField("Reserved", type="json", set_to_parent=True) shard = serializers.JSONNodeReferenceField( - Shard, 'shardInfo', check_before=['shardExist'], set_to_parent=True) + Shard, "shardInfo", check_before=["shardExist"], set_to_parent=True + ) table_label = serializers.JSONNodeField( - 'tableLabel', callback=lambda t: t if t != '0' else '', set_to_parent=True) - _columns = serializers.JSONNodesReferencesField(TableColumn, 'columns') - _partitions = serializers.JSONNodesReferencesField(TablePartition, 'partitionKeys') + "tableLabel", callback=lambda t: t if t != "0" else "", set_to_parent=True + ) + _columns = serializers.JSONNodesReferencesField(TableColumn, "columns") + _partitions = serializers.JSONNodesReferencesField(TablePartition, "partitionKeys") def __getstate__(self): return self._columns, self._partitions @@ -191,17 +202,37 @@ class Table(LazyLoad): >>> writer.write(0, gen_records(block=0)) >>> writer.write(1, gen_records(block=1)) # we can do this parallel """ + _extend_args = ( - 'is_archived', 'physical_size', 'file_num', 'location', 'storage_handler', - 'resources', 'serde_properties', 'reserved', 'is_transactional', - 'primary_key', 'storage_tier_info', 'cluster_info' + "is_archived", + "physical_size", + "file_num", + "location", + "storage_handler", + "resources", + "serde_properties", + "reserved", + "is_transactional", + "primary_key", + "storage_tier_info", + "cluster_info", + "acid_data_retain_hours", ) __slots__ = ( - '_is_extend_info_loaded', 'last_meta_modified_time', 'is_virtual_view', - 'is_materialized_view', 'is_materialized_view_rewrite_enabled', - 'is_materialized_view_outdated', 'lifecycle', 'view_text', - 'view_expanded_text', 'size', 'shard', 'record_num', - '_table_tunnel', '_id_thread_local' + "_is_extend_info_loaded", + "last_meta_modified_time", + "is_virtual_view", + "is_materialized_view", + "is_materialized_view_rewrite_enabled", + "is_materialized_view_outdated", + "lifecycle", + "view_text", + "view_expanded_text", + "size", + "shard", + "record_num", + "_table_tunnel", + "_id_thread_local", ) __slots__ += _extend_args @@ -212,38 +243,39 @@ class Type(Enum): VIRTUAL_VIEW = "VIRTUAL_VIEW" MATERIALIZED_VIEW = "MATERIALIZED_VIEW" - name = serializers.XMLNodeField('Name') - table_id = serializers.XMLNodeField('TableId') - format = serializers.XMLNodeAttributeField(attr='format') - table_schema = serializers.XMLNodeReferenceField(TableSchema, 'Schema') - comment = serializers.XMLNodeField('Comment') - owner = serializers.XMLNodeField('Owner') - table_label = serializers.XMLNodeField('TableLabel') + name = serializers.XMLNodeField("Name") + table_id = serializers.XMLNodeField("TableId") + format = serializers.XMLNodeAttributeField(attr="format") + table_schema = serializers.XMLNodeReferenceField(TableSchema, "Schema") + comment = serializers.XMLNodeField("Comment") + owner = serializers.XMLNodeField("Owner") + table_label = serializers.XMLNodeField("TableLabel") creation_time = serializers.XMLNodeField( - 'CreationTime', parse_callback=utils.parse_rfc822 + "CreationTime", parse_callback=utils.parse_rfc822 ) last_data_modified_time = serializers.XMLNodeField( - 'LastModifiedTime', parse_callback=utils.parse_rfc822 + "LastModifiedTime", parse_callback=utils.parse_rfc822 ) last_access_time = 
serializers.XMLNodeField( - 'LastAccessTime', parse_callback=utils.parse_rfc822 + "LastAccessTime", parse_callback=utils.parse_rfc822 ) type = serializers.XMLNodeField( - 'Type', parse_callback=lambda s: Table.Type(s.upper()) if s is not None else None + "Type", + parse_callback=lambda s: Table.Type(s.upper()) if s is not None else None, ) - _download_ids = utils.thread_local_attribute('_id_thread_local', dict) - _upload_ids = utils.thread_local_attribute('_id_thread_local', dict) + _download_ids = utils.thread_local_attribute("_id_thread_local", dict) + _upload_ids = utils.thread_local_attribute("_id_thread_local", dict) def __init__(self, **kwargs): self._is_extend_info_loaded = False - if 'schema' in kwargs: + if "schema" in kwargs: warnings.warn( - 'Argument schema is deprecated and will be replaced by table_schema.', + "Argument schema is deprecated and will be replaced by table_schema.", DeprecationWarning, stacklevel=2, ) - kwargs['table_schema'] = kwargs.pop('schema') + kwargs["table_schema"] = kwargs.pop("schema") super(Table, self).__init__(**kwargs) try: @@ -271,9 +303,9 @@ def table_resource(self, client=None, endpoint=None, force_schema=False): def full_table_name(self): schema_name = self._get_schema_name() if schema_name is None: - return '{0}.`{1}`'.format(self.project.name, self.name) + return "{0}.`{1}`".format(self.project.name, self.name) else: - return '{0}.{1}.`{2}`'.format(self.project.name, schema_name, self.name) + return "{0}.{1}.`{2}`".format(self.project.name, schema_name, self.name) def reload(self): url = self.resource() @@ -291,11 +323,13 @@ def reset(self): @property def schema(self): warnings.warn( - 'Table.schema is deprecated and will be replaced by Table.table_schema.', + "Table.schema is deprecated and will be replaced by Table.table_schema.", DeprecationWarning, stacklevel=3, ) - utils.add_survey_call(".".join([type(self).__module__, type(self).__name__, "schema"])) + utils.add_survey_call( + ".".join([type(self).__module__, type(self).__name__, "schema"]) + ) return self.table_schema @property @@ -306,9 +340,9 @@ def last_modified_time(self): DeprecationWarning, stacklevel=3, ) - utils.add_survey_call(".".join( - [type(self).__module__, type(self).__name__, "last_modified_time"] - )) + utils.add_survey_call( + ".".join([type(self).__module__, type(self).__name__, "last_modified_time"]) + ) return self.last_data_modified_time def _parse_reserved(self): @@ -317,21 +351,25 @@ def _parse_reserved(self): self.primary_key = None self.storage_tier_info = None self.cluster_info = None + self.acid_data_retain_hours = -1 return is_transactional = self.reserved.get("Transactional") self.is_transactional = ( is_transactional is not None and is_transactional.lower() == "true" ) - self.primary_key = self.reserved.get('PrimaryKey') + self.primary_key = self.reserved.get("PrimaryKey") self.storage_tier_info = StorageTierInfo.deserial(self.reserved) self.cluster_info = ClusterInfo.deserial(self.reserved) + self.acid_data_retain_hours = int( + self.reserved.get("acid.data.retain.hours", "-1") + ) def reload_extend_info(self): params = {} schema_name = self._get_schema_name() if schema_name is not None: - params['curr_schema'] = schema_name - resp = self._client.get(self.resource(), action='extended', params=params) + params["curr_schema"] = schema_name + resp = self._client.get(self.resource(), action="extended", params=params) self.parse(self._client, resp, obj=self) self._is_extend_info_loaded = True @@ -349,7 +387,7 @@ def __getattribute__(self, attr): val = 
object.__getattribute__(self, attr) if val is None and not self._loaded: - if attr in getattr(TableSchema, '__fields'): + if attr in getattr(TableSchema, "__fields"): self.reload() return object.__getattribute__(self, attr) @@ -358,55 +396,75 @@ def __getattribute__(self, attr): def _repr(self): buf = six.StringIO() - buf.write('odps.Table\n') - buf.write(' name: {0}\n'.format(self.full_table_name)) + buf.write("odps.Table\n") + buf.write(" name: {0}\n".format(self.full_table_name)) if self.type: - buf.write(' type: {0}\n'.format(self.type.value)) + buf.write(" type: {0}\n".format(self.type.value)) name_space = 2 * max(len(col.name) for col in self.table_schema.columns) type_space = 2 * max(len(repr(col.type)) for col in self.table_schema.columns) not_empty = lambda field: field is not None and len(field.strip()) > 0 - buf.write(' schema:\n') + buf.write(" schema:\n") cols_strs = [] for col in self.table_schema._columns: - cols_strs.append('{0}: {1}{2}'.format( - col.name.ljust(name_space), - repr(col.type).ljust(type_space), - '# {0}'.format(utils.to_str(col.comment)) if not_empty(col.comment) else '' - )) - buf.write(utils.indent('\n'.join(cols_strs), 4)) - buf.write('\n') + cols_strs.append( + "{0}: {1}{2}".format( + col.name.ljust(name_space), + repr(col.type).ljust(type_space), + "# {0}".format(utils.to_str(col.comment)) + if not_empty(col.comment) + else "", + ) + ) + buf.write(utils.indent("\n".join(cols_strs), 4)) + buf.write("\n") if self.table_schema.partitions: - buf.write(' partitions:\n') + buf.write(" partitions:\n") partition_strs = [] for partition in self.table_schema.partitions: - partition_strs.append('{0}: {1}{2}'.format( - partition.name.ljust(name_space), - repr(partition.type).ljust(type_space), - '# {0}'.format(utils.to_str(partition.comment)) if not_empty(partition.comment) else '' - )) - buf.write(utils.indent('\n'.join(partition_strs), 4)) + partition_strs.append( + "{0}: {1}{2}".format( + partition.name.ljust(name_space), + repr(partition.type).ljust(type_space), + "# {0}".format(utils.to_str(partition.comment)) + if not_empty(partition.comment) + else "", + ) + ) + buf.write(utils.indent("\n".join(partition_strs), 4)) if self.view_text: - buf.write(' view text:\n{0}'.format(utils.indent(self.view_text, 4))) + buf.write(" view text:\n{0}".format(utils.indent(self.view_text, 4))) return buf.getvalue() @property def stored_as(self): - return (self.reserved or dict()).get('StoredAs') + return (self.reserved or dict()).get("StoredAs") @classmethod def gen_create_table_sql( - cls, table_name, table_schema, comment=None, if_not_exists=False, - lifecycle=None, shard_num=None, hub_lifecycle=None, - with_column_comments=True, transactional=False, primary_key=None, - storage_tier=None, project=None, schema=None, table_type=None, - view_text=None, **kw + cls, + table_name, + table_schema, + comment=None, + if_not_exists=False, + lifecycle=None, + shard_num=None, + hub_lifecycle=None, + with_column_comments=True, + transactional=False, + primary_key=None, + storage_tier=None, + project=None, + schema=None, + table_type=None, + view_text=None, + **kw ): from ..utils import escape_odps_string @@ -418,34 +476,38 @@ def gen_create_table_sql( view_text = utils.to_text(view_text) table_type = cls.Type(table_type or cls.Type.MANAGED_TABLE) is_view = table_type in (cls.Type.VIRTUAL_VIEW, cls.Type.MATERIALIZED_VIEW) - primary_key = [primary_key] if isinstance(primary_key, six.string_types) else primary_key + primary_key = ( + [primary_key] if isinstance(primary_key, 
six.string_types) else primary_key + ) - stored_as = kw.get('stored_as') - external_stored_as = kw.get('external_stored_as') - storage_handler = kw.get('storage_handler') + stored_as = kw.get("stored_as") + external_stored_as = kw.get("external_stored_as") + storage_handler = kw.get("storage_handler") table_properties = kw.get("table_properties") or {} - cluster_info = kw.get('cluster_info') + cluster_info = kw.get("cluster_info") - rewrite_enabled = kw.get('rewrite_enabled') + rewrite_enabled = kw.get("rewrite_enabled") rewrite_enabled = rewrite_enabled if rewrite_enabled is not None else True if table_type == cls.Type.EXTERNAL_TABLE: - type_str = u'EXTERNAL TABLE' + type_str = u"EXTERNAL TABLE" elif table_type == cls.Type.VIRTUAL_VIEW: - type_str = u'VIEW' + type_str = u"VIEW" elif table_type == cls.Type.MATERIALIZED_VIEW: - type_str = u'MATERIALIZED VIEW' + type_str = u"MATERIALIZED VIEW" else: - type_str = u'EXTERNAL TABLE' if storage_handler or external_stored_as else u'TABLE' + type_str = ( + u"EXTERNAL TABLE" if storage_handler or external_stored_as else u"TABLE" + ) - buf.write(u'CREATE %s ' % type_str) + buf.write(u"CREATE %s " % type_str) if if_not_exists: - buf.write(u'IF NOT EXISTS ') + buf.write(u"IF NOT EXISTS ") if project is not None: - buf.write(u'%s.' % project) + buf.write(u"%s." % project) if schema is not None: - buf.write(u'%s.' % schema) - buf.write(u'`%s` ' % table_name) + buf.write(u"%s." % schema) + buf.write(u"`%s` " % table_name) if is_view and lifecycle is not None and lifecycle > 0: buf.write("LIFECYCLE %s " % lifecycle) @@ -455,52 +517,58 @@ def _write_primary_key(prev=""): return if not prev.strip().endswith(","): buf.write(u",\n") - buf.write(u" PRIMARY KEY (%s)" % ", ".join("`%s`" % c for c in primary_key)) + buf.write( + u" PRIMARY KEY (%s)" % ", ".join("`%s`" % c for c in primary_key) + ) if isinstance(table_schema, six.string_types): - buf.write(u'(\n') + buf.write(u"(\n") buf.write(table_schema) _write_primary_key(table_schema) - buf.write(u'\n)\n') + buf.write(u"\n)\n") if comment: buf.write(u"COMMENT '%s'\n" % escape_odps_string(comment)) elif isinstance(table_schema, tuple): - buf.write(u'(\n') + buf.write(u"(\n") buf.write(table_schema[0]) _write_primary_key(table_schema[0]) - buf.write(u'\n)\n') + buf.write(u"\n)\n") if comment: buf.write(u"COMMENT '%s'\n" % escape_odps_string(comment)) - buf.write(u'PARTITIONED BY ') - buf.write(u'(\n') + buf.write(u"PARTITIONED BY ") + buf.write(u"(\n") buf.write(table_schema[1]) - buf.write(u'\n)\n') + buf.write(u"\n)\n") else: + def write_columns(col_array, with_pk=False): size = len(col_array) - buf.write(u'(\n') + buf.write(u"(\n") for idx, column in enumerate(col_array): - buf.write(u' `%s` %s' % (utils.to_text(column.name), utils.to_text(column.type))) + buf.write( + u" `%s` %s" + % (utils.to_text(column.name), utils.to_text(column.type)) + ) if not column.nullable and not options.sql.ignore_fields_not_null: buf.write(u" NOT NULL") if with_column_comments and column.comment: buf.write(u" COMMENT '%s'" % utils.to_text(column.comment)) if idx < size - 1: - buf.write(u',\n') + buf.write(u",\n") if with_pk: _write_primary_key() - buf.write(u'\n)\n') + buf.write(u"\n)\n") def write_view_columns(col_array): size = len(col_array) - buf.write(u'(\n') + buf.write(u"(\n") for idx, column in enumerate(col_array): - buf.write(u' `%s`' % (utils.to_text(column.name))) + buf.write(u" `%s`" % (utils.to_text(column.name))) if with_column_comments and column.comment: buf.write(u" COMMENT '%s'" % 
utils.to_text(column.comment)) if idx < size - 1: - buf.write(u',\n') - buf.write(u'\n)\n') + buf.write(u",\n") + buf.write(u"\n)\n") if not is_view: write_columns(table_schema.simple_columns, with_pk=True) @@ -510,13 +578,13 @@ def write_view_columns(col_array): if comment: buf.write(u"COMMENT '%s'\n" % comment) if table_type == cls.Type.MATERIALIZED_VIEW and not rewrite_enabled: - buf.write(u'DISABLE REWRITE\n') + buf.write(u"DISABLE REWRITE\n") if table_schema.partitions: if not is_view: - buf.write(u'PARTITIONED BY ') + buf.write(u"PARTITIONED BY ") write_columns(table_schema.partitions) else: - buf.write(u'PARTITIONED ON ') + buf.write(u"PARTITIONED ON ") write_view_columns(table_schema.partitions) if cluster_info is not None: @@ -524,7 +592,9 @@ def write_view_columns(col_array): cluster_type_str = u"RANGE " else: cluster_type_str = u"" - cluster_cols = u", ".join(u"`%s`" % col for col in cluster_info.cluster_cols) + cluster_cols = u", ".join( + u"`%s`" % col for col in cluster_info.cluster_cols + ) buf.write("%sCLUSTERED BY (%s)" % (cluster_type_str, cluster_cols)) if cluster_info.sort_cols: sort_cols = u", ".join( @@ -539,31 +609,39 @@ def write_view_columns(col_array): table_properties["transactional"] = "true" if storage_tier: if isinstance(storage_tier, six.string_types): - storage_tier = StorageTier(utils.underline_to_camel(storage_tier).lower()) + storage_tier = StorageTier( + utils.underline_to_camel(storage_tier).lower() + ) table_properties["storagetier"] = storage_tier.value if table_properties: buf.write(u"TBLPROPERTIES (\n") for k, v in table_properties.items(): buf.write(u' "%s"="%s"' % (k, v)) - buf.write(u'\n)\n') + buf.write(u"\n)\n") - serde_properties = kw.get('serde_properties') - location = kw.get('location') - resources = kw.get('resources') + serde_properties = kw.get("serde_properties") + location = kw.get("location") + resources = kw.get("resources") if storage_handler or external_stored_as: if storage_handler: buf.write("STORED BY '%s'\n" % escape_odps_string(storage_handler)) else: buf.write("STORED AS %s\n" % escape_odps_string(external_stored_as)) if serde_properties: - buf.write('WITH SERDEPROPERTIES (\n') + buf.write("WITH SERDEPROPERTIES (\n") for idx, k in enumerate(serde_properties): - buf.write(" '%s' = '%s'" % (escape_odps_string(k), escape_odps_string(serde_properties[k]))) + buf.write( + " '%s' = '%s'" + % ( + escape_odps_string(k), + escape_odps_string(serde_properties[k]), + ) + ) if idx + 1 < len(serde_properties): - buf.write(',') - buf.write('\n') - buf.write(')\n') + buf.write(",") + buf.write("\n") + buf.write(")\n") if location: buf.write("LOCATION '%s'\n" % location) if resources: @@ -571,16 +649,16 @@ def write_view_columns(col_array): if stored_as: buf.write("STORED AS %s\n" % escape_odps_string(stored_as)) if not is_view and lifecycle is not None and lifecycle > 0: - buf.write(u'LIFECYCLE %s\n' % lifecycle) + buf.write(u"LIFECYCLE %s\n" % lifecycle) if shard_num is not None: - buf.write(u'INTO %s SHARDS' % shard_num) + buf.write(u"INTO %s SHARDS" % shard_num) if hub_lifecycle is not None: - buf.write(u' HUBLIFECYCLE %s\n' % hub_lifecycle) + buf.write(u" HUBLIFECYCLE %s\n" % hub_lifecycle) else: - buf.write(u'\n') + buf.write(u"\n") if is_view and view_text: - buf.write(u'AS %s\n' % view_text) + buf.write(u"AS %s\n" % view_text) return buf.getvalue().strip() def get_ddl(self, with_comments=True, if_not_exists=False, force_table_ddl=False): @@ -594,17 +672,29 @@ def get_ddl(self, with_comments=True, if_not_exists=False, 
force_table_ddl=False """ shard_num = self.shard.shard_num if self.shard is not None else None storage_tier = ( - self.storage_tier_info.storage_tier.value if self.storage_tier_info else None + self.storage_tier_info.storage_tier.value + if self.storage_tier_info + else None ) table_type = self.type if not force_table_ddl else self.Type.MANAGED_TABLE return self.gen_create_table_sql( - self.name, self.table_schema, self.comment if with_comments else None, - if_not_exists=if_not_exists, with_column_comments=with_comments, - lifecycle=self.lifecycle, shard_num=shard_num, project=self.project.name, - storage_handler=self.storage_handler, serde_properties=self.serde_properties, - location=self.location, resources=self.resources, table_type=table_type, - storage_tier=storage_tier, cluster_info=self.cluster_info, - transactional=self.is_transactional, primary_key=self.primary_key, + self.name, + self.table_schema, + self.comment if with_comments else None, + if_not_exists=if_not_exists, + with_column_comments=with_comments, + lifecycle=self.lifecycle, + shard_num=shard_num, + project=self.project.name, + storage_handler=self.storage_handler, + serde_properties=self.serde_properties, + location=self.location, + resources=self.resources, + table_type=table_type, + storage_tier=storage_tier, + cluster_info=self.cluster_info, + transactional=self.is_transactional, + primary_key=self.primary_key, view_text=self.view_text, rewrite_enabled=self.is_materialized_view_rewrite_enabled, ) @@ -612,34 +702,59 @@ def get_ddl(self, with_comments=True, if_not_exists=False, force_table_ddl=False @utils.survey def _head_by_data(self, limit, partition=None, columns=None, timeout=None): if limit <= 0: - raise ValueError('limit number should >= 0.') + raise ValueError("limit number should >= 0.") - params = {'linenum': limit} + params = {"linenum": limit} if partition is not None: if not isinstance(partition, odps_types.PartitionSpec): partition = odps_types.PartitionSpec(partition) - params['partition'] = str(partition) + params["partition"] = str(partition) if columns is not None and len(columns) > 0: - col_name = lambda col: col.name if isinstance(col, odps_types.Column) else col - params['cols'] = ','.join(col_name(col) for col in columns) + col_name = ( + lambda col: col.name if isinstance(col, odps_types.Column) else col + ) + params["cols"] = ",".join(col_name(col) for col in columns) schema_name = self._get_schema_name() if schema_name is not None: - params['schema_name'] = schema_name + params["schema_name"] = schema_name resp = self._client.get( - self.resource(), action='data', params=params, stream=True, timeout=timeout + self.resource(), action="data", params=params, stream=True, timeout=timeout ) return readers.CsvRecordReader(self.table_schema, resp) - def _head_by_preview(self, limit, partition=None, columns=None, compress_algo=None, timeout=None): + def _head_by_preview( + self, + limit, + partition=None, + columns=None, + compress_algo=None, + timeout=None, + tags=None, + ): table_tunnel = self._create_table_tunnel() return table_tunnel.open_preview_reader( - self, partition_spec=partition, columns=columns, limit=limit, - compress_algo=compress_algo, arrow=False, timeout=timeout, read_all=True + self, + partition_spec=partition, + columns=columns, + limit=limit, + compress_algo=compress_algo, + arrow=False, + timeout=timeout, + read_all=True, + tags=tags, ) - def head(self, limit, partition=None, columns=None, use_legacy=True, timeout=None): + def head( + self, + limit, + partition=None, + 
columns=None, + use_legacy=True, + timeout=None, + tags=None, + ): """ Get the head records of a table or its partition. @@ -653,9 +768,17 @@ def head(self, limit, partition=None, columns=None, use_legacy=True, timeout=Non """ try: if pa is not None and not use_legacy: - timeout = timeout if timeout is not None else options.tunnel.legacy_fallback_timeout + timeout = ( + timeout + if timeout is not None + else options.tunnel.legacy_fallback_timeout + ) return self._head_by_preview( - limit, partition=partition, columns=columns, timeout=timeout + limit, + partition=partition, + columns=columns, + timeout=timeout, + tags=tags, ) except: # only raises when under tests and @@ -691,6 +814,8 @@ def open_reader( columns=None, quota_name=None, async_mode=True, + append_partitions=None, + tags=None, **kw ): """ @@ -702,6 +827,7 @@ def open_reader( :param endpoint: the tunnel service URL :param download_id: use existing download_id to download table contents :param arrow: use arrow tunnel to read data + :param columns: columns to read :param quota_name: name of tunnel quota :param async_mode: enable async mode to create tunnels, can set True if session creation takes a long time. @@ -711,6 +837,8 @@ def open_reader( can be ``zlib``, ``snappy`` :param compress_level: used for ``zlib``, work when ``compress_option`` is not provided :param compress_strategy: used for ``zlib``, work when ``compress_option`` is not provided + :param bool append_partitions: if True, partition values will be + appended to the output :return: reader, ``count`` means the full size, ``status`` means the tunnel status :Example: @@ -732,22 +860,37 @@ def open_reader( download_id = download_ids.get(partition) if not reopen else None download_session = utils.call_with_retry( tunnel.create_download_session, - table=self, partition_spec=partition, download_id=download_id, - timeout=timeout, async_mode=async_mode, **kw + table=self, + partition_spec=partition, + download_id=download_id, + timeout=timeout, + async_mode=async_mode, + tags=tags, + **kw ) - if download_id and download_session.status != TableDownloadSession.Status.Normal: + if ( + download_id + and download_session.status != TableDownloadSession.Status.Normal + ): download_session = utils.call_with_retry( tunnel.create_download_session, - table=self, partition_spec=partition, timeout=timeout, - async_mode=async_mode, **kw + table=self, + partition_spec=partition, + timeout=timeout, + async_mode=async_mode, + tags=tags, + **kw ) download_ids[partition] = download_session.id - if arrow: - return TableArrowReader(self, download_session, columns=columns) - else: - return TableRecordReader(self, download_session, partition, columns=columns) + reader_cls = TableArrowReader if arrow else TableRecordReader + kw = ( + {"append_partitions": append_partitions} + if append_partitions is not None + else {} + ) + return reader_cls(self, download_session, partition, columns=columns, **kw) def open_writer( self, @@ -760,6 +903,7 @@ def open_writer( upload_id=None, arrow=False, quota_name=None, + tags=None, mp_context=None, **kw ): @@ -768,10 +912,8 @@ def open_writer( :param partition: partition of this table :param blocks: block ids to open - :param reopen: the reader will reuse last one, reopen is true means open a new reader. - :type reopen: bool - :param create_partition: if true, the partition will be created if not exist - :type create_partition: bool + :param bool reopen: the reader will reuse last one, reopen is true means open a new reader. 
+ :param bool create_partition: if true, the partition will be created if not exist :param endpoint: the tunnel service URL :param upload_id: use existing upload_id to upload data :param arrow: use arrow tunnel to write data @@ -809,13 +951,24 @@ def open_writer( upload_ids = dict() upload_session = utils.call_with_retry( tunnel.create_upload_session, - table=self, partition_spec=partition, upload_id=upload_id, **kw + table=self, + partition_spec=partition, + upload_id=upload_id, + tags=tags, + **kw ) - if upload_id and upload_session.status.value != TableUploadSession.Status.Normal.value: + if ( + upload_id + and upload_session.status.value != TableUploadSession.Status.Normal.value + ): # check upload session status upload_session = utils.call_with_retry( - tunnel.create_upload_session, table=self, partition_spec=partition, **kw + tunnel.create_upload_session, + table=self, + partition_spec=partition, + tags=tags, + **kw ) upload_id = None upload_ids[partition] = upload_session.id @@ -829,15 +982,103 @@ def _writer_on_close(): upload_ids[partition] = None return writer_cls( - self, upload_session, blocks, commit, on_close=_writer_on_close, mp_context=mp_context + self, + upload_session, + blocks, + commit, + on_close=_writer_on_close, + mp_context=mp_context, ) + def to_pandas( + self, + partition=None, + columns=None, + start=None, + count=None, + n_process=1, + quota_name=None, + append_partitions=None, + tags=None, + **kwargs + ): + """ + Read table data into pandas DataFrame + + :param partition: partition of this table + :param list columns: columns to read + :param int start: start row index from 0 + :param int count: data count to read + :param int n_process: number of processes to accelerate reading + :param bool append_partitions: if True, partition values will be + appended to the output + :param str quota_name: name of tunnel quota to use + """ + if partition is None and self.table_schema.partitions: + raise ValueError( + "You must specify a partition when calling to_pandas on a partitioned table" + ) + with self.open_reader( + partition=partition, + columns=columns, + arrow=True, + quota_name=quota_name, + append_partitions=append_partitions, + tags=tags, + **kwargs + ) as reader: + return reader.to_pandas(start=start, count=count, n_process=n_process) + + def iter_pandas( + self, + partition=None, + columns=None, + batch_size=None, + start=None, + count=None, + quota_name=None, + append_partitions=None, + tags=None, + **kwargs + ): + """ + Iterate table data in blocks as pandas DataFrame + + :param partition: partition of this table + :param list columns: columns to read + :param int batch_size: size of DataFrame batch to read + :param int start: start row index from 0 + :param int count: data count to read + :param bool append_partitions: if True, partition values will be + appended to the output + :param str quota_name: name of tunnel quota to use + """ + if partition is None and self.table_schema.partitions: + raise ValueError( + "You must specify a partition when calling to_pandas on a partitioned table" + ) + with self.open_reader( + partition=partition, + columns=columns, + arrow=True, + quota_name=quota_name, + append_partitions=append_partitions, + tags=tags, + **kwargs + ) as reader: + for batch in reader.iter_pandas( + batch_size, start=start, count=count, columns=columns + ): + yield batch + @property def partitions(self): return Partitions(parent=self, client=self._client) @utils.with_wait_argument - def create_partition(self, partition_spec, if_not_exists=False, 
async_=False, hints=None): + def create_partition( + self, partition_spec, if_not_exists=False, async_=False, hints=None + ): """ Create a partition within the table. @@ -853,7 +1094,9 @@ def create_partition(self, partition_spec, if_not_exists=False, async_=False, hi ) @utils.with_wait_argument - def delete_partition(self, partition_spec, if_exists=False, async_=False, hints=None): + def delete_partition( + self, partition_spec, if_exists=False, async_=False, hints=None + ): """ Delete a partition within the table. @@ -943,22 +1186,24 @@ def truncate(self, partition_spec=None, async_=False, hints=None): ) # as data of partition changed, remove existing download id to avoid TableModified error - for part in (partition_spec or [None]): + for part in partition_spec or [None]: if isinstance(part, six.string_types): part = odps_types.PartitionSpec(part) self._download_ids.pop(part, None) task = SQLTask( - name='SQLTruncateTableTask', - query='TRUNCATE TABLE %s%s;' % (self.full_table_name, partition_expr) + name="SQLTruncateTableTask", + query="TRUNCATE TABLE %s%s;" % (self.full_table_name, partition_expr), ) hints = hints or {} - hints['odps.sql.submit.mode'] = '' + hints["odps.sql.submit.mode"] = "" schema_name = self._get_schema_name() if schema_name is not None: hints["odps.sql.allow.namespace.schema"] = "true" hints["odps.namespace.schema"] = "true" + if self.project.odps.quota_name: + hints["odps.task.wlm.quota"] = self.project.odps.quota_name task.update_sql_settings(hints) instance = self.project.parent[self._client.project].instances.create(task=task) @@ -981,7 +1226,9 @@ def drop(self, async_=False, if_exists=False, hints=None): return self.parent.delete(self, async_=async_, if_exists=if_exists, hints=hints) @utils.with_wait_argument - def set_storage_tier(self, storage_tier, partition_spec=None, async_=False, hints=None): + def set_storage_tier( + self, storage_tier, partition_spec=None, async_=False, hints=None + ): """ Set storage tier of """ @@ -996,7 +1243,7 @@ def set_storage_tier(self, storage_tier, partition_spec=None, async_=False, hint ) # as data of partition changed, remove existing download id to avoid TableModified error - for part in (partition_spec or [None]): + for part in partition_spec or [None]: if isinstance(part, six.string_types): part = odps_types.PartitionSpec(part) self._download_ids.pop(part, None) @@ -1006,19 +1253,20 @@ def set_storage_tier(self, storage_tier, partition_spec=None, async_=False, hint property_item = "TBLPROPERTIES" if not partition_spec else "PARTITIONPROPERTIES" task = SQLTask( - name='SQLSetStorageTierTask', - query="ALTER TABLE %s%s SET %s('storagetier'='%s')" % ( - self.full_table_name, partition_expr, property_item, storage_tier.value - ) + name="SQLSetStorageTierTask", + query="ALTER TABLE %s%s SET %s('storagetier'='%s')" + % (self.full_table_name, partition_expr, property_item, storage_tier.value), ) hints = hints or {} - hints['odps.sql.submit.mode'] = '' - hints['odps.tiered.storage.enable'] = 'true' + hints["odps.sql.submit.mode"] = "" + hints["odps.tiered.storage.enable"] = "true" schema_name = self._get_schema_name() if schema_name is not None: hints["odps.sql.allow.namespace.schema"] = "true" hints["odps.namespace.schema"] = "true" + if self.project.odps.quota_name: + hints["odps.task.wlm.quota"] = self.project.odps.quota_name task.update_sql_settings(hints) instance = self.project.parent[self._client.project].instances.create(task=task) @@ -1048,7 +1296,17 @@ def new_record(self, values=None): .. 
seealso:: :class:`odps.models.Record` """ - return Record(schema=self.table_schema, values=values) + try: + project_field_size = self.project.get_property( + "odps.sql.cfile2.field.maxsize", None + ) + max_field_size = int(project_field_size or 0) * 1024 + except: + max_field_size = 0 + + return Record( + schema=self.table_schema, values=values, max_field_size=max_field_size + ) def to_df(self): """ diff --git a/odps/models/tableio.py b/odps/models/tableio.py index cd16b59d..31e6f4b0 100644 --- a/odps/models/tableio.py +++ b/odps/models/tableio.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -21,15 +22,22 @@ import struct import sys import threading +from collections import defaultdict from types import GeneratorType, MethodType try: import pyarrow as pa except (AttributeError, ImportError): pa = None - -from .. import types as odps_types, utils -from ..compat import six +try: + import pandas as pd +except ImportError: + pd = None + +from .. import errors +from .. import types as odps_types +from .. import utils +from ..compat import Iterable, six from ..config import options from ..lib import cloudpickle from ..lib.tblib import pickling_support @@ -52,6 +60,7 @@ def _load_classmethod(cls, func_name): class InstanceMethodWrapper(object): """Trick for classmethods under Python 2.7 to be pickleable""" + def __init__(self, func): assert isinstance(func, MethodType) assert isinstance(func.im_self, type) @@ -65,6 +74,7 @@ def __reduce__(self): _wrap_classmethod = InstanceMethodWrapper else: + def _ord_if_possible(x): return x @@ -79,10 +89,21 @@ def schema(self): @staticmethod def _read_table_split( - conn, download_id, start, count, idx, - rest_client=None, project=None, table_name=None, partition_spec=None, - tunnel_endpoint=None, quota_name=None, columns=None, arrow=False, - schema_name=None + conn, + download_id, + start, + count, + idx, + rest_client=None, + project=None, + table_name=None, + partition_spec=None, + tunnel_endpoint=None, + quota_name=None, + columns=None, + arrow=False, + schema_name=None, + append_partitions=None, ): # read part data from ..tunnel import TableTunnel @@ -101,19 +122,35 @@ def _read_table_split( download_id=download_id, partition_spec=partition_spec, ) - if not arrow: - data = utils.call_with_retry( - session.open_record_reader, start, count, columns=columns - ).to_pandas() - else: - data = utils.call_with_retry( - session.open_arrow_reader, start, count, columns=columns - ).to_pandas() + + def _data_to_pandas(): + if not arrow: + with session.open_record_reader( + start, + count, + columns=columns, + append_partitions=append_partitions, + ) as reader: + return reader.to_pandas() + else: + with session.open_arrow_reader( + start, + count, + columns=columns, + append_partitions=append_partitions, + ) as reader: + return reader.to_pandas() + + data = utils.call_with_retry(_data_to_pandas) conn.send((idx, data, True)) except: - conn.send((idx, sys.exc_info(), False)) + try: + conn.send((idx, sys.exc_info(), False)) + except: + logger.exception("Failed to write in process %d", idx) + raise - def _get_process_split_reader(self, columns=None): + def _get_process_split_reader(self, columns=None, append_partitions=None): rest_client = self._parent._client table_name = self._parent.name schema_name = self._parent.get_schema() @@ -133,21 +170,42 @@ def _get_process_split_reader(self, columns=None): arrow=isinstance(self, TunnelArrowReader), 
columns=columns or self._column_names, schema_name=schema_name, + append_partitions=append_partitions, ) class TableRecordReader(SpawnedTableReaderMixin, TunnelRecordReader): - def __init__(self, table, download_session, partition_spec=None, columns=None): + def __init__( + self, + table, + download_session, + partition_spec=None, + columns=None, + append_partitions=True, + ): super(TableRecordReader, self).__init__( - table, download_session, columns=columns + table, + download_session, + columns=columns, + append_partitions=append_partitions, ) self._partition_spec = partition_spec class TableArrowReader(SpawnedTableReaderMixin, TunnelArrowReader): - def __init__(self, table, download_session, partition_spec=None, columns=None): + def __init__( + self, + table, + download_session, + partition_spec=None, + columns=None, + append_partitions=False, + ): super(TableArrowReader, self).__init__( - table, download_session, columns=columns + table, + download_session, + columns=columns, + append_partitions=append_partitions, ) self._partition_spec = partition_spec @@ -184,22 +242,28 @@ def _serve_thread(self): self._sock.sendto(data, addr) elif cmd_code == _PUT_WRITTEN_BLOCKS_CMD: blocks_queue = self._writer._used_block_id_queue - count, = struct.unpack(">> for record in odps.read_table('test_table', 100): + >>> # deal with such 100 records + >>> for record in odps.read_table('test_table', partition='pt=test', start=100, limit=100): + >>> # read the `pt=test` partition, skip 100 records and read 100 records + + .. seealso:: :class:`odps.models.Record` + """ + + table = cls._get_table_obj(odps, name, project=project, schema=schema) + + compress = kw.pop("compress", False) + columns = kw.pop("columns", None) + + with table.open_reader(partition=partition, **kw) as reader: + for record in reader.read( + start, limit, step=step, compress=compress, columns=columns + ): + yield record + + @classmethod + def _is_pa_collection(cls, obj): + return pa is not None and isinstance(obj, (pa.Table, pa.RecordBatch)) + + @classmethod + def _is_pd_df(cls, obj): + return pd is not None and isinstance(obj, pd.DataFrame) + + @classmethod + def _resolve_schema( + cls, records_list, unknown_as_string=False, partition=None, partitions=None + ): + from ..df.backends.odpssql.types import df_schema_to_odps_schema + from ..df.backends.pd.types import pd_to_df_schema + from ..tunnel.io.types import arrow_schema_to_odps_schema + from . import Record + + if isinstance(records_list, list) and records_list: + records_list = records_list[0] + + if cls._is_pa_collection(records_list[0]): + schema = arrow_schema_to_odps_schema(records_list[0].schema) + elif cls._is_pd_df(records_list[0]): + schema = df_schema_to_odps_schema( + pd_to_df_schema(records_list[0], unknown_as_string=unknown_as_string) + ) + elif isinstance(records_list[0][0], Record): + schema = records_list[0][0].schema + else: + raise TypeError( + "Inferring schema from provided data not implemented. " + "You need to supply a pandas DataFrame or records." 
+ ) + + part_col_names = partitions or [] + if partition is not None: + part_spec = odps_types.PartitionSpec(partition) + part_col_names.extend(k for k in part_spec.keys()) + if part_col_names: + part_col_set = set(part_col_names) + simple_cols = [c for c in schema.columns if c.name not in part_col_set] + part_cols = [ + odps_types.Column(n, odps_types.string) for n in part_col_names + ] + schema = odps_types.OdpsSchema(simple_cols, part_cols) + return schema + + @classmethod + def _split_block_data_in_partitions(cls, table_schema, block_data, partitions=None): + from . import Record + + if not partitions: + is_arrow = cls._is_pa_collection(block_data) or cls._is_pd_df(block_data) + return {(is_arrow, None): [block_data]} + + input_cols = list(table_schema.simple_columns) + [ + odps_types.Column(part, odps_types.string) for part in partitions + ] + input_schema = odps_types.OdpsSchema(input_cols) + + parted_data = defaultdict(list) + if ( + cls._is_pa_collection(block_data) + or cls._is_pd_df(block_data) + or isinstance(block_data, Record) + or ( + isinstance(block_data, list) + and block_data + and not isinstance(block_data[0], list) + ) + ): + # pd dataframes, arrow RecordBatch, single record or single record-like array + block_data = [block_data] + for data in block_data: + if cls._is_pa_collection(data): + data = data.to_pandas() + elif isinstance(data, list): + if len(data) != len(input_schema): + raise ValueError( + "Need to specify %d values when writing table with dynamic partition." + % len(input_schema) + ) + data = Record(schema=input_schema, values=data) + + if cls._is_pd_df(data): + part_set = set(partitions) + col_names = [c.name for c in input_cols if c.name not in part_set] + for name, group in data.groupby(partitions): + name = name if isinstance(name, tuple) else (name,) + pt_name = ",".join( + "=".join([str(n), str(v)]) for n, v in zip(partitions, name) + ) + parted_data[(True, pt_name)].append(group[col_names]) + elif isinstance(data, Record): + pt_name = ",".join("=".join([str(n), data[str(n)]]) for n in partitions) + values = [data[str(c.name)] for c in table_schema.simple_columns] + if not parted_data[(False, pt_name)]: + parted_data[(False, pt_name)].append([]) + parted_data[(False, pt_name)][0].append( + Record(schema=table_schema, values=values) + ) + else: + raise ValueError( + "Cannot accept data with type %s" % type(data).__name__ + ) + return parted_data + + @classmethod + def write_table(cls, odps, name, *block_data, **kw): + """ + Write records into given table. + + :param name: table or table name + :type name: :class:`.models.table.Table` or str + :param block_data: records / DataFrame, or block ids and records / DataFrame. + If given records or DataFrame only, the block id will be 0 as default. 
+ :param str project: project name, if not provided, will be the default project + :param str schema: schema name, if not provided, will be the default schema + :param partition: the partition of this table to write + :param list partitions: fields representing partitions + :param bool overwrite: if True, will overwrite existing data + :param bool create_table: if true, the table will be created if not exist + :param int lifecycle: specify table lifecycle when creating tables + :param bool create_partition: if true, the partition will be created if not exist + :param bool compress: if True, the data will be compressed during uploading + :param compress_option: the compression algorithm, level and strategy + :type compress_option: :class:`odps.tunnel.CompressOption` + :param str endpoint: tunnel service URL + :param bool reopen: writing the table will reuse the session which opened last time, + if set to True will open a new upload session, default as False + :return: None + + :Example: + + Write records into a specified table. + + >>> odps.write_table('test_table', data) + + Write records into multiple blocks. + + >>> odps.write_table('test_table', 0, records1, 1, records2) + + Write into a given partition. + + >>> odps.write_table('test_table', data, partition='pt=test') + + Write a pandas DataFrame. + + >>> import pandas as pd + >>> df = pd.DataFrame([ + >>> [111, 'aaa', True], + >>> [222, 'bbb', False], + >>> [333, 'ccc', True], + >>> [444, '中文', False] + >>> ], columns=['num_col', 'str_col', 'bool_col']) + >>> o.write_table('test_table', df, partition='pt=test', create_table=True, create_partition=True) + + Write a dynamic partition. + + >>> import pandas as pd + >>> df = pd.DataFrame([ + >>> [111, 'aaa', True, 'p1'], + >>> [222, 'bbb', False, 'p1'], + >>> [333, 'ccc', True, 'p2'], + >>> [444, '中文', False, 'p2'] + >>> ], columns=['num_col', 'str_col', 'bool_col', 'pt']) + >>> o.write_table('test_part_table', df, partitions=['pt'], create_partition=True) + + .. 
seealso:: :class:`odps.models.Record` + """ + project = kw.pop("project", None) + schema = kw.pop("schema", None) + + single_block_types = (Iterable,) + if pa is not None: + single_block_types += (pa.RecordBatch, pa.Table) + + if len(block_data) == 1 and isinstance(block_data[0], single_block_types): + blocks = [None] + data_list = block_data + else: + blocks = block_data[::2] + data_list = block_data[1::2] + + if len(blocks) != len(data_list): + raise ValueError( + "Should invoke like odps.write_table(block_id, records, " + "block_id2, records2, ..., **kw)" + ) + + unknown_as_string = kw.pop("unknown_as_string", False) + create_table = kw.pop("create_table", False) + create_partition = kw.pop( + "create_partition", kw.pop("create_partitions", False) + ) + partition = kw.pop("partition", None) + partitions = kw.pop("partitions", None) + lifecycle = kw.pop("lifecycle", None) + if isinstance(partitions, six.string_types): + partitions = [partitions] + if not odps.exist_table(name, project=project, schema=schema): + if not create_table: + raise errors.NoSuchTable("Target table %s not exist" % name) + table_schema = cls._resolve_schema( + data_list, + unknown_as_string=unknown_as_string, + partition=partition, + partitions=partitions, + ) + table = odps.create_table( + name, table_schema, project=project, schema=schema, lifecycle=lifecycle + ) + else: + table = cls._get_table_obj(odps, name, project=project, schema=schema) + + data_lists = defaultdict(lambda: defaultdict(list)) + for block, data in zip(blocks, data_list): + for key, parted_data in cls._split_block_data_in_partitions( + table.table_schema, data, partitions=partitions + ).items(): + data_lists[key][block].extend(parted_data) + + if partition is None or isinstance(partition, six.string_types): + partition_str = partition + else: + partition_str = str(odps_types.PartitionSpec(partition)) + + for (is_arrow, pt_name), block_to_data in data_lists.items(): + if not block_to_data: + continue + + blocks, data_list = [], [] + for block, data in block_to_data.items(): + blocks.append(block) + data_list.extend(data) + + if len(blocks) == 1 and blocks[0] is None: + blocks = None + + final_pt = ",".join(p for p in (pt_name, partition_str) if p is not None) + with table.open_writer( + partition=final_pt or None, + blocks=blocks, + arrow=is_arrow, + create_partition=create_partition, + **kw + ) as writer: + if blocks is None: + for data in data_list: + writer.write(data) + else: + for block, data in zip(blocks, data_list): + writer.write(block, data) diff --git a/odps/models/tables.py b/odps/models/tables.py index 2f1280d0..edf9589e 100644 --- a/odps/models/tables.py +++ b/odps/models/tables.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,17 +14,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .. import serializers, errors, utils +from .. 
import errors, serializers, utils from ..compat import six from .core import Iterable from .table import Table class Tables(Iterable): - - marker = serializers.XMLNodeField('Marker') - max_items = serializers.XMLNodeField('MaxItems', parse_callback=int) - tables = serializers.XMLNodesReferencesField(Table, 'Table') + marker = serializers.XMLNodeField("Marker") + max_items = serializers.XMLNodeField("MaxItems", parse_callback=int) + tables = serializers.XMLNodesReferencesField(Table, "Table") def _get(self, item): return Table(client=self._client, parent=self, name=item) @@ -55,32 +54,32 @@ def iterate(self, name=None, owner=None, type=None, extended=False): :return: """ actions = [] - params = {'expectmarker': 'true'} + params = {"expectmarker": "true"} if name is not None: - params['name'] = name + params["name"] = name if owner is not None: - params['owner'] = owner + params["owner"] = owner if type is not None: table_type = type.upper() if isinstance(type, str) else type table_type = Table.Type(table_type) - params['type'] = table_type.value + params["type"] = table_type.value if extended: actions.append("extended") schema_name = self._get_schema_name() if schema_name is not None: - params['curr_schema'] = schema_name + params["curr_schema"] = schema_name def _it(): - last_marker = params.get('marker') - if 'marker' in params and (last_marker is None or len(last_marker) == 0): + last_marker = params.get("marker") + if "marker" in params and (last_marker is None or len(last_marker) == 0): return url = self.resource() resp = self._client.get(url, actions=actions, params=params) t = Tables.parse(self._client, resp, obj=self) - params['marker'] = t.marker + params["marker"] = t.marker return t.tables @@ -92,20 +91,40 @@ def _it(): yield table @utils.with_wait_argument - def create(self, table_name, table_schema, comment=None, if_not_exists=False, - lifecycle=None, shard_num=None, hub_lifecycle=None, hints=None, - transactional=False, storage_tier=None, async_=False, **kw): + def create( + self, + table_name, + table_schema, + comment=None, + if_not_exists=False, + lifecycle=None, + shard_num=None, + hub_lifecycle=None, + hints=None, + transactional=False, + storage_tier=None, + async_=False, + **kw + ): project_name = self._parent.project.name schema_name = self._get_schema_name() sql = Table.gen_create_table_sql( - table_name, table_schema, comment=comment, if_not_exists=if_not_exists, - lifecycle=lifecycle, shard_num=shard_num, hub_lifecycle=hub_lifecycle, - transactional=transactional, project=project_name, - schema=schema_name, **kw) + table_name, + table_schema, + comment=comment, + if_not_exists=if_not_exists, + lifecycle=lifecycle, + shard_num=shard_num, + hub_lifecycle=hub_lifecycle, + transactional=transactional, + project=project_name, + schema=schema_name, + **kw + ) from .tasks import SQLTask - task = SQLTask(name='SQLCreateTableTask', query=sql) + task = SQLTask(name="SQLCreateTableTask", query=sql) hints = hints or {} if schema_name is not None: hints["odps.sql.allow.namespace.schema"] = "true" @@ -113,7 +132,9 @@ def create(self, table_name, table_schema, comment=None, if_not_exists=False, else: hints["odps.namespace.schema"] = "false" if storage_tier: - hints['odps.tiered.storage.enable'] = 'true' + hints["odps.tiered.storage.enable"] = "true" + if self._parent.project.odps.quota_name: + hints["odps.task.wlm.quota"] = self._parent.project.odps.quota_name task.update_sql_settings(hints) instance = self._parent.project.instances.create(task=task) @@ -124,50 +145,64 @@ def 
create(self, table_name, table_schema, comment=None, if_not_exists=False, else: return instance - def _gen_delete_table_sql(self, table_name, if_exists=False): + def _gen_delete_table_sql(self, table_name, if_exists=False, table_type=None): project_name = self._parent.project.name schema_name = self._get_schema_name() buf = six.StringIO() - tb = self._get(table_name) - if tb._getattr("type") is None: - # if table not loaded, use 'TABLE' type to reduce request - type_str = 'TABLE' - elif tb.type == Table.Type.VIRTUAL_VIEW: - type_str = 'VIEW' - elif tb.type == Table.Type.MATERIALIZED_VIEW: - type_str = 'MATERIALIZED VIEW' + + if table_type is not None and isinstance(table_type, six.string_types): + table_type = Table.Type(table_type.upper()) + + # override provided type if the object is already cached + cached_table_type = self._get(table_name)._getattr("type") + if cached_table_type is not None and ( + table_type is None or table_type == Table.Type.MANAGED_TABLE + ): + table_type = cached_table_type + + if table_type == Table.Type.VIRTUAL_VIEW: + type_str = "VIEW" + elif table_type == Table.Type.MATERIALIZED_VIEW: + type_str = "MATERIALIZED VIEW" else: - type_str = 'TABLE' + type_str = "TABLE" - buf.write('DROP %s ' % type_str) + buf.write("DROP %s " % type_str) if if_exists: - buf.write('IF EXISTS ') + buf.write("IF EXISTS ") if schema_name is not None: - buf.write('%s.%s.`%s`' % (project_name, schema_name, table_name)) + buf.write("%s.%s.`%s`" % (project_name, schema_name, table_name)) else: - buf.write('%s.`%s`' % (project_name, table_name)) + buf.write("%s.`%s`" % (project_name, table_name)) return buf.getvalue() @utils.with_wait_argument - def delete(self, table_name, if_exists=False, async_=False, hints=None): + def delete( + self, table_name, if_exists=False, async_=False, hints=None, table_type=None + ): if isinstance(table_name, Table): table_name = table_name.name - sql = self._gen_delete_table_sql(table_name, if_exists=if_exists) + sql = self._gen_delete_table_sql( + table_name, if_exists=if_exists, table_type=table_type + ) del self[table_name] # release table in cache from .tasks import SQLTask - task = SQLTask(name='SQLDropTableTask', query=sql) + + task = SQLTask(name="SQLDropTableTask", query=sql) hints = hints or {} - hints['odps.sql.submit.mode'] = '' + hints["odps.sql.submit.mode"] = "" schema_name = self._get_schema_name() if schema_name is not None: hints["odps.sql.allow.namespace.schema"] = "true" hints["odps.namespace.schema"] = "true" + if self._parent.project.odps.quota_name: + hints["odps.task.wlm.quota"] = self._parent.project.odps.quota_name task.update_sql_settings(hints) instance = self._parent.project.instances.create(task=task) diff --git a/odps/models/tasks.py b/odps/models/tasks.py deleted file mode 100644 index 48e05a42..00000000 --- a/odps/models/tasks.py +++ /dev/null @@ -1,342 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
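# Illustrative sketch (not part of the patch): with the table_type argument handled in
# Tables.delete() / _gen_delete_table_sql() above, the right DROP statement can be built
# without loading table metadata first. Assuming a Tables collection `tables` bound to
# project "proj" with no schema set:
drop_view_sql = tables._gen_delete_table_sql(
    "my_view", if_exists=True, table_type="virtual_view"
)
# drop_view_sql == 'DROP VIEW IF EXISTS proj.`my_view`'
drop_table_sql = tables._gen_delete_table_sql("some_table")
# drop_table_sql == 'DROP TABLE proj.`some_table`' (a cached table type, if any, still takes precedence)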
- -import json # don't remove -import random -import sys -import time -import warnings -from collections import OrderedDict - -from .core import AbstractXMLRemoteModel -from .. import serializers, errors, utils -from ..compat import enum, six -from ..config import options - - -class Task(AbstractXMLRemoteModel): - - __slots__ = 'name', 'comment', 'properties' - - _type_indicator = 'type' - - name = serializers.XMLNodeField('Name') - type = serializers.XMLTagField('.') - comment = serializers.XMLNodeField('Comment') - properties = serializers.XMLNodePropertiesField('Config', 'Property', - key_tag='Name', value_tag='Value') - - def __new__(cls, *args, **kwargs): - typo = kwargs.get('type') - - if typo is not None: - task_cls = None - for v in six.itervalues(globals()): - if not isinstance(v, type) or not issubclass(v, Task): - continue - cls_type = getattr(v, '_root', v.__name__) - if typo == cls_type: - task_cls = v - if task_cls is None: - task_cls = cls - else: - task_cls = cls - - return object.__new__(task_cls) - - def set_property(self, key, value): - if self.properties is None: - self.properties = OrderedDict() - self.properties[key] = value - - def _update_property_json(self, field, value): - def update(kv, dest): - if not kv: - return - for k, v in six.iteritems(kv): - if isinstance(v, bool): - dest[k] = 'true' if v else 'false' - else: - dest[k] = str(v) - - if self.properties is None: - self.properties = OrderedDict() - if field in self.properties: - settings = json.loads(self.properties[field]) - else: - settings = OrderedDict() - update(value, settings) - self.properties[field] = json.dumps(settings) - - def update_settings(self, value): - self._update_property_json('settings', value) - - def serialize(self): - if type(self) is Task: - raise errors.ODPSError('Unknown task type') - return super(Task, self).serialize() - - @property - def instance(self): - return self.parent.parent - - @property - def progress(self): - """ - Get progress of a task. - """ - return self.instance.get_task_progress(self.name) - - @property - def stages(self): - """ - Get execution stages of a task. - """ - return self.instance.get_task_progress(self.name).stages - - @property - def result(self): - """ - Get execution result of the task. - """ - return self.instance.get_task_result(self.name) - - @property - def summary(self): - """ - Get execution summary of the task. - """ - return self.instance.get_task_summary(self.name) - - @property - def detail(self): - """ - Get execution details of the task. - """ - return self.instance.get_task_detail(self.name) - - @property - def quota(self): - """ - Get quota json of the task. - """ - return self.instance.get_task_quota(self.name) - - @property - def workers(self): - """ - Get workers of the task. - """ - return self.instance.get_task_workers(self.name) - - def get_info(self, key): - """ - Get associated information of the task. - """ - return self.instance.get_task_info(self.name, key) - - def put_info(self, key, value): - """ - Put associated information of the task. - """ - return self.instance.put_task_info(self.name, key, value) - - -def format_cdata(query, semicolon=False): - stripped_query = query.strip() - if semicolon and not stripped_query.endswith(';'): - stripped_query += ';' - return '' % stripped_query - - -def collect_sql_settings(value, glob): - from .. 
import __version__ - - settings = OrderedDict() - if options.default_task_settings: - settings = options.default_task_settings - - settings["PYODPS_VERSION"] = __version__ - settings["PYODPS_PYTHON_VERSION"] = sys.version - - if glob: - if options.sql.use_odps2_extension: - settings['odps.sql.type.system.odps2'] = True - if options.local_timezone is not None: - if not options.local_timezone: - settings['odps.sql.timezone'] = 'Etc/GMT' - elif isinstance(options.local_timezone, bool): - from ..lib import tzlocal - - zone = tzlocal.get_localzone() - settings['odps.sql.timezone'] = utils.get_zone_name(zone) - elif isinstance(options.local_timezone, six.string_types): - settings['odps.sql.timezone'] = options.local_timezone - else: - zone = options.local_timezone - zone_str = utils.get_zone_name(zone) - if zone_str is None: - warnings.warn('Failed to get timezone string from options.local_timezone. ' - 'You need to deal with timezone in the return data yourself.') - else: - settings['odps.sql.timezone'] = zone_str - if options.sql.settings: - settings.update(options.sql.settings) - if value: - settings.update(value) - return settings - - -class SQLTask(Task): - __slots__ = '_anonymous_sql_task_name', - - _root = 'SQL' - _anonymous_sql_task_name = 'AnonymousSQLTask' - - query = serializers.XMLNodeField('Query', - serialize_callback=lambda s: format_cdata(s, True)) - - def __init__(self, **kwargs): - if 'name' not in kwargs: - kwargs['name'] = SQLTask._anonymous_sql_task_name - super(SQLTask, self).__init__(**kwargs) - - def serial(self): - if self.properties is None: - self.properties = OrderedDict() - - key = 'settings' - if key not in self.properties: - self.properties[key] = '{"odps.sql.udf.strict.mode": "true"}' - - return super(SQLTask, self).serial() - - def update_sql_settings(self, value=None, glob=True): - settings = collect_sql_settings(value, glob) - self.update_settings(settings) - - def update_aliases(self, value): - self._update_property_json('aliases', value) - - @property - def warnings(self): - return json.loads(self.get_info('warnings')).get('warnings') - - -class MergeTask(Task): - _root = 'Merge' - - table = serializers.XMLNodeField('TableName') - - def __init__(self, name=None, **kwargs): - if name is None: - name = 'merge_task_{0}_{1}'.format(int(time.time()), random.randint(100000, 999999)) - kwargs['name'] = name - super(MergeTask, self).__init__(**kwargs) - - -class CupidTask(Task): - _root = 'CUPID' - - plan = serializers.XMLNodeField('Plan', serialize_callback=format_cdata) - - def __init__(self, name=None, plan=None, hints=None, **kwargs): - kwargs['name'] = name - kwargs['plan'] = plan - super(CupidTask, self).__init__(**kwargs) - hints = hints or {} - self.set_property('type', 'cupid') - if hints: - self.set_property('settings', json.dumps(hints)) - - -class SQLCostTask(Task): - __slots__ = '_anonymous_sql_cost_task_name', - - _root = 'SQLCost' - _anonymous_sql_cost_task_name = 'AnonymousSQLCostTask' - - query = serializers.XMLNodeField('Query', - serialize_callback=lambda s: format_cdata(s, True)) - - def __init__(self, **kwargs): - if 'name' not in kwargs: - kwargs['name'] = self._anonymous_sql_cost_task_name - super(SQLCostTask, self).__init__(**kwargs) - - def update_sql_cost_settings(self, value=None, glob=True): - settings = collect_sql_settings(value, glob) - self.update_settings(settings) - - -class SQLRTTask(Task): - _root = "SQLRT" - - def update_sql_rt_settings(self, value=None, glob=True): - settings = collect_sql_settings(value, glob) - 
self.update_settings(settings) - - -class MaxFrameTask(Task): - __slots__ = ("_output_format", "_major_version", "_service_endpoint") - _root = "MaxFrame" - _anonymous_task_name = "AnonymousMaxFrameTask" - - class CommandType(enum.Enum): - CREATE_SESSION = "CREATE_SESSION" - PYTHON_PACK = "PYTHON_PACK" - RAY_CLUSTER_INIT = "RAY_CLUSTER_INIT" - RAY_CLUSTER_FREE = "RAY_CLUSTER_FREE" - - command = serializers.XMLNodeField( - "Command", - default=CommandType.CREATE_SESSION, - parse_callback=lambda t: MaxFrameTask.CommandType(t.upper()), - serialize_callback=lambda t: t.value, - ) - - def __init__(self, **kwargs): - kwargs["name"] = kwargs.get("name") or self._anonymous_task_name - self._major_version = kwargs.pop("major_version", None) - self._service_endpoint = kwargs.pop("service_endpoint", None) - super(MaxFrameTask, self).__init__(**kwargs) - - if self.properties is None: - self.properties = OrderedDict() - self.properties["settings"] = "{}" - - def serial(self): - if options.default_task_settings: - settings = options.default_task_settings.copy() - else: - settings = OrderedDict() - - if self._major_version is not None: - settings["odps.task.major.version"] = self._major_version - if self._service_endpoint is not None: - settings["odps.service.endpoint"] = self._service_endpoint - - if "settings" in self.properties: - settings.update(json.loads(self.properties["settings"])) - - self.properties["settings"] = json.dumps(settings) - return super(MaxFrameTask, self).serial() - - -try: - from ..internal.models.tasks import * # noqa: F401 -except ImportError: - pass diff --git a/odps/tunnel/pdio/__init__.py b/odps/models/tasks/__init__.py similarity index 66% rename from odps/tunnel/pdio/__init__.py rename to odps/models/tasks/__init__.py index d3e1cc37..ff8863b6 100644 --- a/odps/tunnel/pdio/__init__.py +++ b/odps/models/tasks/__init__.py @@ -1,5 +1,4 @@ -# -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,13 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +from .copy import CopyTask +from .core import Task +from .maxframe import MaxFrameTask +from .merge import MergeTask +from .misc import CupidTask +from .sql import SQLCostTask, SQLRTTask, SQLTask + try: - import numpy as np - from .pdreader_c import TunnelPandasReader -except ImportError: - pass -try: - import numpy as np - from .pdwriter import BasePandasWriter, TunnelPandasWriter + from ...internal.models.tasks import * # noqa: F401 except ImportError: pass diff --git a/odps/models/tasks/copy.py b/odps/models/tasks/copy.py new file mode 100644 index 00000000..61d4fe58 --- /dev/null +++ b/odps/models/tasks/copy.py @@ -0,0 +1,71 @@ +# Copyright 1999-2024 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import compat, serializers +from .core import Task + + +class CopyDataSource(serializers.XMLSerializableModel): + class CopyDirection(compat.Enum): + IMPORT = "IMPORT" + EXPORT = "EXPORT" + + copy_type = serializers.XMLNodeField("Type") + project = serializers.XMLNodeField("Project") + table = serializers.XMLNodeField("Table") + partition = serializers.XMLNodeField("Partition") + + def __init__(self, direction=None, **kw): + kw["type"] = ( + "Destination" if direction == self.CopyDirection.IMPORT else "Source" + ) + super(CopyDataSource, self).__init__(**kw) + + +class LocalCopyDataSource(CopyDataSource): + _root = "Local" + + +class TunnelCopyDataSource(CopyDataSource): + version = serializers.XMLNodeField("Version", default="1") + endpoint = serializers.XMLNodeField("EndPoint") + odps_endpoint = serializers.XMLNodeField("OdpsEndPoint") + signature = serializers.XMLNodeField("Signature") + application_signature = serializers.XMLNodeField("ApplicationSignature") + signature_type = serializers.XMLNodeField("SignatureType") + + +class MappingItem(serializers.XMLSerializableModel): + _root = "MappingItem" + + src = serializers.XMLNodeField("SourceColumn") + dest = serializers.XMLNodeField("DestColumn") + + +class CopyTask(Task): + _root = "COPY" + + local = serializers.XMLNodeReferenceField(LocalCopyDataSource, "Local") + tunnel = serializers.XMLNodeReferenceField(TunnelCopyDataSource, "Tunnel") + _mapping_items = serializers.XMLNodesReferencesField(MappingItem, "MappingItems") + mode = serializers.XMLNodeField("Mode") + job_instance_number = serializers.XMLNodeField("JobInstanceNumber") + + @property + def mapping_items(self): + return self._mapping_items or [] + + @mapping_items.setter + def mapping_items(self, value): + self._mapping_items = value diff --git a/odps/models/tasks/core.py b/odps/models/tasks/core.py new file mode 100644 index 00000000..4e997b13 --- /dev/null +++ b/odps/models/tasks/core.py @@ -0,0 +1,197 @@ +# Copyright 1999-2024 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib +import itertools +import json +import textwrap +from collections import OrderedDict + +from ... 
import errors, serializers +from ...compat import six +from ..core import AbstractXMLRemoteModel + +_type_to_task_classes = dict() + + +class Task(AbstractXMLRemoteModel): + __slots__ = ("name", "comment", "properties") + + _type_indicator = "type" + + name = serializers.XMLNodeField("Name") + type = serializers.XMLTagField(".") + comment = serializers.XMLNodeField("Comment") + properties = serializers.XMLNodePropertiesField( + "Config", "Property", key_tag="Name", value_tag="Value" + ) + + @classmethod + def _load_task_classes(cls): + if _type_to_task_classes: + return + mod = importlib.import_module("odps.models.tasks") + for v in six.itervalues(mod.__dict__): + if not isinstance(v, type) or not issubclass(v, Task) or v is Task: + continue + cls_type = getattr(v, "_root", v.__name__) + _type_to_task_classes[cls_type] = v + + def __new__(cls, *args, **kwargs): + typo = kwargs.get("type") + + if typo is not None: + cls._load_task_classes() + task_cls = _type_to_task_classes.get(typo, cls) + else: + task_cls = cls + + return object.__new__(task_cls) + + def set_property(self, key, value): + if self.properties is None: + self.properties = OrderedDict() + self.properties[key] = value + + def _update_property_json(self, field, value): + def update(kv, dest): + if not kv: + return + for k, v in six.iteritems(kv): + if isinstance(v, bool): + dest[k] = "true" if v else "false" + else: + dest[k] = str(v) + + if self.properties is None: + self.properties = OrderedDict() + if field in self.properties: + settings = json.loads(self.properties[field]) + else: + settings = OrderedDict() + update(value, settings) + self.properties[field] = json.dumps(settings) + + def update_settings(self, value): + self._update_property_json("settings", value) + + def serialize(self): + if type(self) is Task: + raise errors.ODPSError("Unknown task type") + return super(Task, self).serialize() + + @property + def instance(self): + return self.parent.parent + + @property + def progress(self): + """ + Get progress of a task. + """ + return self.instance.get_task_progress(self.name) + + @property + def stages(self): + """ + Get execution stages of a task. + """ + return self.instance.get_task_progress(self.name).stages + + @property + def result(self): + """ + Get execution result of the task. + """ + return self.instance.get_task_result(self.name) + + @property + def summary(self): + """ + Get execution summary of the task. + """ + return self.instance.get_task_summary(self.name) + + @property + def detail(self): + """ + Get execution details of the task. + """ + return self.instance.get_task_detail(self.name) + + @property + def quota(self): + """ + Get quota json of the task. + """ + return self.instance.get_task_quota(self.name) + + @property + def workers(self): + """ + Get workers of the task. + """ + return self.instance.get_task_workers(self.name) + + def get_info(self, key, raise_empty=False): + """ + Get associated information of the task. + """ + return self.instance.get_task_info(self.name, key, raise_empty=raise_empty) + + def put_info(self, key, value, raise_empty=False): + """ + Put associated information of the task. 
+ """ + return self.instance.put_task_info( + self.name, key, value, raise_empty=raise_empty + ) + + +def format_cdata(query, semicolon=False): + stripped_query = query.strip() + if semicolon and not stripped_query.endswith(";"): + stripped_query += ";" + return "" % stripped_query + + +def build_execute_method(func, head_docstr): + ext_wrapper = None + unwrap_func = func + if isinstance(func, classmethod): + unwrap_func = func.__func__ + ext_wrapper = classmethod + + @six.wraps(unwrap_func) + def wrapped(cls, *args, **kw): + inst = unwrap_func(cls, *args, **kw) + inst.wait_for_success() + return inst + + wrapped.__name__ = unwrap_func.__name__.replace("run_", "execute_") + + dent_count = min( + len(list(itertools.takewhile(lambda c: c == " ", line))) + for line in unwrap_func.__doc__.splitlines() + if line.strip() + ) + _, rest_doc = textwrap.dedent(unwrap_func.__doc__).split("\n\n", 1) + doc = "\n" + head_docstr.strip() + "\n\n" + rest_doc + wrapped.__doc__ = "\n".join( + " " * dent_count + line if line else "" for line in doc.splitlines() + ) + + if ext_wrapper is not None: + wrapped = ext_wrapper(wrapped) + return wrapped diff --git a/odps/models/tasks/maxframe.py b/odps/models/tasks/maxframe.py new file mode 100644 index 00000000..55305ea3 --- /dev/null +++ b/odps/models/tasks/maxframe.py @@ -0,0 +1,67 @@ +# Copyright 1999-2024 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from collections import OrderedDict + +from ... 
import serializers +from ...compat import enum +from ...config import options +from .core import Task + + +class MaxFrameTask(Task): + __slots__ = ("_output_format", "_major_version", "_service_endpoint") + _root = "MaxFrame" + _anonymous_task_name = "AnonymousMaxFrameTask" + + class CommandType(enum.Enum): + CREATE_SESSION = "CREATE_SESSION" + PYTHON_PACK = "PYTHON_PACK" + RAY_CLUSTER_INIT = "RAY_CLUSTER_INIT" + RAY_CLUSTER_FREE = "RAY_CLUSTER_FREE" + + command = serializers.XMLNodeField( + "Command", + default=CommandType.CREATE_SESSION, + parse_callback=lambda t: MaxFrameTask.CommandType(t.upper()), + serialize_callback=lambda t: t.value, + ) + + def __init__(self, **kwargs): + kwargs["name"] = kwargs.get("name") or self._anonymous_task_name + self._major_version = kwargs.pop("major_version", None) + self._service_endpoint = kwargs.pop("service_endpoint", None) + super(MaxFrameTask, self).__init__(**kwargs) + + if self.properties is None: + self.properties = OrderedDict() + self.properties["settings"] = "{}" + + def serial(self): + if options.default_task_settings: + settings = options.default_task_settings.copy() + else: + settings = OrderedDict() + + if self._major_version is not None: + settings["odps.task.major.version"] = self._major_version + if self._service_endpoint is not None: + settings["odps.service.endpoint"] = self._service_endpoint + + if "settings" in self.properties: + settings.update(json.loads(self.properties["settings"])) + + self.properties["settings"] = json.dumps(settings) + return super(MaxFrameTask, self).serial() diff --git a/odps/models/tasks/merge.py b/odps/models/tasks/merge.py new file mode 100644 index 00000000..a2b5988b --- /dev/null +++ b/odps/models/tasks/merge.py @@ -0,0 +1,410 @@ +# Copyright 1999-2024 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import re +import time +import warnings +from collections import namedtuple + +from ... import compat, serializers +from ...config import options +from .core import Task, build_execute_method + +_ARCHIVE_TABLE_REGEX = re.compile( + r"^alter\s+table\s+(?P
<table>[^\s;]+)\s+(|partition\s*\((?P<partition>[^)]+)\s*\))\s*" + r"archive[\s;]*$", + re.I, +) +_FREEZE_COMMAND_REGEX = re.compile( + r"^alter\s+table\s+(?P<table>
[^\s;]+)\s+(|partition\s*\((?P<partition>[^)]+)\s*\))\s*" + r"(?P<command>freeze|restore)[\s;]*$", + re.I, +) +_MERGE_SMALL_FILES_REGEX = re.compile( + r"^alter\s+table\s+(?P<table>
[^\s;]+)\s+(|partition\s*\((?P[^)]+)\s*\))\s*" + r"(merge\s+smallfiles|compact\s+(?P[^\s;]+)(|\s+(?P[^;]+)))[\s;]*$", + re.I, +) + +_COMPACT_TYPES = { + "major": "major_compact", + "minor": "minor_compact", +} +_FREEZE_TYPES = { + "freeze": "backup", + "restore": "restore", +} + + +_MergeTaskTableProps = namedtuple( + "_MergeTaskTableProps", "table, schema, task_table_name" +) + + +class MergeTask(Task): + _root = "Merge" + + table = serializers.XMLNodeField("TableName") + + def __init__(self, name=None, **kwargs): + name_prefix = kwargs.pop("name_prefix", None) or "merge_task" + if name is None: + name = "{0}_{1}_{2}".format( + name_prefix, int(time.time()), random.randint(100000, 999999) + ) + kwargs["name"] = name + super(MergeTask, self).__init__(**kwargs) + + @staticmethod + def _extract_table_props(odps, table, partition=None, schema=None, project=None): + from ...core import ODPS + + schema = schema or odps.schema + if not isinstance(table, compat.six.string_types): + if table.get_schema(): + schema = table.get_schema().name + table_name = table.full_table_name + else: + table_name = table + table = odps.get_table(table, project=project, schema=schema) + _, schema, _ = odps._split_object_dots(table_name) + if partition: + table_name += " partition(%s)" % (ODPS._parse_partition_string(partition)) + return _MergeTaskTableProps(table, schema, table_name.replace("`", "")) + + @staticmethod + def _parse_compact_opts(force_mode, recent_hours, kwargs): + compact_opts = kwargs.pop("compact_opts", None) + if not compact_opts: + return force_mode, recent_hours + if force_mode is not None or recent_hours is not None: + raise TypeError( + "compact_opts and force_mode / recent_hours " + "can not be specified at the same time" + ) + compact_opts_list = compact_opts.lower().split() + if "-f" in compact_opts_list: + force_mode = True + try: + hours_index = compact_opts_list.index("-h") + except ValueError: + hours_index = None + + if hours_index is not None: + if ( + hours_index + 1 >= len(compact_opts_list) + or not compact_opts_list[hours_index + 1].isnumeric() + ): + raise ValueError("Need to specify hours after -h suffix") + recent_hours = int(compact_opts_list[hours_index + 1]) + + return force_mode, recent_hours + + @classmethod + def _create_base_merge_task( + cls, + odps, + table, + partition=None, + project=None, + schema=None, + hints=None, + quota_name=None, + name_prefix=None, + ): + props = cls._extract_table_props( + odps, table, partition=partition, schema=schema, project=project + ) + + hints = hints or dict() + if options.default_task_settings: + hints.update(options.default_task_settings) + + if odps.is_schema_namespace_enabled(hints) or props.schema is not None: + hints.update( + { + "odps.sql.allow.namespace.schema": "true", + "odps.namespace.schema": "true", + } + ) + if props.schema is not None: + hints["odps.default.schema"] = props.schema + if quota_name or odps.quota_name: + hints["odps.task.wlm.quota"] = quota_name or odps.quota_name + + task = cls(table=props.task_table_name, name_prefix=name_prefix) + task.update_settings(hints) + return task, props + + @classmethod + def _submit_merge_task( + cls, + odps, + task, + project=None, + priority=None, + running_cluster=None, + create_callback=None, + ): + priority = priority if priority is not None else options.priority + if priority is None and options.get_priority is not None: + priority = options.get_priority(odps) + + project = odps.get_project(name=project) + return project.instances.create( + task=task, + 
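# Illustrative sketch (not part of the patch): _parse_compact_opts() above turns the textual
# options of "compact ... [-h N] [-f]" statements into keyword arguments. A hypothetical call
# showing the parsing behaviour:
from odps.models.tasks import MergeTask

kwargs = {"compact_opts": "-h 24 -f"}
force_mode, recent_hours = MergeTask._parse_compact_opts(None, None, kwargs)
assert force_mode is True and recent_hours == 24  # "-f" sets force mode, "-h 24" the hour threshold
assert "compact_opts" not in kwargs  # the option string is consumed from kwargs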
running_cluster=running_cluster, + priority=priority, + create_callback=create_callback, + ) + + @classmethod + def run_merge_files( + cls, + odps, + table, + partition=None, + project=None, + schema=None, + hints=None, + priority=None, + running_cluster=None, + compact_type=None, + force_mode=None, + recent_hours=None, + quota_name=None, + create_callback=None, + **kwargs + ): + """ + Start running a task to merge multiple files in tables. + + :param table: name of the table to optimize + :param partition: partition to optimize + :param project: project name, if not provided, will be the default project + :param str schema: schema name, if not provided, will be the default schema + :param hints: settings for merge task. + :param priority: instance priority, 9 as default + :param running_cluster: cluster to run this instance + :param compact_type: compact option for transactional table, can be major or minor. + :return: instance + :rtype: :class:`odps.models.Instance` + """ + force_mode, recent_hours = cls._parse_compact_opts( + force_mode, recent_hours, kwargs + ) + if kwargs: + raise TypeError("Unsupported keyword arguments %s" % ", ".join(kwargs)) + + prefix = "merge_task" if compact_type is None else "compact_task" + task, props = cls._create_base_merge_task( + odps, + table, + partition=partition, + project=project, + schema=schema, + hints=hints, + name_prefix=prefix, + quota_name=quota_name, + ) + + hints = hints or dict() + compact_type = _COMPACT_TYPES.get(compact_type) + if compact_type: + hints.update( + { + "odps.merge.txn.table.compact": compact_type, + "odps.merge.restructure.action": "hardlink", + } + ) + if compact_type == "minor_compact": + if ( + recent_hours is not None + and recent_hours < props.table.acid_data_retain_hours + and not force_mode + ): + warnings.warn( + "setting 'recentHoursThresholdForPartialCompact' below the data " + "retention period (%s hours) prevents past time travel. " + "It's now set to match the retention period. " + "Use -f to override." % props.table.acid_data_retain_hours + ) + recent_hours = props.table.acid_data_retain_hours + recent_hours = recent_hours or -1 + hints["odps.merge.txn.table.compact.txn.id"] = str(recent_hours) + + task.update_settings(hints) + + return cls._submit_merge_task( + odps, + task, + project=project, + priority=priority, + running_cluster=running_cluster, + create_callback=create_callback, + ) + + execute_merge_files = build_execute_method( + run_merge_files, + """ + Execute a task to merge multiple files in tables and wait for termination. + """, + ) + + @classmethod + def run_archive_table( + cls, + odps, + table, + partition=None, + project=None, + schema=None, + hints=None, + priority=None, + running_cluster=None, + quota_name=None, + create_callback=None, + ): + """ + Start running a task to archive tables. + + :param table: name of the table to archive + :param partition: partition to archive + :param project: project name, if not provided, will be the default project + :param hints: settings for table archive task. 
+ :param priority: instance priority, 9 as default + :return: instance + :rtype: :class:`odps.models.Instance` + """ + task, props = cls._create_base_merge_task( + odps, + table, + partition=partition, + project=project, + schema=schema, + hints=hints, + name_prefix="archive_task", + quota_name=quota_name, + ) + task._update_property_json("archiveSettings", {"odps.merge.archive.flag": True}) + return cls._submit_merge_task( + odps, + task, + project=project, + priority=priority, + running_cluster=running_cluster, + create_callback=create_callback, + ) + + execute_archive_table = build_execute_method( + run_archive_table, + """ + Execute a task to archive tables and wait for termination. + """, + ) + + @classmethod + def run_freeze_command( + cls, + odps, + table, + partition=None, + command=None, + project=None, + schema=None, + hints=None, + priority=None, + running_cluster=None, + quota_name=None, + create_callback=None, + ): + """ + Start running a task to freeze or restore tables. + + :param table: name of the table to archive + :param partition: partition to archive + :param command: freeze command to execute, can be freeze or restore + :param project: project name, if not provided, will be the default project + :param hints: settings for table archive task. + :param priority: instance priority, 9 as default + :return: instance + :rtype: :class:`odps.models.Instance` + """ + task, props = cls._create_base_merge_task( + odps, + table, + partition=partition, + project=project, + schema=schema, + hints=hints, + quota_name=quota_name, + name_prefix=command.lower() + "_task", + ) + + hints = hints or dict() + hints["odps.merge.cold.storage.mode"] = _FREEZE_TYPES[command.lower()] + task.update_settings(hints) + + return cls._submit_merge_task( + odps, + task, + project=project, + priority=priority, + running_cluster=running_cluster, + create_callback=create_callback, + ) + + execute_freeze_command = build_execute_method( + run_freeze_command, + """ + Execute a task to archive tables and wait for termination. + """, + ) + + @classmethod + def submit_alter_table_instance( + cls, + odps, + sql, + project=None, + schema=None, + priority=None, + running_cluster=None, + hints=None, + quota_name=None, + create_callback=None, + ): + command_to_call = [ + (_ARCHIVE_TABLE_REGEX, cls.run_archive_table), + (_FREEZE_COMMAND_REGEX, cls.run_freeze_command), + (_MERGE_SMALL_FILES_REGEX, cls.run_merge_files), + ] + for cmd_regex, run_cmd in command_to_call: + cmd_regex_match = cmd_regex.match(sql) + if cmd_regex_match: + kwargs = cmd_regex_match.groupdict().copy() + kwargs.update( + { + "project": project, + "schema": schema, + "hints": hints, + "running_cluster": running_cluster, + "priority": priority, + "quota_name": quota_name, + "create_callback": create_callback, + } + ) + return run_cmd(odps, **kwargs) + return None diff --git a/odps/lab_extension/pyodps-lab-extension/__init__.py b/odps/models/tasks/misc.py similarity index 50% rename from odps/lab_extension/pyodps-lab-extension/__init__.py rename to odps/models/tasks/misc.py index 77c5bc7a..206f7c89 100644 --- a/odps/lab_extension/pyodps-lab-extension/__init__.py +++ b/odps/models/tasks/misc.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,15 +13,21 @@ # limitations under the License. 
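# Illustrative sketch (not part of the patch): submit_alter_table_instance() above lets
# "alter table ... merge smallfiles / compact / archive / freeze" statements run as merge
# tasks instead of plain SQL. Roughly, assuming an ODPS entry object `o`:
from odps.models.tasks import MergeTask

inst = MergeTask.submit_alter_table_instance(
    o, "alter table my_table partition (pt='1') compact major;"
)
# -> routed to MergeTask.run_merge_files(o, table="my_table", partition="pt='1'",
#    compact_type="major", ...); returns None if the statement matches no known pattern.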
import json -from pathlib import Path -from ._version import __version__ +from ... import serializers +from .core import Task, format_cdata -HERE = Path(__file__).parent.resolve() -with (HERE / "labextension" / "package.json").open() as fid: - data = json.load(fid) +class CupidTask(Task): + _root = "CUPID" + plan = serializers.XMLNodeField("Plan", serialize_callback=format_cdata) -def _jupyter_labextension_paths(): - return [{"src": "labextension", "dest": data["name"]}] + def __init__(self, name=None, plan=None, hints=None, **kwargs): + kwargs["name"] = name + kwargs["plan"] = plan + super(CupidTask, self).__init__(**kwargs) + hints = hints or {} + self.set_property("type", "cupid") + if hints: + self.set_property("settings", json.dumps(hints)) diff --git a/odps/models/tasks/sql.py b/odps/models/tasks/sql.py new file mode 100644 index 00000000..c0a17427 --- /dev/null +++ b/odps/models/tasks/sql.py @@ -0,0 +1,128 @@ +# Copyright 1999-2024 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import sys +import warnings +from collections import OrderedDict + +from ... import serializers, utils +from ...compat import six +from ...config import options +from .core import Task, format_cdata + + +def collect_sql_settings(value, glob): + from ... import __version__ + + settings = OrderedDict() + if options.default_task_settings: + settings = options.default_task_settings + + settings["PYODPS_VERSION"] = __version__ + settings["PYODPS_PYTHON_VERSION"] = sys.version + + if glob: + if options.sql.use_odps2_extension: + settings["odps.sql.type.system.odps2"] = True + if options.local_timezone is not None: + if not options.local_timezone: + settings["odps.sql.timezone"] = "Etc/GMT" + elif isinstance(options.local_timezone, bool): + from ...lib import tzlocal + + zone = tzlocal.get_localzone() + settings["odps.sql.timezone"] = utils.get_zone_name(zone) + elif isinstance(options.local_timezone, six.string_types): + settings["odps.sql.timezone"] = options.local_timezone + else: + zone = options.local_timezone + zone_str = utils.get_zone_name(zone) + if zone_str is None: + warnings.warn( + "Failed to get timezone string from options.local_timezone. " + "You need to deal with timezone in the return data yourself." 
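# Illustrative sketch (not part of the patch): collect_sql_settings() here maps
# options.local_timezone onto the "odps.sql.timezone" setting, e.g.:
from odps.config import options
from odps.models.tasks.sql import collect_sql_settings

options.local_timezone = "Asia/Shanghai"  # a zone name string is passed through as-is
assert collect_sql_settings(None, True)["odps.sql.timezone"] == "Asia/Shanghai"
options.local_timezone = False            # False means "use Etc/GMT"
assert collect_sql_settings(None, True)["odps.sql.timezone"] == "Etc/GMT"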
+ ) + else: + settings["odps.sql.timezone"] = zone_str + if options.sql.settings: + settings.update(options.sql.settings) + if value: + settings.update(value) + return settings + + +class SQLTask(Task): + __slots__ = ("_anonymous_sql_task_name",) + + _root = "SQL" + _anonymous_sql_task_name = "AnonymousSQLTask" + + query = serializers.XMLNodeField( + "Query", serialize_callback=lambda s: format_cdata(s, True) + ) + + def __init__(self, **kwargs): + if "name" not in kwargs: + kwargs["name"] = SQLTask._anonymous_sql_task_name + super(SQLTask, self).__init__(**kwargs) + + def serial(self): + if self.properties is None: + self.properties = OrderedDict() + + key = "settings" + if key not in self.properties: + self.properties[key] = '{"odps.sql.udf.strict.mode": "true"}' + + return super(SQLTask, self).serial() + + def update_sql_settings(self, value=None, glob=True): + settings = collect_sql_settings(value, glob) + self.update_settings(settings) + + def update_aliases(self, value): + self._update_property_json("aliases", value) + + @property + def warnings(self): + return json.loads(self.get_info("warnings")).get("warnings") + + +class SQLCostTask(Task): + __slots__ = ("_anonymous_sql_cost_task_name",) + + _root = "SQLCost" + _anonymous_sql_cost_task_name = "AnonymousSQLCostTask" + + query = serializers.XMLNodeField( + "Query", serialize_callback=lambda s: format_cdata(s, True) + ) + + def __init__(self, **kwargs): + if "name" not in kwargs: + kwargs["name"] = self._anonymous_sql_cost_task_name + super(SQLCostTask, self).__init__(**kwargs) + + def update_sql_cost_settings(self, value=None, glob=True): + settings = collect_sql_settings(value, glob) + self.update_settings(settings) + + +class SQLRTTask(Task): + _root = "SQLRT" + + def update_sql_rt_settings(self, value=None, glob=True): + settings = collect_sql_settings(value, glob) + self.update_settings(settings) diff --git a/odps/tunnel/pdio/errno.py b/odps/models/tasks/tests/__init__.py similarity index 72% rename from odps/tunnel/pdio/errno.py rename to odps/models/tasks/tests/__init__.py index 7f794641..7d6a6d7f 100644 --- a/odps/tunnel/pdio/errno.py +++ b/odps/models/tasks/tests/__init__.py @@ -1,5 +1,4 @@ -# -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,10 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -BD_SUCCESS = 0 -BD_BUFFER_EXHAUSTED = 1 -BD_CHECKSUM_INVALID = 2 -BD_COUNT_NOT_MATCH = 3 -BD_INVALID_STREAM_DATA = 4 -BD_INVALID_PB_TAG = 5 diff --git a/odps/models/tasks/tests/test_merge.py b/odps/models/tasks/tests/test_merge.py new file mode 100644 index 00000000..f0015d76 --- /dev/null +++ b/odps/models/tasks/tests/test_merge.py @@ -0,0 +1,133 @@ +import json +import os + +import pytest + +from ....tests.core import tn, wait_filled +from ....utils import to_text +from .. 
import MergeTask, Task + +merge_template = """ + + %(name)s + + + settings + {"odps.merge.cross.paths": "true"} + + + %(table)s + +""" + + +@pytest.fixture +def test_table(odps): + _, table_suffix = os.environ["PYTEST_CURRENT_TEST"].rsplit("::", 1) + table_suffix, _ = table_suffix.split(" ", 1) + table_name = tn("pyodps_test_merge_task_table_" + table_suffix) + if odps.exist_table(table_name): + odps.delete_table(table_name) + + table = odps.create_table(table_name, ("col string", "part1 string, part2 string")) + table.create_partition("part1=1,part2=1", if_not_exists=True) + odps.write_table(table_name, [("col_name",)], partition="part1=1,part2=1") + + try: + yield table_name + finally: + odps.delete_table(table_name) + + +def test_merge_task_to_xml(): + task = MergeTask("task_1", table="table_name") + task.update_settings({"odps.merge.cross.paths": True}) + to_xml = task.serialize() + right_xml = merge_template % dict(name="task_1", table="table_name") + + assert to_text(to_xml) == to_text(right_xml) + + task = Task.parse(None, to_xml) + assert isinstance(task, MergeTask) + + +def test_run_merge(odps, test_table): + inst = odps.run_merge_files(test_table, 'part1=1, part2="1"') + wait_filled(lambda: inst.tasks) + task = inst.tasks[0] + assert isinstance(task, MergeTask) + try: + inst.stop() + except: + pass + + inst = odps.run_sql( + "alter table %s partition (part1=1,part2=1) merge smallfiles;" % test_table + ) + wait_filled(lambda: inst.tasks) + task = inst.tasks[0] + assert isinstance(task, MergeTask) + try: + inst.stop() + except: + pass + + +def test_run_compact(odps, test_table): + inst = odps.run_sql( + "alter table %s partition (part1=1,part2=1) compact major;" % test_table + ) + wait_filled(lambda: inst.tasks) + task = inst.tasks[0] + assert isinstance(task, MergeTask) + assert ( + json.loads(task.properties["settings"])["odps.merge.txn.table.compact"] + == "major_compact" + ) + try: + inst.stop() + except: + pass + + inst = odps.run_sql( + "alter table %s partition (part1=1,part2=1) compact minor -h 5 -f;" % test_table + ) + wait_filled(lambda: inst.tasks) + task = inst.tasks[0] + assert isinstance(task, MergeTask) + settings_dict = json.loads(task.properties["settings"]) + assert settings_dict["odps.merge.txn.table.compact"] == "minor_compact" + assert settings_dict["odps.merge.txn.table.compact.txn.id"] == "5" + try: + inst.stop() + except: + pass + + +def test_run_archive(odps, test_table): + inst = odps.run_sql( + "alter table %s partition (part1=1,part2=1) archive;" % test_table + ) + wait_filled(lambda: inst.tasks) + task = inst.tasks[0] + assert isinstance(task, MergeTask) + assert "archiveSettings" in task.properties + try: + inst.stop() + except: + pass + + +def test_run_freeze(odps, test_table): + inst = odps.run_sql( + "alter table %s partition (part1=1,part2=1) freeze;" % test_table + ) + wait_filled(lambda: inst.tasks) + task = inst.tasks[0] + assert isinstance(task, MergeTask) + settings_dict = json.loads(task.properties["settings"]) + assert settings_dict["odps.merge.cold.storage.mode"] == "backup" + try: + inst.stop() + except: + pass diff --git a/odps/models/tests/test_tasks.py b/odps/models/tasks/tests/test_tasks.py similarity index 62% rename from odps/models/tests/test_tasks.py rename to odps/models/tasks/tests/test_tasks.py index 67c979e9..ead60b00 100644 --- a/odps/models/tests/test_tasks.py +++ b/odps/models/tasks/tests/test_tasks.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. 
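# Illustrative sketch (not part of the patch): the test_table fixture above derives a
# per-test table name from pytest's PYTEST_CURRENT_TEST variable, which has the form
# "path/to/test_file.py::test_name (stage)". The same parsing in isolation:
current = "odps/models/tasks/tests/test_merge.py::test_run_compact (call)"
_, table_suffix = current.rsplit("::", 1)
table_suffix, _ = table_suffix.split(" ", 1)
assert table_suffix == "test_run_compact"  # unique suffix per test function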
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,11 +17,10 @@ import pytest -from ...errors import ODPSError -from ...config import options -from ...tests.core import tn, wait_filled -from ...utils import get_zone_name, to_text -from .. import CupidTask, MaxFrameTask, MergeTask, SQLCostTask, SQLTask, Task +from ....config import options +from ....errors import ODPSError +from ....utils import get_zone_name, to_text +from .. import CupidTask, MaxFrameTask, SQLCostTask, SQLTask, Task try: import zoneinfo @@ -33,7 +32,7 @@ pytz = None -sql_template = ''' +sql_template = """ AnonymousSQLTask @@ -44,9 +43,9 @@ -''' +""" -sql_tz_template = ''' +sql_tz_template = """ AnonymousSQLTask @@ -57,22 +56,9 @@ -''' +""" -merge_template = ''' - - %(name)s - - - settings - {"odps.merge.cross.paths": "true"} - - - %(table)s - -''' - -cupid_template = ''' +cupid_template = """ task_1 @@ -87,16 +73,16 @@ -''' +""" -sql_cost_template = ''' +sql_cost_template = """ AnonymousSQLCostTask -''' +""" -maxframe_template = ''' +maxframe_template = """ AnonymousMaxFrameTask @@ -107,9 +93,9 @@ CREATE_SESSION -''' +""" -mf_ray_cluster_init_template = ''' +mf_ray_cluster_init_template = """ AnonymousMaxFrameTask @@ -132,14 +118,14 @@ RAY_CLUSTER_INIT -''' +""" def test_task_class_type(): - typed = Task(type='SQL', query='select * from dual') + typed = Task(type="SQL", query="select * from dual") assert isinstance(typed, SQLTask) - unknown_typed = Task(type='UnknownType') + unknown_typed = Task(type="UnknownType") assert type(unknown_typed) is Task pytest.raises(ODPSError, lambda: unknown_typed.serialize()) @@ -149,11 +135,11 @@ def test_task_class_type(): def test_sql_task_to_xml(): - query = 'select * from dual' + query = "select * from dual" task = SQLTask(query=query) to_xml = task.serialize() - right_xml = sql_template % {'sql': query} + right_xml = sql_template % {"sql": query} assert to_text(to_xml) == to_text(right_xml) @@ -161,13 +147,16 @@ def test_sql_task_to_xml(): assert isinstance(task, SQLTask) -@pytest.mark.skipif(pytz is None and zoneinfo is None, reason='pytz not installed') +@pytest.mark.skipif(pytz is None and zoneinfo is None, reason="pytz not installed") def test_sql_task_to_xml_timezone(): - from ... import __version__ - from ...lib import tzlocal + from .... 
import __version__ + from ....lib import tzlocal - query = 'select * from dual' - versions = {"pyodps_version": __version__, "python_version": json.dumps(sys.version)} + query = "select * from dual" + versions = { + "pyodps_version": __version__, + "python_version": json.dumps(sys.version), + } def _format_template(**kwargs): kwargs.update(versions) @@ -188,12 +177,12 @@ def _format_template(**kwargs): task = SQLTask(query=query) task.update_sql_settings() to_xml = task.serialize() - right_xml = _format_template(sql=query, tz='Etc/GMT') + right_xml = _format_template(sql=query, tz="Etc/GMT") assert to_text(to_xml) == to_text(right_xml) if zoneinfo: - options.local_timezone = zoneinfo.ZoneInfo('Asia/Shanghai') + options.local_timezone = zoneinfo.ZoneInfo("Asia/Shanghai") task = SQLTask(query=query) task.update_sql_settings() to_xml = task.serialize() @@ -202,7 +191,7 @@ def _format_template(**kwargs): assert to_text(to_xml) == to_text(right_xml) if pytz: - options.local_timezone = pytz.timezone('Asia/Shanghai') + options.local_timezone = pytz.timezone("Asia/Shanghai") task = SQLTask(query=query) task.update_sql_settings() to_xml = task.serialize() @@ -213,66 +202,8 @@ def _format_template(**kwargs): options.local_timezone = None -def test_merge_task_to_xml(): - task = MergeTask('task_1', table='table_name') - task.update_settings({'odps.merge.cross.paths': True}) - to_xml = task.serialize() - right_xml = merge_template % dict(name='task_1', table='table_name') - - assert to_text(to_xml) == to_text(right_xml) - - task = Task.parse(None, to_xml) - assert isinstance(task, MergeTask) - - -def test_run_merge_task(odps): - table_name = tn('pyodps_test_merge_task_table') - if odps.exist_table(table_name): - odps.delete_table(table_name) - - table = odps.create_table(table_name, ('col string', 'part1 string, part2 string')) - table.create_partition('part1=1,part2=1', if_not_exists=True) - odps.write_table(table_name, [('col_name', )], partition='part1=1,part2=1') - - inst = odps.run_merge_files(table_name, 'part1=1, part2="1"') - wait_filled(lambda: inst.tasks) - task = inst.tasks[0] - assert isinstance(task, MergeTask) - - try: - inst.stop() - except: - pass - - inst = odps.run_sql( - 'alter table %s partition (part1=1,part2=1) merge smallfiles;' % table_name - ) - wait_filled(lambda: inst.tasks) - task = inst.tasks[0] - assert isinstance(task, MergeTask) - - try: - inst.stop() - except: - pass - - inst = odps.run_sql( - 'alter table %s partition (part1=1,part2=1) compact major;' % table_name - ) - wait_filled(lambda: inst.tasks) - task = inst.tasks[0] - assert isinstance(task, MergeTask) - assert json.loads(task.properties["settings"])["odps.merge.txn.table.compact"] == "major_compact" - try: - inst.stop() - except: - pass - - odps.delete_table(table_name) - - def test_cupid_task_to_xml(): - task = CupidTask('task_1', 'plan_text', {'odps.cupid.wait.am.start.time': 600}) + task = CupidTask("task_1", "plan_text", {"odps.cupid.wait.am.start.time": 600}) to_xml = task.serialize() right_xml = cupid_template @@ -283,10 +214,10 @@ def test_cupid_task_to_xml(): def test_sql_cost_task_to_xml(): - query = 'select * from dual' + query = "select * from dual" task = SQLCostTask(query=query) to_xml = task.serialize() - right_xml = sql_cost_template % {'sql': query} + right_xml = sql_cost_template % {"sql": query} assert to_text(to_xml) == to_text(right_xml) @@ -298,7 +229,7 @@ def test_maxframe_task_to_xml(odps): task = MaxFrameTask(service_endpoint=odps.endpoint) 
task.update_settings({"odps.maxframe.output_format": "maxframe_v1"}) to_xml = task.serialize() - right_xml = maxframe_template % {'endpoint': odps.endpoint} + right_xml = maxframe_template % {"endpoint": odps.endpoint} assert to_text(to_xml) == to_text(right_xml) @@ -325,4 +256,4 @@ def test_ray_cluster_init(odps): assert to_text(to_xml) == to_text(right_xml) task = Task.parse(None, to_xml) assert isinstance(task, MaxFrameTask) - assert task.command == MaxFrameTask.CommandType.RAY_CLUSTER_INIT \ No newline at end of file + assert task.command == MaxFrameTask.CommandType.RAY_CLUSTER_INIT diff --git a/odps/models/tenant.py b/odps/models/tenant.py index c61b0793..20f32704 100644 --- a/odps/models/tenant.py +++ b/odps/models/tenant.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,13 +16,13 @@ from datetime import datetime from .. import serializers -from ..compat import enum, TimeoutError -from ..errors import ODPSError, InternalServerError +from ..compat import TimeoutError, enum +from ..errors import InternalServerError, ODPSError from .core import JSONRemoteModel class Tenant(JSONRemoteModel): - __slots__ = "_loaded", + __slots__ = ("_loaded",) class State(enum.Enum): NORMAL = "NORMAL" @@ -35,7 +35,9 @@ class Meta(JSONRemoteModel): owner_id = serializers.JSONNodeField("OwnerId", set_to_parent=True) tenant_id = serializers.JSONNodeField("TenantId", set_to_parent=True) tenant_state = serializers.JSONNodeField( - "TenantState", parse_callback=lambda x: Tenant.State(x.upper()), set_to_parent=True + "TenantState", + parse_callback=lambda x: Tenant.State(x.upper()), + set_to_parent=True, ) creation_time = serializers.JSONNodeField( "CreateTime", parse_callback=datetime.fromtimestamp, set_to_parent=True @@ -76,7 +78,7 @@ def _getattr(self, attr): def __getattribute__(self, attr): val = object.__getattribute__(self, attr) if val is None and not self._getattr("_loaded"): - fields = getattr(type(self), '__fields') + fields = getattr(type(self), "__fields") if attr in fields: self.reload() return object.__getattribute__(self, attr) @@ -84,15 +86,17 @@ def __getattribute__(self, attr): @property def create_time(self): warnings.warn( - 'Tenant.create_time is deprecated and will be replaced ' - 'by Tenant.creation_time.', + "Tenant.create_time is deprecated and will be replaced " + "by Tenant.creation_time.", DeprecationWarning, stacklevel=3, ) return self.creation_time def resource(self, client=None, endpoint=None): - endpoint = endpoint if endpoint is not None else (client or self._client).endpoint + endpoint = ( + endpoint if endpoint is not None else (client or self._client).endpoint + ) return endpoint + "/tenants" def reload(self): diff --git a/odps/models/tests/test_functions.py b/odps/models/tests/test_functions.py index 999fac5b..a2b4dc8f 100644 --- a/odps/models/tests/test_functions.py +++ b/odps/models/tests/test_functions.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -52,7 +52,7 @@ def test_functions(odps): def test_function_exists(odps): - non_exists_function = 'a_non_exists_function' + non_exists_function = "a_non_exists_function" assert odps.exist_function(non_exists_function) is False @@ -65,25 +65,25 @@ def test_function(odps): assert function is odps.get_function(function.name) - assert function._getattr('name') is not None - assert function._getattr('owner') is not None - assert function._getattr('creation_time') is not None - assert function._getattr('class_type') is not None - assert function._getattr('_resources') is not None + assert function._getattr("name") is not None + assert function._getattr("owner") is not None + assert function._getattr("creation_time") is not None + assert function._getattr("class_type") is not None + assert function._getattr("_resources") is not None def test_create_delete_update_function(config, odps): try: - secondary_project = config.get('test', 'secondary_project') - secondary_user = config.get('test', 'secondary_user') + secondary_project = config.get("test", "secondary_project") + secondary_user = config.get("test", "secondary_user") except ConfigParser.NoOptionError: secondary_project = secondary_user = None - test_resource_name = tn('pyodps_t_tmp_test_function_resource') + '.py' - test_resource_name2 = tn('pyodps_t_tmp_test_function_resource2') + '.py' - test_resource_name3 = tn('pyodps_t_tmp_test_function_resource3') + '.py' - test_function_name = tn('pyodps_t_tmp_test_function') - test_function_name3 = tn('pyodps_t_tmp_test_function3') + test_resource_name = tn("pyodps_t_tmp_test_function_resource") + ".py" + test_resource_name2 = tn("pyodps_t_tmp_test_function_resource2") + ".py" + test_resource_name3 = tn("pyodps_t_tmp_test_function_resource3") + ".py" + test_function_name = tn("pyodps_t_tmp_test_function") + test_function_name3 = tn("pyodps_t_tmp_test_function3") try: odps.delete_resource(test_resource_name) @@ -109,13 +109,13 @@ def test_create_delete_update_function(config, odps): pass test_resource = odps.create_resource( - test_resource_name, 'py', file_obj=FUNCTION_CONTENT + test_resource_name, "py", file_obj=FUNCTION_CONTENT ) test_function = odps.create_function( test_function_name, - class_type=test_resource_name.split('.', 1)[0]+'.MyPlus', - resources=[test_resource] + class_type=test_resource_name.split(".", 1)[0] + ".MyPlus", + resources=[test_resource], ) assert test_function.name is not None @@ -124,13 +124,13 @@ def test_create_delete_update_function(config, odps): assert test_function.class_type is not None assert len(test_function.resources) == 1 - with odps.open_resource(name=test_resource_name, mode='r') as fp: + with odps.open_resource(name=test_resource_name, mode="r") as fp: assert to_text(fp.read()) == to_text(FUNCTION_CONTENT) assert test_function.owner != secondary_user test_resource2 = odps.create_resource( - test_resource_name2, 'file', file_obj='Hello World' + test_resource_name2, "file", file_obj="Hello World" ) test_function.resources.append(test_resource2) if secondary_user: @@ -149,13 +149,16 @@ def test_create_delete_update_function(config, odps): test_function3 = None if secondary_project: test_resource3 = odps.create_resource( - test_resource_name3, 'py', file_obj=FUNCTION_CONTENT, project=secondary_project + test_resource_name3, + "py", + file_obj=FUNCTION_CONTENT, + project=secondary_project, ) test_function3 = odps.create_function( test_function_name3, - class_type=test_resource_name3.split('.', 1)[0]+'.MyPlus', - resources=[test_resource3] + 
class_type=test_resource_name3.split(".", 1)[0] + ".MyPlus", + resources=[test_resource3], ) assert test_function3.name == test_function_name3 diff --git a/odps/models/tests/test_instances.py b/odps/models/tests/test_instances.py index a17076d0..986e12fa 100644 --- a/odps/models/tests/test_instances.py +++ b/odps/models/tests/test_instances.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,9 +16,9 @@ import itertools import json -import time import random import textwrap +import time from datetime import datetime, timedelta import mock @@ -29,14 +29,27 @@ import pandas as pd except ImportError: pd = None +try: + import pyarrow as pa +except ImportError: + pa = None -from ... import errors, compat, types as odps_types, utils, options +from ... import compat, errors, options +from ... import types as odps_types +from ... import utils from ...compat import six -from ...models import Instance, SQLTask, TableSchema from ...errors import ODPSError -from ...tests.core import tn, pandas_case, odps2_typed_case, wait_filled, flaky - -expected_xml_template = ''' +from ...tests.core import ( + flaky, + odps2_typed_case, + pandas_case, + pyarrow_case, + tn, + wait_filled, +) +from .. import Instance, SQLTask, TableSchema + +expected_xml_template = """ %(priority)s @@ -65,7 +78,7 @@ -''' +""" class TunnelLimitedInstance(Instance): @@ -75,7 +88,7 @@ def _open_tunnel_reader(self, **kw): self.wait_for_success() cls = type(self) if cls._exc is not None: - if not isinstance(cls._exc, errors.NoPermission) or not kw.get('limit'): + if not isinstance(cls._exc, errors.NoPermission) or not kw.get("limit"): raise cls._exc return super(TunnelLimitedInstance, self)._open_tunnel_reader(**kw) @@ -95,26 +108,33 @@ def test_instances(odps): size = len(list(itertools.islice(odps.list_instances(), 0, 5))) assert size >= 0 - instances = list(itertools.islice( - odps.list_instances(status='running', only_owner=True), 0, 5)) + instances = list( + itertools.islice(odps.list_instances(status="running", only_owner=True), 0, 5) + ) assert len(instances) >= 0 if len(instances) > 0: # fix: use _status instead of status to prevent from fetching the instance which is just terminated - assert all(instance._status == Instance.Status.RUNNING for instance in instances) is True + assert ( + all(instance._status == Instance.Status.RUNNING for instance in instances) + is True + ) assert len(set(instance.owner for instance in instances)) == 1 start_time = time.time() - 10 * 24 * 3600 end_time = time.time() - 24 * 3600 instances = list( - itertools.islice(odps.list_instances(start_time=start_time, end_time=end_time), 0, 5) + itertools.islice( + odps.list_instances(start_time=start_time, end_time=end_time), 0, 5 + ) ) assert len(instances) >= 0 def test_list_instances_in_page(odps): - test_table = tn('pyodps_t_tmp_list_instances_in_page') + test_table = tn("pyodps_t_tmp_list_instances_in_page") - delay_udf = textwrap.dedent(""" + delay_udf = textwrap.dedent( + """ from odps.udf import annotate import sys import time @@ -128,49 +148,54 @@ def evaluate(self, arg0): print('End Logging') sys.stdout.flush() return arg0 - """) - resource_name = tn('test_delayer_function_resource') - function_name = tn('test_delayer_function') + """ + ) + resource_name = tn("test_delayer_function_resource") + 
function_name = tn("test_delayer_function") - if odps.exist_resource(resource_name + '.py'): - odps.delete_resource(resource_name + '.py') - res = odps.create_resource(resource_name + '.py', 'py', file_obj=delay_udf) + if odps.exist_resource(resource_name + ".py"): + odps.delete_resource(resource_name + ".py") + res = odps.create_resource(resource_name + ".py", "py", file_obj=delay_udf) if odps.exist_function(function_name): odps.delete_function(function_name) fun = odps.create_function( - function_name, class_type=resource_name + '.Delayer', resources=[res] + function_name, class_type=resource_name + ".Delayer", resources=[res] ) data = [[random.randint(0, 1000)] for _ in compat.irange(100)] odps.delete_table(test_table, if_exists=True) - t = odps.create_table(test_table, TableSchema.from_lists(['num'], ['bigint'])) + t = odps.create_table(test_table, TableSchema.from_lists(["num"], ["bigint"])) odps.write_table(t, data) instance = odps.run_sql( - "select sum({0}(num)), 1 + '1' as warn_col from {1} group by num" - .format(function_name, test_table) + "select sum({0}(num)), 1 + '1' as warn_col from {1} group by num".format( + function_name, test_table + ) ) try: assert instance.status == Instance.Status.RUNNING assert instance.id in [ - it.id for it in odps.get_project().instances.iterate( + it.id + for it in odps.get_project().instances.iterate( status=Instance.Status.RUNNING, start_time=datetime.now() - timedelta(days=2), end_time=datetime.now() + timedelta(days=1), - max_items=20 + max_items=20, ) ] wait_filled(lambda: instance.tasks) task = instance.tasks[0] - task.put_info('testInfo', 'TestInfo') + task.put_info("testInfo", "TestInfo") + with pytest.raises(errors.EmptyTaskInfoError): + task.put_info("testInfo", "TestInfo", raise_empty=True) assert task.warnings is not None wait_filled(lambda: task.workers, 30) wait_filled(lambda: [w.log_id for w in task.workers if w.log_id], 30) - assert task.workers[0].get_log('stdout') is not None + assert task.workers[0].get_log("stdout") is not None finally: try: instance.stop() @@ -182,7 +207,7 @@ def evaluate(self, arg0): def test_instance_exists(odps): - non_exists_instance = 'a_non_exists_instance' + non_exists_instance = "a_non_exists_instance" assert odps.exist_instance(non_exists_instance) is False @@ -192,11 +217,11 @@ def test_instance(odps): assert instance is odps.get_instance(instance.name) - assert instance._getattr('name') is not None - assert instance._getattr('owner') is not None - assert instance._getattr('start_time') is not None - assert instance._getattr('end_time') is not None - assert instance._getattr('_status') is not None + assert instance._getattr("name") is not None + assert instance._getattr("owner") is not None + assert instance._getattr("start_time") is not None + assert instance._getattr("end_time") is not None + assert instance._getattr("_status") is not None assert instance._status == Instance.Status.TERMINATED instance.reload() @@ -211,7 +236,7 @@ def test_instance(odps): assert task_status.status in ( Instance.Task.TaskStatus.CANCELLED, Instance.Task.TaskStatus.FAILED, - Instance.Task.TaskStatus.SUCCESS + Instance.Task.TaskStatus.SUCCESS, ) for task_status in instance._tasks: assert task_status.name in task_names @@ -230,22 +255,21 @@ def test_instance(odps): def test_create_instance_xml(odps): instances = odps._project.instances - uuid = '359696d4-ac73-4e6c-86d1-6649b01f1a22' - query = 'select * from dual if fake < 1;' + uuid = "359696d4-ac73-4e6c-86d1-6649b01f1a22" + query = "select * from dual if fake < 1;" 
priority = 5 try: - options.biz_id = '012345' + options.biz_id = "012345" task = SQLTask(query=query) - job = instances._create_job( - task=task, priority=priority, uuid_=uuid) + job = instances._create_job(task=task, priority=priority, uuid_=uuid) xml = instances._get_submit_instance_content(job) expected_xml = expected_xml_template % { - 'query': query, - 'uuid': uuid, - 'priority': priority, - 'biz_id': options.biz_id, + "query": query, + "uuid": uuid, + "priority": priority, + "biz_id": options.biz_id, } assert utils.to_str(xml) == utils.to_str(expected_xml) finally: @@ -253,26 +277,27 @@ def test_create_instance_xml(odps): def test_create_instance(odps): - test_table = tn('pyodps_t_tmp_create_instance') + test_table = tn("pyodps_t_tmp_create_instance") - task = SQLTask(query='drop table if exists %s' % test_table) + task = SQLTask(query="drop table if exists %s" % test_table) instance = odps._project.instances.create(task=task) + assert instance.get_sql_query().rstrip(";") == task.query.rstrip(";") instance.wait_for_completion() assert instance.is_successful() is True assert odps.exist_table(test_table) is False assert instance.start_time < datetime.now() assert instance.start_time > datetime.now() - timedelta(hours=1) - task = SQLTask(query='create table %s(id string);' % test_table) + task = SQLTask(query="create table %s(id string);" % test_table) instance = odps._project.instances.create(task=task) instance.wait_for_completion() assert instance.is_successful() is True assert odps.exist_table(test_table) is True - instance = odps.execute_sql('select id `中文标题` from %s' % test_table) + instance = odps.execute_sql("select id `中文标题` from %s" % test_table) assert instance.is_successful() is True - instance = odps.execute_sql('drop table %s' % test_table) + instance = odps.execute_sql("drop table %s" % test_table) assert instance.is_successful() is True assert odps.exist_table(test_table) is False @@ -288,26 +313,29 @@ def test_create_instance(odps): def test_read_sql_instance(odps): - test_table = tn('pyodps_t_tmp_read_sql_instance') + test_table = tn("pyodps_t_tmp_read_sql_instance") odps.delete_table(test_table, if_exists=True) table = odps.create_table( - test_table, TableSchema.from_lists(['size'], ['bigint']), if_not_exists=True) - odps.write_table( - table, 0, [table.new_record([1]), table.new_record([2])]) - odps.write_table(table, [table.new_record([3]), ]) + test_table, TableSchema.from_lists(["size"], ["bigint"]), if_not_exists=True + ) + odps.write_table(table, 0, [table.new_record([1]), table.new_record([2])]) + odps.write_table(table, [table.new_record([3])]) - instance = odps.execute_sql('select * from %s' % test_table) + instance = odps.execute_sql("select * from %s" % test_table) with instance.open_reader(table.table_schema) as reader: assert len(list(reader[::2])) == 2 with instance.open_reader(table.table_schema) as reader: assert len(list(reader[1::2])) == 1 - hints = {'odps.sql.mapper.split.size': '16'} - instance = odps.run_sql('select sum(size) as count from %s' % test_table, hints=hints) + hints = {"odps.sql.mapper.split.size": "16"} + instance = odps.run_sql( + "select sum(size) as count from %s" % test_table, hints=hints + ) while ( - len(instance.get_task_names()) == 0 or - compat.lvalues(instance.get_task_statuses())[0].status == Instance.Task.TaskStatus.WAITING + len(instance.get_task_names()) == 0 + or compat.lvalues(instance.get_task_statuses())[0].status + == Instance.Task.TaskStatus.WAITING ): continue @@ -325,27 +353,30 @@ def 
test_read_sql_instance(odps): instance.wait_for_completion(timeout=3, max_interval=3) instance.wait_for_success() - assert json.loads( - instance.tasks[0].properties['settings'] - )['odps.sql.mapper.split.size'] == hints['odps.sql.mapper.split.size'] + assert ( + json.loads(instance.tasks[0].properties["settings"])[ + "odps.sql.mapper.split.size" + ] + == hints["odps.sql.mapper.split.size"] + ) assert instance.tasks[0].summary is not None with instance.open_reader( - TableSchema.from_lists(['count'], ['bigint']), tunnel=False + TableSchema.from_lists(["count"], ["bigint"]), tunnel=False ) as reader: records = list(reader) assert len(records) == 1 - assert records[0]['count'] == 6 + assert records[0]["count"] == 6 with instance.open_reader(tunnel=True) as reader: records = list(reader) assert len(records) == 1 - assert records[0]['count'] == 6 + assert records[0]["count"] == 6 with instance.open_reader(tunnel=False) as reader: records = list(reader) assert len(records) == 1 - assert records[0]['count'] == '6' + assert records[0]["count"] == "6" if pd is not None: with instance.open_reader(tunnel=True) as reader: @@ -360,58 +391,119 @@ def test_read_sql_instance(odps): pd_data = reader.to_pandas() assert len(pd_data) == 1 + if pa is not None: + with instance.open_reader(tunnel=True, arrow=True) as reader: + pd_data = reader.to_pandas() + assert len(pd_data) == 1 + table.drop() +@pandas_case +@pyarrow_case +def test_instance_to_pandas(odps): + test_table = tn("pyodps_t_tmp_inst_to_pandas") + odps.delete_table(test_table, if_exists=True) + data = pd.DataFrame( + [[0, 134, "a", "a"], [1, 24, "a", "b"], [2, 131, "a", "a"], [3, 141, "a", "b"]], + columns=["a", "b", "c", "d"], + ) + odps.write_table(test_table, data, create_table=True, lifecycle=1) + + instance = odps.execute_sql("select * from %s" % test_table) + + result = instance.to_pandas(columns=["a", "b"]) + pd.testing.assert_frame_equal(result, data[["a", "b"]]) + + # test fallback when arrow format not supported + raised_list = [False] + + def _new_to_pandas(self, *_, **__): + raised_list[0] = True + raise errors.ChecksumError("Checksum invalid") + + with mock.patch( + "odps.models.readers.TunnelArrowReader.to_pandas", new=_new_to_pandas + ): + result = instance.to_pandas(columns=["a", "b"]) + assert raised_list[0] + pd.testing.assert_frame_equal(result, data[["a", "b"]]) + + # test fallback when instance tunnel not supported + raised_list = [False] + + def _new_open_tunnel_reader(self, *_, **__): + raised_list[0] = True + raise errors.InvalidProjectTable("InvalidProjectTable") + + with mock.patch( + "odps.models.instance.Instance._open_tunnel_reader", new=_new_open_tunnel_reader + ): + result = instance.to_pandas(columns=["a", "b"]) + assert raised_list[0] + pd.testing.assert_frame_equal(result, data[["a", "b"]]) + + batches = [] + for batch in instance.iter_pandas(columns=["a", "b"], batch_size=2): + assert len(batch) == 2 + batches.append(batch) + assert len(batches) == 2 + + odps.delete_table(test_table, if_exists=True) + + def test_limited_instance_tunnel(odps): - test_table = tn('pyodps_t_tmp_limit_instance_tunnel') + test_table = tn("pyodps_t_tmp_limit_instance_tunnel") odps.delete_table(test_table, if_exists=True) table = odps.create_table( - test_table, TableSchema.from_lists(['size'], ['bigint']), if_not_exists=True) - odps.write_table( - table, 0, [table.new_record([1]), table.new_record([2])]) - odps.write_table(table, [table.new_record([3]), ]) + test_table, TableSchema.from_lists(["size"], ["bigint"]), if_not_exists=True + ) 
+ odps.write_table(table, 0, [table.new_record([1]), table.new_record([2])]) + odps.write_table(table, [table.new_record([3])]) - instance = odps.execute_sql('select * from %s' % test_table) - instance = TunnelLimitedInstance(client=instance._client, parent=instance.parent, - name=instance.id) + instance = odps.execute_sql("select * from %s" % test_table) + instance = TunnelLimitedInstance( + client=instance._client, parent=instance.parent, name=instance.id + ) - TunnelLimitedInstance._exc = errors.InvalidArgument('Mock fallback error') + TunnelLimitedInstance._exc = errors.InvalidArgument("Mock fallback error") pytest.raises(errors.InvalidArgument, instance.open_reader, tunnel=True) with instance.open_reader() as reader: - assert hasattr(reader, 'raw') is True + assert hasattr(reader, "raw") is True - TunnelLimitedInstance._exc = requests.Timeout('Mock timeout') + TunnelLimitedInstance._exc = requests.Timeout("Mock timeout") pytest.raises(requests.Timeout, instance.open_reader, tunnel=True) with instance.open_reader() as reader: - assert hasattr(reader, 'raw') is True + assert hasattr(reader, "raw") is True - TunnelLimitedInstance._exc = errors.InstanceTypeNotSupported('Mock instance not supported') + TunnelLimitedInstance._exc = errors.InstanceTypeNotSupported( + "Mock instance not supported" + ) pytest.raises(errors.InstanceTypeNotSupported, instance.open_reader, tunnel=True) with instance.open_reader() as reader: - assert hasattr(reader, 'raw') is True + assert hasattr(reader, "raw") is True - TunnelLimitedInstance._exc = errors.NoPermission('Mock permission error') + TunnelLimitedInstance._exc = errors.NoPermission("Mock permission error") pytest.raises(errors.NoPermission, instance.open_reader, limit=False) with instance.open_reader() as reader: - assert hasattr(reader, 'raw') is False + assert hasattr(reader, "raw") is False def test_read_sql_write(odps): - test_table = tn('pyodps_t_tmp_read_sql_instance_write') + test_table = tn("pyodps_t_tmp_read_sql_instance_write") odps.delete_table(test_table, if_exists=True) table = odps.create_table( - test_table, TableSchema.from_lists(['size'], ['bigint']), if_not_exists=True) - odps.write_table( - table, 0, [table.new_record([1]), table.new_record([2])]) - odps.write_table(table, [table.new_record([3]), ]) + test_table, TableSchema.from_lists(["size"], ["bigint"]), if_not_exists=True + ) + odps.write_table(table, 0, [table.new_record([1]), table.new_record([2])]) + odps.write_table(table, [table.new_record([3])]) - test_table2 = tn('pyodps_t_tmp_read_sql_instance_write2') + test_table2 = tn("pyodps_t_tmp_read_sql_instance_write2") odps.delete_table(test_table2, if_exists=True) table2 = odps.create_table(test_table2, table.table_schema) try: - with odps.execute_sql('select * from %s' % test_table).open_reader() as reader: + with odps.execute_sql("select * from %s" % test_table).open_reader() as reader: with table2.open_writer() as writer: for record in reader: writer.write(table2.new_record(record.values)) @@ -423,18 +515,35 @@ def test_read_sql_write(odps): def test_read_binary_sql_instance(odps): try: options.tunnel.string_as_binary = True - test_table = tn('pyodps_t_tmp_read_binary_sql_instance') + test_table = tn("pyodps_t_tmp_read_binary_sql_instance") odps.delete_table(test_table, if_exists=True) table = odps.create_table( test_table, - TableSchema.from_lists(['size', 'name'], ['bigint', 'string']), if_not_exists=True) + TableSchema.from_lists(["size", "name"], ["bigint", "string"]), + if_not_exists=True, + ) - data = [[1, 
u'中'.encode('utf-8') + b'\\\\n\\\n' + u'文'.encode('utf-8') + b' ,\r\xe9'], - [2, u'测试'.encode('utf-8') + b'\x00\x01\x02' + u'数据'.encode('utf-8') + b'\xe9']] - odps.write_table( - table, 0, [table.new_record(it) for it in data]) + data = [ + [ + 1, + u"中".encode("utf-8") + + b"\\\\n\\\n" + + u"文".encode("utf-8") + + b" ,\r\xe9", + ], + [ + 2, + u"测试".encode("utf-8") + + b"\x00\x01\x02" + + u"数据".encode("utf-8") + + b"\xe9", + ], + ] + odps.write_table(table, 0, [table.new_record(it) for it in data]) - with odps.execute_sql('select name from %s' % test_table).open_reader(tunnel=False) as reader: + with odps.execute_sql("select name from %s" % test_table).open_reader( + tunnel=False + ) as reader: read_data = sorted([r[0] for r in reader]) expected_data = sorted([r[1] for r in data]) @@ -446,19 +555,20 @@ def test_read_binary_sql_instance(odps): def test_read_non_ascii_sql_instance(odps): - test_table = tn('pyodps_t_tmp_read_non_ascii_sql_instance') + test_table = tn("pyodps_t_tmp_read_non_ascii_sql_instance") odps.delete_table(test_table, if_exists=True) table = odps.create_table( test_table, - TableSchema.from_lists(['size', 'name'], ['bigint', 'string']), + TableSchema.from_lists(["size", "name"], ["bigint", "string"]), if_not_exists=True, ) - data = [[1, '中\\\\n\\\n文 ,\r '], [2, '测试\x00\x01\x02数据']] - odps.write_table( - table, 0, [table.new_record(it) for it in data]) + data = [[1, "中\\\\n\\\n文 ,\r "], [2, "测试\x00\x01\x02数据"]] + odps.write_table(table, 0, [table.new_record(it) for it in data]) - with odps.execute_sql('select name from %s' % test_table).open_reader(tunnel=False) as reader: + with odps.execute_sql("select name from %s" % test_table).open_reader( + tunnel=False + ) as reader: read_data = sorted([utils.to_str(r[0]) for r in reader]) expected_data = sorted([utils.to_str(r[1]) for r in data]) @@ -468,23 +578,27 @@ def test_read_non_ascii_sql_instance(odps): def test_read_map_array_sql_instance(odps): - test_table = tn('pyodps_t_tmp_read_map_array_sql_instance') + test_table = tn("pyodps_t_tmp_read_map_array_sql_instance") odps.delete_table(test_table, if_exists=True) table = odps.create_table( test_table, TableSchema.from_lists( - ['idx', 'map_col', 'array_col'], - ['bigint', odps_types.Map(odps_types.string, odps_types.string), odps_types.Array(odps_types.string)], - ) + ["idx", "map_col", "array_col"], + [ + "bigint", + odps_types.Map(odps_types.string, odps_types.string), + odps_types.Array(odps_types.string), + ], + ), ) data = [ - [0, {'key1': 'value1', 'key2': 'value2'}, ['item1', 'item2', 'item3']], - [1, {'key3': 'value3', 'key4': 'value4'}, ['item4', 'item5']], + [0, {"key1": "value1", "key2": "value2"}, ["item1", "item2", "item3"]], + [1, {"key3": "value3", "key4": "value4"}, ["item4", "item5"]], ] odps.write_table(test_table, data) - inst = odps.execute_sql('select * from %s' % test_table) + inst = odps.execute_sql("select * from %s" % test_table) with inst.open_reader(table.table_schema, tunnel=False) as reader: read_data = [list(r.values) for r in reader] @@ -504,19 +618,17 @@ def test_read_map_array_sql_instance(odps): def test_sql_alias_instance(odps): - test_table = tn('pyodps_t_tmp_sql_aliases_instance') + test_table = tn("pyodps_t_tmp_sql_aliases_instance") odps.delete_table(test_table, if_exists=True) table = odps.create_table( - test_table, - TableSchema.from_lists(['size'], ['bigint']), - if_not_exists=True + test_table, TableSchema.from_lists(["size"], ["bigint"]), if_not_exists=True ) - data = [[1, ], ] + data = [[1]] odps.write_table(table, 0, data) - 
res_name1 = tn('pyodps_t_tmp_resource_1') - res_name2 = tn('pyodps_t_tmp_resource_2') + res_name1 = tn("pyodps_t_tmp_resource_1") + res_name2 = tn("pyodps_t_tmp_resource_2") try: odps.delete_resource(res_name1) except ODPSError: @@ -525,10 +637,11 @@ def test_sql_alias_instance(odps): odps.delete_resource(res_name2) except ODPSError: pass - res1 = odps.create_resource(res_name1, 'file', file_obj='1') - res2 = odps.create_resource(res_name2, 'file', file_obj='2') + res1 = odps.create_resource(res_name1, "file", file_obj="1") + res2 = odps.create_resource(res_name2, "file", file_obj="2") - test_func_content = """ + test_func_content = ( + """ from odps.udf import annotate from odps.distcache import get_cache_file @@ -539,35 +652,37 @@ def __init__(self): def evaluate(self, arg): return arg + self.n - """ % res_name1 + """ + % res_name1 + ) test_func_content = textwrap.dedent(test_func_content) - py_res_name = tn('pyodps_t_tmp_func_res') + py_res_name = tn("pyodps_t_tmp_func_res") try: - odps.delete_resource(py_res_name+'.py') + odps.delete_resource(py_res_name + ".py") except ODPSError: pass - py_res = odps.create_resource(py_res_name+'.py', 'py', file_obj=test_func_content) + py_res = odps.create_resource(py_res_name + ".py", "py", file_obj=test_func_content) - test_func_name = tn('pyodps_t_tmp_func_1') + test_func_name = tn("pyodps_t_tmp_func_1") try: odps.delete_function(test_func_name) except ODPSError: pass - func = odps.create_function(test_func_name, - class_type='{0}.Example'.format(py_res_name), - resources=[py_res_name+'.py', res_name1]) + func = odps.create_function( + test_func_name, + class_type="{0}.Example".format(py_res_name), + resources=[py_res_name + ".py", res_name1], + ) for i in range(1, 3): aliases = None if i == 2: - aliases = { - res_name1: res_name2 - } + aliases = {res_name1: res_name2} with odps.execute_sql( - 'select %s(size) from %s' % (test_func_name, test_table), - aliases=aliases).open_reader() as reader: + "select %s(size) from %s" % (test_func_name, test_table), aliases=aliases + ).open_reader() as reader: data = reader[0] assert int(data[0]) == i + 1 @@ -576,52 +691,55 @@ def evaluate(self, arg): def test_read_non_select_sql_instance(odps): - test_table = tn('pyodps_t_tmp_read_non_select_sql_instance') + test_table = tn("pyodps_t_tmp_read_non_select_sql_instance") odps.delete_table(test_table, if_exists=True) table = odps.create_table( test_table, - TableSchema.from_lists(['size'], ['bigint'], ['pt'], ['string']), + TableSchema.from_lists(["size"], ["bigint"], ["pt"], ["string"]), if_not_exists=True, ) - pt_spec = 'pt=20170410' + pt_spec = "pt=20170410" table.create_partition(pt_spec) - inst = odps.execute_sql('desc %s' % test_table) + inst = odps.execute_sql("desc %s" % test_table) + + with pytest.raises( + (Instance.DownloadSessionCreationError, errors.InstanceTypeNotSupported) + ): + inst.open_reader(tunnel=True) - pytest.raises((Instance.DownloadSessionCreationError, errors.InstanceTypeNotSupported), - lambda: inst.open_reader(tunnel=True)) reader = inst.open_reader() - assert hasattr(reader, 'raw') is True + assert hasattr(reader, "raw") is True - inst = odps.execute_sql('show partitions %s' % test_table) + inst = odps.execute_sql("show partitions %s" % test_table) reader = inst.open_reader() - assert hasattr(reader, 'raw') is True + assert hasattr(reader, "raw") is True assert utils.to_text(pt_spec) in utils.to_text(reader.raw) @pandas_case def test_instance_result_to_result_frame(odps): - test_table = tn('pyodps_t_tmp_instance_result_to_pd') + 
test_table = tn("pyodps_t_tmp_instance_result_to_pd") odps.delete_table(test_table, if_exists=True) table = odps.create_table( - test_table, TableSchema.from_lists(['size'], ['bigint']), if_not_exists=True + test_table, TableSchema.from_lists(["size"], ["bigint"]), if_not_exists=True ) odps.write_table(table, [[1], [2], [3]]) - inst = odps.execute_sql('select * from %s' % test_table) + inst = odps.execute_sql("select * from %s" % test_table) tunnel_pd = inst.open_reader(tunnel=True).to_pandas() result_pd = inst.open_reader(tunnel=False).to_pandas() assert tunnel_pd.values.tolist() == result_pd.values.tolist() def test_instance_logview(odps): - instance = odps.run_sql('drop table if exists non_exist_table_name') + instance = odps.run_sql("drop table if exists non_exist_table_name") assert isinstance(odps.get_logview_address(instance.id, 12), six.string_types) @flaky(max_runs=3) def test_instance_queueing_info(odps): - instance = odps.run_sql('select * from dual') + instance = odps.run_sql("select * from dual") queue_info, resp = instance._get_queueing_info() if json.loads(resp.content if six.PY2 else resp.text): assert queue_info.instance is instance @@ -641,10 +759,15 @@ def test_instance_queueing_info(odps): @flaky(max_runs=3) def test_instance_queueing_infos(odps): - odps.run_sql('select * from dual') + odps.run_sql("select * from dual") - infos = [info for i, info in compat.izip(itertools.count(0), odps.list_instance_queueing_infos()) - if i < 5] + infos = [ + info + for i, info in compat.izip( + itertools.count(0), odps.list_instance_queueing_infos() + ) + if i < 5 + ] if len(infos) > 0: assert isinstance(infos[0], Instance.InstanceQueueingInfo) assert infos[0].instance_id is not None @@ -654,27 +777,27 @@ def test_instance_queueing_infos(odps): @odps2_typed_case def test_sql_cost_instance(odps): - test_table = tn('pyodps_t_tmp_sql_cost_instance') + test_table = tn("pyodps_t_tmp_sql_cost_instance") odps.delete_table(test_table, if_exists=True) table = odps.create_table( - test_table, TableSchema.from_lists(['size'], ['bigint']), if_not_exists=True + test_table, TableSchema.from_lists(["size"], ["bigint"]), if_not_exists=True ) odps.write_table(table, [[1], [2], [3]]) - sql_cost = odps.execute_sql_cost('select * from %s' % test_table) + sql_cost = odps.execute_sql_cost("select * from %s" % test_table) assert isinstance(sql_cost, Instance.SQLCost) assert sql_cost.udf_num == 0 assert sql_cost.complexity == 1.0 assert sql_cost.input_size >= 100 - test_table = tn('pyodps_t_tmp_sql_cost_odps2_instance') + test_table = tn("pyodps_t_tmp_sql_cost_odps2_instance") odps.delete_table(test_table, if_exists=True) table = odps.create_table( - test_table, TableSchema.from_lists(['size'], ['tinyint']), if_not_exists=True + test_table, TableSchema.from_lists(["size"], ["tinyint"]), if_not_exists=True ) odps.write_table(table, [[1], [2], [3]]) - sql_cost = odps.execute_sql_cost('select * from %s' % test_table) + sql_cost = odps.execute_sql_cost("select * from %s" % test_table) assert isinstance(sql_cost, Instance.SQLCost) assert sql_cost.udf_num == 0 assert sql_cost.complexity == 1.0 @@ -682,10 +805,10 @@ def test_sql_cost_instance(odps): def test_instance_progress_log(odps): - test_table = tn('pyodps_t_tmp_sql_cost_instance') + test_table = tn("pyodps_t_tmp_sql_cost_instance") odps.delete_table(test_table, if_exists=True) table = odps.create_table( - test_table, TableSchema.from_lists(['size'], ['bigint']), if_not_exists=True + test_table, TableSchema.from_lists(["size"], ["bigint"]), if_not_exists=True 
) odps.write_table(table, [[1], [2], [3]]) @@ -696,8 +819,8 @@ def test_instance_progress_log(odps): options.verbose_log = logs.append options.progress_time_interval = 0.1 - inst = odps.run_sql('select * from %s where size > 0' % test_table) - inst.wait_for_success(interval=0.1) + inst = odps.run_sql("select * from %s where size > 0" % test_table) + inst.wait_for_success(interval=0.1, blocking=False) assert any("instance" in log.lower() for log in logs) assert any("_job_" in log.lower() for log in logs) finally: @@ -707,7 +830,7 @@ def test_instance_progress_log(odps): def test_sql_statement_error(odps): - statement = 'WRONG_SQL' + statement = "WRONG_SQL" try: odps.run_sql(statement) except errors.ParseError as ex: diff --git a/odps/models/tests/test_offline_models.py b/odps/models/tests/test_offline_models.py index 110eec01..3a71bb98 100644 --- a/odps/models/tests/test_offline_models.py +++ b/odps/models/tests/test_offline_models.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. + def test_offline_models(odps): assert odps.get_project().offline_models is odps.get_project().offline_models size = len(list(odps.list_offline_models())) @@ -19,5 +20,5 @@ def test_offline_models(odps): def test_instance_exists(odps): - non_exists_offline_model = 'a_non_exists_offline_model' + non_exists_offline_model = "a_non_exists_offline_model" assert odps.exist_offline_model(non_exists_offline_model) is False diff --git a/odps/models/tests/test_partitions.py b/odps/models/tests/test_partitions.py index 85023fb6..9867b955 100644 --- a/odps/models/tests/test_partitions.py +++ b/odps/models/tests/test_partitions.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,31 +19,29 @@ import mock import pytest -from ...tests.core import tn from ... import types +from ...tests.core import tn from .. 
import TableSchema from ..storage_tier import StorageTier def test_partitions(odps): - test_table_name = tn('pyodps_t_tmp_partitions_table') - partitions = ['s=%s' % i for i in range(3)] - schema = TableSchema.from_lists(['id', ], ['string', ], ['s', ], ['string', ]) + test_table_name = tn("pyodps_t_tmp_partitions_table") + partitions = ["s=%s" % i for i in range(3)] + schema = TableSchema.from_lists(["id"], ["string"], ["s"], ["string"]) odps.delete_table(test_table_name, if_exists=True) table = odps.create_table(test_table_name, schema) for partition in partitions: table.create_partition(partition) - assert ( - sorted([str(types.PartitionSpec(p)) for p in partitions]) - == sorted([str(p.partition_spec) for p in table.partitions]) + assert sorted([str(types.PartitionSpec(p)) for p in partitions]) == sorted( + [str(p.partition_spec) for p in table.partitions] ) table.get_partition(partitions[0]).drop() - assert ( - sorted([str(types.PartitionSpec(p)) for p in partitions[1:]]) - == sorted([str(p.partition_spec) for p in table.partitions]) + assert sorted([str(types.PartitionSpec(p)) for p in partitions[1:]]) == sorted( + [str(p.partition_spec) for p in table.partitions] ) p = next(table.partitions) @@ -59,51 +57,57 @@ def test_partitions(odps): def test_sub_partitions(odps): - test_table_name = tn('pyodps_t_tmp_sub_partitions_table') - root_partition = 'type=test' - sub_partitions = ['s=%s' % i for i in range(3)] - schema = TableSchema.from_lists(['id', ], ['string', ], ['type', 's'], ['string', 'string']) + test_table_name = tn("pyodps_t_tmp_sub_partitions_table") + root_partition = "type=test" + sub_partitions = ["s=%s" % i for i in range(3)] + schema = TableSchema.from_lists( + ["id"], ["string"], ["type", "s"], ["string", "string"] + ) odps.delete_table(test_table_name, if_exists=True) table = odps.create_table(test_table_name, schema) - partitions = [root_partition+','+p for p in sub_partitions] - partitions.append('type=test2,s=0') + partitions = [root_partition + "," + p for p in sub_partitions] + partitions.append("type=test2,s=0") for partition in partitions: table.create_partition(partition) - assert sorted([str(types.PartitionSpec(p)) for p in partitions]) == sorted([str(p.partition_spec) for p in table.partitions]) + assert sorted([str(types.PartitionSpec(p)) for p in partitions]) == sorted( + [str(p.partition_spec) for p in table.partitions] + ) assert len(list(table.iterate_partitions(root_partition))) == 3 - assert table.exist_partitions('type=test2') is True - assert table.exist_partitions('type=test3') is False + assert table.exist_partitions("type=test2") is True + assert table.exist_partitions("type=test3") is False table.delete_partition(partitions[0]) - assert sorted([str(types.PartitionSpec(p)) for p in partitions[1:]]) == sorted([str(p.partition_spec) for p in table.partitions]) + assert sorted([str(types.PartitionSpec(p)) for p in partitions[1:]]) == sorted( + [str(p.partition_spec) for p in table.partitions] + ) odps.delete_table(test_table_name) def test_partition(odps): - test_table_name = tn('pyodps_t_tmp_partition_table') - partition = 's=1' - schema = TableSchema.from_lists(['id', ], ['string', ], ['s', ], ['string', ]) + test_table_name = tn("pyodps_t_tmp_partition_table") + partition = "s=1" + schema = TableSchema.from_lists(["id"], ["string"], ["s"], ["string"]) odps.delete_table(test_table_name, if_exists=True) table = odps.create_table(test_table_name, schema) partition = table.create_partition(partition) - assert 
partition._getattr('_is_extend_info_loaded') is False - assert partition._getattr('_loaded') is False + assert partition._getattr("_is_extend_info_loaded") is False + assert partition._getattr("_loaded") is False - assert partition._getattr('creation_time') is None - assert partition._getattr('last_meta_modified_time') is None - assert partition._getattr('last_data_modified_time') is None - assert partition._getattr('size') is None - assert partition._getattr('is_archived') is None - assert partition._getattr('is_exstore') is None - assert partition._getattr('lifecycle') is None - assert partition._getattr('physical_size') is None - assert partition._getattr('file_num') is None + assert partition._getattr("creation_time") is None + assert partition._getattr("last_meta_modified_time") is None + assert partition._getattr("last_data_modified_time") is None + assert partition._getattr("size") is None + assert partition._getattr("is_archived") is None + assert partition._getattr("is_exstore") is None + assert partition._getattr("lifecycle") is None + assert partition._getattr("physical_size") is None + assert partition._getattr("file_num") is None assert isinstance(partition.is_archived, bool) assert isinstance(partition.is_exstore, bool) @@ -121,9 +125,9 @@ def test_partition(odps): assert partition.is_loaded is True assert table.exist_partition(partition) is True - assert table.exist_partition('s=a_non_exist_partition') is False + assert table.exist_partition("s=a_non_exist_partition") is False - row_contents = ['index', '1'] + row_contents = ["index", "1"] with partition.open_writer() as writer: writer.write([row_contents]) with partition.open_reader() as reader: @@ -138,7 +142,7 @@ def test_iter_partition_condition(odps): from ...types import PartitionSpec from ..partitions import PartitionSpecCondition - test_table_name = tn('pyodps_t_tmp_cond_partition_table') + test_table_name = tn("pyodps_t_tmp_cond_partition_table") odps.delete_table(test_table_name, if_exists=True) tb = odps.create_table(test_table_name, ("col string", "pt1 string, pt2 string")) @@ -161,7 +165,9 @@ def new_init(self, *args, **kwargs): orig_init(self, *args, **kwargs) part_prefix[0] = self.partition_spec - with mock.patch("odps.models.partitions.PartitionSpecCondition.__init__", new=new_init): + with mock.patch( + "odps.models.partitions.PartitionSpecCondition.__init__", new=new_init + ): # filter with predicates parts = list(tb.iterate_partitions("pt1=1")) assert part_prefix[0] == PartitionSpec("pt1=1") @@ -186,13 +192,11 @@ def new_init(self, *args, **kwargs): def test_tiered_partition(odps_with_storage_tier): odps = odps_with_storage_tier - test_table_name = tn('pyodps_t_tmp_parted_tiered') + test_table_name = tn("pyodps_t_tmp_parted_tiered") odps.delete_table(test_table_name, if_exists=True) assert odps.exist_table(test_table_name) is False - table = odps.create_table( - test_table_name, ("col string", "pt string"), lifecycle=1 - ) + table = odps.create_table(test_table_name, ("col string", "pt string"), lifecycle=1) part = table.create_partition("pt=20230711") part.set_storage_tier("standard") assert part.storage_tier_info.storage_tier == StorageTier.STANDARD diff --git a/odps/models/tests/test_projects.py b/odps/models/tests/test_projects.py index 4a9aa037..72acab77 100644 --- a/odps/models/tests/test_projects.py +++ b/odps/models/tests/test_projects.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,11 +17,11 @@ import pytest from ...compat import six -from .. import Projects, Project +from .. import Project, Projects def test_projects_exists(odps): - not_exists_project_name = 'a_not_exists_project' + not_exists_project_name = "a_not_exists_project" assert odps.exist_project(not_exists_project_name) is False assert odps.exist_project(odps.project) is True diff --git a/odps/models/tests/test_quotas.py b/odps/models/tests/test_quotas.py new file mode 100644 index 00000000..ea14d190 --- /dev/null +++ b/odps/models/tests/test_quotas.py @@ -0,0 +1,33 @@ +# Copyright 1999-2024 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + + +def test_quotas(odps_daily): + odps = odps_daily + + with pytest.raises(TypeError): + odps.get_quota() + + first_quota_nick = next(odps.list_quotas()) + assert first_quota_nick.nickname is not None + + assert not odps.exist_quota("non_exist_quota") + assert odps.exist_quota(first_quota_nick) + assert odps.exist_quota(first_quota_nick.nickname) + + quota_obj = odps.get_quota(first_quota_nick.nickname) + quota_obj.reload() + assert quota_obj.nickname == first_quota_nick.nickname diff --git a/odps/models/tests/test_resources.py b/odps/models/tests/test_resources.py index d526b502..d0e31912 100644 --- a/odps/models/tests/test_resources.py +++ b/odps/models/tests/test_resources.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,19 +19,28 @@ import pytest from ... import compat, errors, options, types -from ...compat import futures, six, ConfigParser, UnsupportedOperation +from ...compat import ConfigParser, UnsupportedOperation, futures, six from ...tests.core import tn from ...utils import to_text -from .. import Resource, FileResource, TableResource, VolumeArchiveResource, \ - VolumeFileResource, TableSchema - -FILE_CONTENT = to_text(""" +from .. 
import ( + FileResource, + Resource, + TableResource, + TableSchema, + VolumeArchiveResource, + VolumeFileResource, +) + +FILE_CONTENT = to_text( + """ Proudly swept the rain by the cliffs As it glided through the trees Still following ever the bud The ahihi lehua of the vale -""") -OVERWRITE_FILE_CONTENT = to_text(""" +""" +) +OVERWRITE_FILE_CONTENT = to_text( + """ Farewell to thee, farewell to thee The charming one who dwells in the shaded bowers One fond embrace, @@ -42,7 +51,8 @@ Of the past Dearest one, yes, you are mine own From you, true love shall never depart -""") +""" +) @pytest.fixture(autouse=True) @@ -64,37 +74,36 @@ def test_resources(odps): break assert isinstance(resource, Resource._get_cls(resource.type)) - pytest.raises(TypeError, lambda: odps.create_resource( - 'test_error', 'py', resource=['uvw'] - ) + pytest.raises( + TypeError, lambda: odps.create_resource("test_error", "py", resource=["uvw"]) ) def test_resource_exists(odps): - non_exists_resource = 'a_non_exists_resource' + non_exists_resource = "a_non_exists_resource" assert odps.exist_resource(non_exists_resource) is False def test_table_resource(config, odps): try: - secondary_project = config.get('test', 'secondary_project') + secondary_project = config.get("test", "secondary_project") except ConfigParser.NoOptionError: secondary_project = None - test_table_name = tn('pyodps_t_tmp_resource_table') - schema = TableSchema.from_lists(['id', 'name'], ['string', 'string']) + test_table_name = tn("pyodps_t_tmp_resource_table") + schema = TableSchema.from_lists(["id", "name"], ["string", "string"]) odps.delete_table(test_table_name, if_exists=True) odps.create_table(test_table_name, schema) if secondary_project: odps.delete_table(test_table_name, if_exists=True, project=secondary_project) odps.create_table(test_table_name, schema, project=secondary_project) - resource_name = tn('pyodps_t_tmp_table_resource') + resource_name = tn("pyodps_t_tmp_table_resource") try: odps.delete_resource(resource_name) except errors.NoSuchObject: pass - res = odps.create_resource(resource_name, 'table', table_name=test_table_name) + res = odps.create_resource(resource_name, "table", table_name=test_table_name) assert isinstance(res, TableResource) assert res.get_source_table().name == test_table_name assert res.table.name == test_table_name @@ -115,8 +124,10 @@ def test_table_resource(config, odps): assert res.get_source_table().name == test_table_name assert res.get_source_table_partition() is None - test_table_partition = 'pt=test,sec=1' - schema = TableSchema.from_lists(['id', 'name'], ['string', 'string'], ['pt', 'sec'], ['string', 'bigint']) + test_table_partition = "pt=test,sec=1" + schema = TableSchema.from_lists( + ["id", "name"], ["string", "string"], ["pt", "sec"], ["string", "bigint"] + ) odps.delete_table(test_table_name, if_exists=True) table = odps.create_table(test_table_name, schema) table.create_partition(test_table_partition) @@ -125,19 +136,23 @@ def test_table_resource(config, odps): assert isinstance(res, TableResource) assert res.get_source_table().name == test_table_name assert res.table.name == test_table_name - assert str(res.get_source_table_partition()) == str(types.PartitionSpec(test_table_partition)) + assert str(res.get_source_table_partition()) == str( + types.PartitionSpec(test_table_partition) + ) assert str(res.partition.spec) == str(types.PartitionSpec(test_table_partition)) assert res is odps.get_resource(resource_name) - test_table_partition = 'pt=test,sec=2' + test_table_partition = "pt=test,sec=2" 
table.create_partition(test_table_partition) res = res.update(partition=test_table_partition) assert isinstance(res, TableResource) assert res.get_source_table().name == test_table_name - assert str(res.get_source_table_partition()) == str(types.PartitionSpec(test_table_partition)) + assert str(res.get_source_table_partition()) == str( + types.PartitionSpec(test_table_partition) + ) assert res is odps.get_resource(resource_name) - test_table_partition = types.PartitionSpec('pt=test,sec=3') + test_table_partition = types.PartitionSpec("pt=test,sec=3") table.create_partition(test_table_partition) res = res.update(partition=test_table_partition) assert isinstance(res, TableResource) @@ -152,13 +167,17 @@ def test_table_resource(config, odps): assert rec[1] == FILE_CONTENT if secondary_project: - resource_name2 = tn('pyodps_t_tmp_table_resource2') + resource_name2 = tn("pyodps_t_tmp_table_resource2") try: odps.delete_resource(resource_name2) except errors.NoSuchObject: pass - res = odps.create_resource(resource_name2, 'table', project_name=secondary_project, - table_name=test_table_name) + res = odps.create_resource( + resource_name2, + "table", + project_name=secondary_project, + table_name=test_table_name, + ) assert isinstance(res, TableResource) assert res.get_source_table().project.name == secondary_project assert res.get_source_table().name == test_table_name @@ -176,14 +195,16 @@ def test_table_resource(config, odps): assert res.get_source_table().name == test_table_name assert res.get_source_table_partition() is None - test_table_partition = 'pt=test,sec=1' + test_table_partition = "pt=test,sec=1" res = res.update(project_name=odps.project, partition=test_table_partition) assert isinstance(res, TableResource) assert res.get_source_table().project.name == odps.project assert res.get_source_table().name == test_table_name assert str(res.partition.spec) == str(types.PartitionSpec(test_table_partition)) - res = res.update(table_name=secondary_project + '.' + test_table_name, partition=None) + res = res.update( + table_name=secondary_project + "." 
+ test_table_name, partition=None + ) assert isinstance(res, TableResource) assert res.get_source_table().project.name == secondary_project assert res.get_source_table().name == test_table_name @@ -195,16 +216,17 @@ def test_table_resource(config, odps): odps.delete_table(test_table_name, project=secondary_project) -def test_temp_file_resource(odps_daily): - odps = odps_daily - resource_name = tn('pyodps_t_tmp_file_resource') +def test_temp_file_resource(odps): + resource_name = tn("pyodps_t_tmp_file_resource") try: odps.delete_resource(resource_name) except errors.ODPSError: pass - resource = odps.create_resource(resource_name, 'file', fileobj=FILE_CONTENT, temp=True) + resource = odps.create_resource( + resource_name, "file", fileobj=FILE_CONTENT, temp=True + ) assert isinstance(resource, FileResource) assert resource.is_temp_resource resource.reload() @@ -213,11 +235,10 @@ def test_temp_file_resource(odps_daily): odps.delete_resource(resource_name) -def test_stream_file_resource(odps_daily): - odps = odps_daily +def test_stream_file_resource(odps): options.resource_chunk_size = 1024 content = OVERWRITE_FILE_CONTENT * 32 - resource_name = tn('pyodps_t_tmp_file_resource') + resource_name = tn("pyodps_t_tmp_file_resource") del_pool = futures.ThreadPoolExecutor(10) res_to_del = [resource_name] @@ -242,7 +263,7 @@ def test_stream_file_resource(odps_daily): with odps.open_resource(resource_name, mode="w", stream=True) as res: pytest.raises(UnsupportedOperation, lambda: res.seek(0, os.SEEK_END)) for offset in range(0, len(content), 1023): - res.write(content[offset:offset + 1023]) + res.write(content[offset : offset + 1023]) assert res.tell() == min(offset + 1023, len(content)) pytest.raises(UnsupportedOperation, lambda: res.truncate(1024)) @@ -263,7 +284,7 @@ def test_stream_file_resource(odps_daily): with odps.open_resource(resource_name, mode="w", stream=True, temp=True) as res: lines = content.splitlines(True) for offset in range(0, len(lines), 50): - res.writelines(lines[offset:offset + 50]) + res.writelines(lines[offset : offset + 50]) with odps.open_resource(resource_name, mode="r", stream=True) as res: lines = res.readlines() @@ -272,22 +293,21 @@ def test_stream_file_resource(odps_daily): assert "".join(lines) == content -def test_file_resource(odps_daily): - odps = odps_daily - resource_name = tn('pyodps_t_tmp_file_resource') +def test_file_resource(odps): + resource_name = tn("pyodps_t_tmp_file_resource") try: odps.delete_resource(resource_name) except errors.ODPSError: pass - resource = odps.create_resource(resource_name, 'file', fileobj=FILE_CONTENT) + resource = odps.create_resource(resource_name, "file", fileobj=FILE_CONTENT) assert isinstance(resource, FileResource) resource.drop() # create resource with open_resource and write with odps.open_resource( - resource_name, mode='w', type='file', comment="comment_data", temp=True + resource_name, mode="w", type="file", comment="comment_data", temp=True ) as resource: resource.write(FILE_CONTENT) resource.reload() @@ -296,8 +316,11 @@ def test_file_resource(odps_daily): # create resource with full resource path with odps.open_resource( - odps.project + "/resources/" + resource_name, mode='w', type='file', - comment="comment_data", temp=True, + odps.project + "/resources/" + resource_name, + mode="w", + type="file", + comment="comment_data", + temp=True, ) as resource: resource.write(FILE_CONTENT) resource.reload() @@ -306,9 +329,9 @@ def test_file_resource(odps_daily): resource.reload() assert resource.comment == "comment_data" - 
with resource.open(mode='r') as fp: - pytest.raises(IOError, lambda: fp.write('sss')) - pytest.raises(IOError, lambda: fp.writelines(['sss\n'])) + with resource.open(mode="r") as fp: + pytest.raises(IOError, lambda: fp.write("sss")) + pytest.raises(IOError, lambda: fp.writelines(["sss\n"])) assert isinstance(fp.read(), six.text_type) @@ -322,13 +345,13 @@ def test_file_resource(odps_daily): assert to_text(fp.read()) == to_text(FILE_CONTENT[1:]) fp.seek(0) - assert to_text(fp.readline()) == to_text(FILE_CONTENT.split('\n', 1)[0] + '\n') + assert to_text(fp.readline()) == to_text(FILE_CONTENT.split("\n", 1)[0] + "\n") fp.seek(0) - add_newline = lambda s: s if s.endswith('\n') else s+'\n' - assert [ - to_text(add_newline(line)) for line in fp - ] == [to_text(add_newline(line)) for line in FILE_CONTENT.splitlines()] + add_newline = lambda s: s if s.endswith("\n") else s + "\n" + assert [to_text(add_newline(line)) for line in fp] == [ + to_text(add_newline(line)) for line in FILE_CONTENT.splitlines() + ] assert fp._fp._need_commit is False assert fp.opened is True @@ -336,7 +359,7 @@ def test_file_resource(odps_daily): assert fp.opened is False assert fp._fp is None - with resource.open(mode='w') as fp: + with resource.open(mode="w") as fp: pytest.raises(IOError, fp.read) pytest.raises(IOError, fp.readline) pytest.raises(IOError, fp.readlines) @@ -347,7 +370,7 @@ def test_file_resource(odps_daily): size = fp._size - with resource.open(mode='r+') as fp: + with resource.open(mode="r+") as fp: assert to_text(fp.read()) == to_text(OVERWRITE_FILE_CONTENT * 2) assert size == fp._size @@ -358,7 +381,7 @@ def test_file_resource(odps_daily): assert fp._fp._need_commit is True - with resource.open(mode='a') as fp: + with resource.open(mode="a") as fp: pytest.raises(IOError, fp.read) pytest.raises(IOError, fp.readline) pytest.raises(IOError, fp.readlines) @@ -367,7 +390,7 @@ def test_file_resource(odps_daily): assert fp._fp._need_commit is True - with resource.open(mode='a+') as fp: + with resource.open(mode="a+") as fp: assert to_text(fp.read()) == to_text(FILE_CONTENT + OVERWRITE_FILE_CONTENT) fp.seek(1) fp.truncate() @@ -375,31 +398,31 @@ def test_file_resource(odps_daily): # redundant closing should work as well fp.close() - fp = resource.open(mode='r') + fp = resource.open(mode="r") assert to_text(fp.read()) == FILE_CONTENT[0] fp.close() - with resource.open(mode='w+') as fp: + with resource.open(mode="w+") as fp: assert len(fp.read()) == 0 fp.write(FILE_CONTENT) - with resource.open(mode='r+') as fp: + with resource.open(mode="r+") as fp: assert to_text(fp.read()) == FILE_CONTENT - resource.update(file_obj='update') - with resource.open(mode='rb') as fp: + resource.update(file_obj="update") + with resource.open(mode="rb") as fp: assert isinstance(fp.read(), six.binary_type) fp.seek(0) - assert to_text(fp.read()) == to_text('update') + assert to_text(fp.read()) == to_text("update") odps.delete_resource(resource_name) def test_volume_archive_resource(odps): - volume_name = tn('pyodps_t_tmp_resource_archive_volume') - resource_name = tn('pyodps_t_tmp_volume_archive_resource') + '.zip' - partition_name = 'test_partition' - file_name = 'test_file.zip' + volume_name = tn("pyodps_t_tmp_resource_archive_volume") + resource_name = tn("pyodps_t_tmp_volume_archive_resource") + ".zip" + partition_name = "test_partition" + file_name = "test_file.zip" try: odps.delete_volume(volume_name) except errors.ODPSError: @@ -410,17 +433,19 @@ def test_volume_archive_resource(odps): pass file_io = six.BytesIO() - zfile = 
zipfile.ZipFile(file_io, 'a', zipfile.ZIP_DEFLATED, False) - zfile.writestr('file1.txt', FILE_CONTENT) - zfile.writestr('file2.txt', OVERWRITE_FILE_CONTENT) + zfile = zipfile.ZipFile(file_io, "a", zipfile.ZIP_DEFLATED, False) + zfile.writestr("file1.txt", FILE_CONTENT) + zfile.writestr("file2.txt", OVERWRITE_FILE_CONTENT) zfile.close() odps.create_parted_volume(volume_name) with odps.open_volume_writer(volume_name, partition_name) as writer: writer.write(file_name, file_io.getvalue()) - volume_file = odps.get_volume_partition(volume_name, partition_name).files[file_name] - odps.create_resource(resource_name, 'volumearchive', volume_file=volume_file) + volume_file = odps.get_volume_partition(volume_name, partition_name).files[ + file_name + ] + odps.create_resource(resource_name, "volumearchive", volume_file=volume_file) res = odps.get_resource(resource_name) assert isinstance(res, VolumeArchiveResource) assert res.type == Resource.Type.VOLUMEARCHIVE @@ -429,10 +454,10 @@ def test_volume_archive_resource(odps): def test_volume_file_resource(odps): - volume_name = tn('pyodps_t_tmp_resource_file_volume') - resource_name = tn('pyodps_t_tmp_volume_file_resource') - partition_name = 'test_partition' - file_name = 'test_file.txt' + volume_name = tn("pyodps_t_tmp_resource_file_volume") + resource_name = tn("pyodps_t_tmp_volume_file_resource") + partition_name = "test_partition" + file_name = "test_file.txt" try: odps.delete_volume(volume_name) except errors.ODPSError: @@ -446,8 +471,10 @@ def test_volume_file_resource(odps): with odps.open_volume_writer(volume_name, partition_name) as writer: writer.write(file_name, FILE_CONTENT) - volume_file = odps.get_volume_partition(volume_name, partition_name).files[file_name] - odps.create_resource(resource_name, 'volumefile', volume_file=volume_file) + volume_file = odps.get_volume_partition(volume_name, partition_name).files[ + file_name + ] + odps.create_resource(resource_name, "volumefile", volume_file=volume_file) res = odps.get_resource(resource_name) assert isinstance(res, VolumeFileResource) assert res.type == Resource.Type.VOLUMEFILE diff --git a/odps/models/tests/test_schemas.py b/odps/models/tests/test_schemas.py index d61ade44..bea2a7c3 100644 --- a/odps/models/tests/test_schemas.py +++ b/odps/models/tests/test_schemas.py @@ -43,7 +43,7 @@ def setup_module_schema(odps_with_schema): _project_has_schema_api.pop( (odps_with_schema.endpoint, odps_with_schema.project), None ) - options.always_enable_schema = False + options.enable_schema = False for cls_schema_names in (TEST_CLS_SCHEMA_NAME, TEST_CLS_SCHEMA_NAME2): if odps_with_schema.exist_schema(cls_schema_names): @@ -68,7 +68,7 @@ def reset_schema_config(odps_with_schema): _project_has_schema_api.pop( (odps_with_schema.endpoint, odps_with_schema.project), None ) - options.always_enable_schema = False + options.enable_schema = False def _assert_schema_deleted(odps, schema_name): @@ -154,7 +154,9 @@ def test_default_schema(odps_with_schema): assert schema.project.name == odps_with_schema.project assert schema.name == TEST_CLS_SCHEMA_NAME - res = new_odps.create_resource(TEST_RESOURCE_NAME, "file", fileobj=BytesIO(b"content")) + res = new_odps.create_resource( + TEST_RESOURCE_NAME, "file", fileobj=BytesIO(b"content") + ) assert new_odps.exist_resource(TEST_RESOURCE_NAME) assert res.schema.name == TEST_CLS_SCHEMA_NAME @@ -174,7 +176,7 @@ def test_table_with_schema(odps_with_schema, schema_name): odps = odps_with_schema if schema_name is None: - options.always_enable_schema = True + 
options.enable_schema = True default_schema_name = "default" if odps.is_schema_namespace_enabled() else None @@ -188,9 +190,7 @@ def test_table_with_schema(odps_with_schema, schema_name): ) assert table.get_schema().name == schema_name or default_schema_name - tables = list( - odps.list_tables(prefix=test_table_name, schema=schema_name) - ) + tables = list(odps.list_tables(prefix=test_table_name, schema=schema_name)) assert len(tables) >= 1 assert tables[0].name == test_table_name assert tables[0].get_schema().name == schema_name or default_schema_name @@ -214,7 +214,9 @@ def test_table_with_schema(odps_with_schema, schema_name): arrow_array = pa.array(["abc", "def"]) writer.write(pa.record_batch([arrow_array], names=["col1"])) - with table.open_reader(reopen=True, partition=test_partition, arrow=True) as reader: + with table.open_reader( + reopen=True, partition=test_partition, arrow=True + ) as reader: arrow_table = reader.read_all() assert arrow_table.num_rows == 2 @@ -233,10 +235,8 @@ def test_get_table_with_schema_opt(odps_with_schema): test_table_name = tn("pyodps_test_table_with_schema2") try: - options.always_enable_schema = True - odps.delete_table( - test_table_name, schema=TEST_CLS_SCHEMA_NAME, if_exists=True - ) + options.enable_schema = True + odps.delete_table(test_table_name, schema=TEST_CLS_SCHEMA_NAME, if_exists=True) odps.create_table( test_table_name, "col1 string", schema=TEST_CLS_SCHEMA_NAME, lifecycle=1 ) @@ -247,7 +247,7 @@ def test_get_table_with_schema_opt(odps_with_schema): tb.drop() finally: - options.always_enable_schema = False + options.enable_schema = False def test_table_tenant_config(odps_with_schema): @@ -277,16 +277,23 @@ def test_file_resource_with_schema(odps_with_schema): test_file_res_name = tn("pyodps_test_file_resource") try: - odps_with_schema.delete_resource(test_file_res_name, schema=TEST_CLS_SCHEMA_NAME) + odps_with_schema.delete_resource( + test_file_res_name, schema=TEST_CLS_SCHEMA_NAME + ) except NoSuchObject: pass res = odps_with_schema.create_resource( - test_file_res_name, "file", fileobj=BytesIO(b"content"), schema=TEST_CLS_SCHEMA_NAME + test_file_res_name, + "file", + fileobj=BytesIO(b"content"), + schema=TEST_CLS_SCHEMA_NAME, ) assert res.schema.name == TEST_CLS_SCHEMA_NAME - assert odps_with_schema.exist_resource(test_file_res_name, schema=TEST_CLS_SCHEMA_NAME) + assert odps_with_schema.exist_resource( + test_file_res_name, schema=TEST_CLS_SCHEMA_NAME + ) resources = list(odps_with_schema.list_resources(schema=TEST_CLS_SCHEMA_NAME)) assert 1 == len(resources) @@ -307,7 +314,9 @@ def test_table_resource_with_schema(odps_with_schema): test_res_table_name = tn("pyodps_test_resource_table") try: - odps_with_schema.delete_resource(test_table_res_name, schema=TEST_CLS_SCHEMA_NAME) + odps_with_schema.delete_resource( + test_table_res_name, schema=TEST_CLS_SCHEMA_NAME + ) except NoSuchObject: pass @@ -362,7 +371,7 @@ def test_function_with_resource(odps_with_schema): test_func_name, class_type=test_func_res_name + ".MyPlus", resources=[res], - schema=TEST_CLS_SCHEMA_NAME + schema=TEST_CLS_SCHEMA_NAME, ) assert func.schema.name == TEST_CLS_SCHEMA_NAME @@ -376,5 +385,7 @@ def test_function_with_resource(odps_with_schema): assert funcs[0].resources[0].schema.name == TEST_CLS_SCHEMA_NAME2 odps.delete_function(test_func_name, schema=TEST_CLS_SCHEMA_NAME) - assert not odps_with_schema.exist_function(test_func_name, schema=TEST_CLS_SCHEMA_NAME) + assert not odps_with_schema.exist_function( + test_func_name, schema=TEST_CLS_SCHEMA_NAME + ) 
odps.delete_resource(test_func_res_file, schema=TEST_CLS_SCHEMA_NAME2) diff --git a/odps/models/tests/test_security.py b/odps/models/tests/test_security.py index 0d2d702c..1975329a 100644 --- a/odps/models/tests/test_security.py +++ b/odps/models/tests/test_security.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,9 +19,9 @@ from ...config import options from ...errors import NoSuchObject, ODPSError, SecurityQueryError -from ...tests.core import tn, global_locked +from ...tests.core import global_locked, tn -TEST_ROLE_NAME = tn('test_role_name') +TEST_ROLE_NAME = tn("test_role_name") TEST_PROJECT_POLICY_STRING = """ { @@ -122,7 +122,9 @@ def test_project_methods(project): assert cur_user.id is not None and cur_user.display_name is not None old_policy = project.policy - policy_json = json.loads(TEST_PROJECT_POLICY_STRING.replace('#project#', project.name)) + policy_json = json.loads( + TEST_PROJECT_POLICY_STRING.replace("#project#", project.name) + ) project.policy = policy_json project.reload() assert json.dumps(project.policy) == json.dumps(policy_json) @@ -152,9 +154,9 @@ def test_roles(odps, project): assert TEST_ROLE_NAME in [r.name for r in project.roles] assert TEST_ROLE_NAME in project.roles assert role in project.roles - assert 'non_exist_role_name' not in project.roles + assert "non_exist_role_name" not in project.roles - policy_json = json.loads(TEST_ROLE_POLICY_STRING.replace('#project#', project.name)) + policy_json = json.loads(TEST_ROLE_POLICY_STRING.replace("#project#", project.name)) role.policy = policy_json role.reload() assert json.dumps(role.policy) == json.dumps(policy_json) @@ -163,9 +165,9 @@ def test_roles(odps, project): assert TEST_ROLE_NAME not in project.roles -@global_locked('odps_project_user') +@global_locked("odps_project_user") def test_users(config, project): - secondary_user = config.get('test', 'secondary_user') + secondary_user = config.get("test", "secondary_user") if not secondary_user: return @@ -174,15 +176,15 @@ def test_users(config, project): project.users.create(secondary_user) assert secondary_user in project.users assert secondary_user in [user.display_name for user in project.users] - assert 'non_exist_user' not in project.users + assert "non_exist_user" not in project.users project.users.delete(secondary_user) assert secondary_user not in project.users -@global_locked('odps_project_user') +@global_locked("odps_project_user") def test_user_role(config, project): - secondary_user = config.get('test', 'secondary_user') + secondary_user = config.get("test", "secondary_user") if not secondary_user: return @@ -211,11 +213,11 @@ def test_user_role(config, project): def test_security_query(odps, project): - assert 'ALIYUN' in odps.run_security_query('LIST ACCOUNTPROVIDERS') - assert 'ALIYUN' in odps.execute_security_query('LIST ACCOUNTPROVIDERS') + assert "ALIYUN" in odps.run_security_query("LIST ACCOUNTPROVIDERS") + assert "ALIYUN" in odps.execute_security_query("LIST ACCOUNTPROVIDERS") inst = odps.run_security_query( - 'INSTALL PACKAGE %s.non_exist_package' % project.name + "INSTALL PACKAGE %s.non_exist_package" % project.name ) assert isinstance(inst, project.AuthQueryInstance) @@ -226,7 +228,7 @@ def test_security_query(odps, project): with pytest.raises(SecurityQueryError): odps.execute_security_query( - 'INSTALL PACKAGE 
%s.non_exist_package' % project.name + "INSTALL PACKAGE %s.non_exist_package" % project.name ) @@ -237,12 +239,8 @@ def test_generate_auth_token(odps, project): policy = { "Version": "1", "Statement": [ - { - "Action": ["odps:*"], - "Resource": "acs:odps:*:*", - "Effect": "Allow" - } - ] + {"Action": ["odps:*"], "Resource": "acs:odps:*:*", "Effect": "Allow"} + ], } token = project.generate_auth_token(policy, "bearer", 5) diff --git a/odps/models/tests/test_session.py b/odps/models/tests/test_session.py index 0713af01..8c1ce979 100644 --- a/odps/models/tests/test_session.py +++ b/odps/models/tests/test_session.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -23,20 +23,22 @@ import mock import pytest + try: import pandas as pd except ImportError: pd = None try: - from filelock import FileLock, Timeout as FLTimeout + from filelock import FileLock + from filelock import Timeout as FLTimeout except ImportError: FileLock = None -from ... import errors, ODPS -from ...errors import ODPSError, InvalidStateSetting +from ... import ODPS, errors +from ...errors import InvalidStateSetting, ODPSError from ...tests.core import tn -from .. import Instance, TableSchema, Record -from ..session import FallbackPolicy, FallbackMode +from .. import Instance, Record, TableSchema +from ..session import FallbackMode, FallbackPolicy logger = logging.getLogger(__name__) is_windows = sys.platform.lower().startswith("win") @@ -45,8 +47,8 @@ TEST_SESSION_WORKER_MEMORY = 512 TEST_TABLE_NAME = tn("_pyodps__session_test_table") -TEST_CREATE_SCHEMA = TableSchema.from_lists(['id'], ['string']) -TEST_DATA = [['1'], ['2'], ['3'], ['4'], ['5']] +TEST_CREATE_SCHEMA = TableSchema.from_lists(["id"], ["string"]) +TEST_DATA = [["1"], ["2"], ["3"], ["4"], ["5"]] TEST_SELECT_STRING = "select * from %s" % TEST_TABLE_NAME @@ -117,7 +119,9 @@ def _wait_session_startup(session_instance): def test_create_mcqa_session(odps): - sess_instance = odps._create_mcqa_session(TEST_SESSION_WORKERS, TEST_SESSION_WORKER_MEMORY) + sess_instance = odps._create_mcqa_session( + TEST_SESSION_WORKERS, TEST_SESSION_WORKER_MEMORY + ) assert sess_instance with mock.patch( "odps.models.instance.Instance.is_running", new=lambda *_, **__: False @@ -129,7 +133,9 @@ def test_create_mcqa_session(odps): def test_attach_mcqa_session(odps): - sess_instance = odps._create_mcqa_session(TEST_SESSION_WORKERS, TEST_SESSION_WORKER_MEMORY) + sess_instance = odps._create_mcqa_session( + TEST_SESSION_WORKERS, TEST_SESSION_WORKER_MEMORY + ) assert sess_instance # wait to running _wait_session_startup(sess_instance) @@ -149,7 +155,9 @@ def test_attach_default_session(odps): def test_session_failing_sql(odps): odps.delete_table(TEST_TABLE_NAME, if_exists=True) - sess_instance = odps._create_mcqa_session(TEST_SESSION_WORKERS, TEST_SESSION_WORKER_MEMORY) + sess_instance = odps._create_mcqa_session( + TEST_SESSION_WORKERS, TEST_SESSION_WORKER_MEMORY + ) assert sess_instance # wait to running _wait_session_startup(sess_instance) @@ -164,12 +172,16 @@ def test_session_failing_sql(odps): def test_direct_execute_failing_sql(odps): odps.delete_table(TEST_TABLE_NAME, if_exists=True) # the default public session may not exist, so we create one beforehand - sess_instance = odps._create_mcqa_session(TEST_SESSION_WORKERS, 
TEST_SESSION_WORKER_MEMORY) + sess_instance = odps._create_mcqa_session( + TEST_SESSION_WORKERS, TEST_SESSION_WORKER_MEMORY + ) assert sess_instance # wait to running _wait_session_startup(sess_instance) - select_inst = odps.run_sql_interactive(TEST_SELECT_STRING, service_name=sess_instance.name) + select_inst = odps.run_sql_interactive( + TEST_SELECT_STRING, service_name=sess_instance.name + ) select_inst.wait_for_completion() # should return normally even the task is failed with pytest.raises(ODPSError): @@ -181,7 +193,9 @@ def test_session_sql(odps): odps.delete_table(TEST_TABLE_NAME, if_exists=True) table = odps.create_table(TEST_TABLE_NAME, TEST_CREATE_SCHEMA) assert table - sess_instance = odps._create_mcqa_session(TEST_SESSION_WORKERS, TEST_SESSION_WORKER_MEMORY) + sess_instance = odps._create_mcqa_session( + TEST_SESSION_WORKERS, TEST_SESSION_WORKER_MEMORY + ) assert sess_instance # wait to running _wait_session_startup(sess_instance) @@ -197,9 +211,19 @@ def test_session_sql(odps): rows.append(each_row.values) if pd is not None: - with _dump_instance_results(select_inst), select_inst.open_reader(tunnel=True) as rd: + with _dump_instance_results(select_inst), select_inst.open_reader( + tunnel=True + ) as rd: pd_result = rd.to_pandas() - pd.testing.assert_frame_equal(pd_result, pd.DataFrame(TEST_DATA, columns=["id"])) + pd.testing.assert_frame_equal( + pd_result, pd.DataFrame(TEST_DATA, columns=["id"]) + ) + + with _dump_instance_results(select_inst): + pd_result = select_inst.to_pandas() + pd.testing.assert_frame_equal( + pd_result, pd.DataFrame(TEST_DATA, columns=["id"]) + ) assert len(rows) == len(TEST_DATA) assert len(rows[0]) == len(TEST_DATA[0]) @@ -214,15 +238,25 @@ def test_direct_execute_sql(odps): table = odps.create_table(TEST_TABLE_NAME, TEST_CREATE_SCHEMA) assert table # the default public session may not exist, so we create one beforehand - sess_instance = odps._create_mcqa_session(TEST_SESSION_WORKERS, TEST_SESSION_WORKER_MEMORY) + sess_instance = odps._create_mcqa_session( + TEST_SESSION_WORKERS, TEST_SESSION_WORKER_MEMORY + ) assert sess_instance # wait to running _wait_session_startup(sess_instance) records = [Record(schema=TEST_CREATE_SCHEMA, values=values) for values in TEST_DATA] odps.write_table(table, 0, records) - select_inst = odps.run_sql_interactive(TEST_SELECT_STRING, service_name=sess_instance.name) - select_inst.wait_for_success() + select_inst = odps.run_sql_interactive( + TEST_SELECT_STRING, service_name=sess_instance.name + ) + + assert "subQuery" in select_inst.get_logview_address() + assert select_inst._get_queueing_info() + time.sleep(3) + assert select_inst.get_sql_query().rstrip(";") == TEST_SELECT_STRING.rstrip(";") + assert select_inst.get_task_detail2(select_inst._session_task_name) + rows = [] with _dump_instance_results(select_inst), select_inst.open_reader() as rd: @@ -242,7 +276,9 @@ def test_direct_execute_sql_fallback(odps): table = odps.create_table(TEST_TABLE_NAME, TEST_CREATE_SCHEMA) assert table # the default public session may not exist, so we create one beforehand - sess_instance = odps._create_mcqa_session(TEST_SESSION_WORKERS, TEST_SESSION_WORKER_MEMORY) + sess_instance = odps._create_mcqa_session( + TEST_SESSION_WORKERS, TEST_SESSION_WORKER_MEMORY + ) assert sess_instance # wait to running _wait_session_startup(sess_instance) @@ -253,15 +289,24 @@ def test_direct_execute_sql_fallback(odps): with pytest.raises(ODPSError): odps.execute_sql_interactive( - TEST_SELECT_STRING, service_name=sess_instance.name, hints=hints, 
fallback=False + TEST_SELECT_STRING, + service_name=sess_instance.name, + hints=hints, + fallback=False, ) with pytest.raises(ODPSError): odps.execute_sql_interactive( - TEST_SELECT_STRING, service_name=sess_instance.name, hints=hints, fallback="noresource" + TEST_SELECT_STRING, + service_name=sess_instance.name, + hints=hints, + fallback="noresource", ) with pytest.raises(ODPSError): odps.execute_sql_interactive( - TEST_SELECT_STRING, service_name=sess_instance.name, hints=hints, fallback={"generic", "noresource"} + TEST_SELECT_STRING, + service_name=sess_instance.name, + hints=hints, + fallback={"generic", "noresource"}, ) select_inst = odps.execute_sql_interactive( @@ -286,7 +331,9 @@ def test_session_sql_with_instance_tunnel(odps): odps.delete_table(TEST_TABLE_NAME, if_exists=True) table = odps.create_table(TEST_TABLE_NAME, TEST_CREATE_SCHEMA) assert table - sess_instance = odps._create_mcqa_session(TEST_SESSION_WORKERS, TEST_SESSION_WORKER_MEMORY) + sess_instance = odps._create_mcqa_session( + TEST_SESSION_WORKERS, TEST_SESSION_WORKER_MEMORY + ) assert sess_instance # wait to running _wait_session_startup(sess_instance) @@ -297,7 +344,9 @@ def test_session_sql_with_instance_tunnel(odps): select_inst.wait_for_success() rows = [] - with _dump_instance_results(select_inst), select_inst.open_reader(tunnel=True) as rd: + with _dump_instance_results(select_inst), select_inst.open_reader( + tunnel=True + ) as rd: for each_row in rd: rows.append(each_row.values) @@ -328,36 +377,48 @@ def test_fallback_policy(): assert getattr(set_str_policy, policy_name) assert policy_name in repr(set_policy) - assert policy.get_mode_from_exception( - errors.SQARetryError("Retry") - ) == FallbackMode.INTERACTIVE - assert policy.get_mode_from_exception( - errors.ODPSError("Job is cancelled") - ) is None - assert all_policy.get_mode_from_exception( - errors.ODPSError("MiscError") - ) == FallbackMode.OFFLINE - assert set_policy.get_mode_from_exception( - errors.ODPSError("MiscError") - ) is None - assert set_policy.get_mode_from_exception( - errors.SQAGenericError("MiscError") - ) is FallbackMode.OFFLINE - assert default_policy.get_mode_from_exception( - errors.SQAGenericError("MiscError") - ) is None - assert default_policy.get_mode_from_exception( - errors.SQAUnsupportedFeature("UnsupportedFeature") - ) is FallbackMode.OFFLINE - assert default_policy.get_mode_from_exception( - errors.SQAServiceUnavailable("ServiceUnavailable") - ) is FallbackMode.OFFLINE - assert default_policy.get_mode_from_exception( - errors.SQAResourceNotEnough("ResourceNotEnough") - ) is FallbackMode.OFFLINE - assert default_policy.get_mode_from_exception( - errors.SQAQueryTimedout("QueryTimedout") - ) is FallbackMode.OFFLINE + assert ( + policy.get_mode_from_exception(errors.SQARetryError("Retry")) + == FallbackMode.INTERACTIVE + ) + assert ( + policy.get_mode_from_exception(errors.ODPSError("Job is cancelled")) is None + ) + assert ( + all_policy.get_mode_from_exception(errors.ODPSError("MiscError")) + == FallbackMode.OFFLINE + ) + assert set_policy.get_mode_from_exception(errors.ODPSError("MiscError")) is None + assert ( + set_policy.get_mode_from_exception(errors.SQAGenericError("MiscError")) + is FallbackMode.OFFLINE + ) + assert ( + default_policy.get_mode_from_exception(errors.SQAGenericError("MiscError")) + is None + ) + assert ( + default_policy.get_mode_from_exception( + errors.SQAUnsupportedFeature("UnsupportedFeature") + ) + is FallbackMode.OFFLINE + ) + assert ( + default_policy.get_mode_from_exception( + 
errors.SQAServiceUnavailable("ServiceUnavailable") + ) + is FallbackMode.OFFLINE + ) + assert ( + default_policy.get_mode_from_exception( + errors.SQAResourceNotEnough("ResourceNotEnough") + ) + is FallbackMode.OFFLINE + ) + assert ( + default_policy.get_mode_from_exception(errors.SQAQueryTimedout("QueryTimedout")) + is FallbackMode.OFFLINE + ) def test_reuse_session(odps): diff --git a/odps/models/tests/test_storage_tier.py b/odps/models/tests/test_storage_tier.py index d0a017ec..e4e9be27 100644 --- a/odps/models/tests/test_storage_tier.py +++ b/odps/models/tests/test_storage_tier.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/odps/models/tests/test_tableio.py b/odps/models/tests/test_tableio.py index 80db5492..de51e8eb 100644 --- a/odps/models/tests/test_tableio.py +++ b/odps/models/tests/test_tableio.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import errno import logging import multiprocessing import sys @@ -29,34 +28,63 @@ import mock import pytest -from ...compat import six, futures +from ... import types as odps_types +from ...compat import futures, six from ...config import options from ...errors import NoSuchObject -from ..tableio import MPBlockClient, MPBlockServer -from ...tests.core import tn, pandas_case +from ...tests.core import get_test_unique_name, pandas_case, py_and_c, pyarrow_case, tn +from ...tunnel import TableTunnel from ...utils import to_text -from .. import TableSchema, Record +from .. 
import Record, TableSchema +from ..tableio import MPBlockClient, MPBlockServer + + +def _reloader(): + from ...conftest import get_config + + cfg = get_config() + cfg.tunnel = TableTunnel(cfg.odps, endpoint=cfg.odps._tunnel_endpoint) + + +py_and_c_deco = py_and_c( + [ + "odps.models.record", + "odps.models", + "odps.tunnel.io.reader", + "odps.tunnel.io.writer", + "odps.tunnel.tabletunnel", + "odps.tunnel.instancetunnel", + ], + _reloader, +) @pytest.mark.parametrize("use_legacy", [False, True]) def test_record_read_write_table(odps, use_legacy): - test_table_name = tn('pyodps_t_tmp_read_write_table') - schema = TableSchema.from_lists(['id', 'name', 'right'], ['bigint', 'string', 'boolean']) + test_table_name = tn("pyodps_t_tmp_read_write_table_" + get_test_unique_name(5)) + schema = TableSchema.from_lists( + ["id", "name", "right"], ["bigint", "string", "boolean"] + ) odps.delete_table(test_table_name, if_exists=True) assert odps.exist_table(test_table_name) is False table = odps.create_table(test_table_name, schema) - data = [[111, 'aaa', True], - [222, 'bbb', False], - [333, 'ccc', True], - [5940813139082772990, '中文', False]] + data = [ + [111, "aaa", True], + [222, "bbb", False], + [333, "ccc", True], + [5940813139082772990, "中文", False], + ] length = len(data) records = [Record(schema=schema, values=values) for values in data] texted_data = [[it[0], to_text(it[1]), it[2]] for it in data] - odps.write_table(table, 0, records) + if use_legacy: + records = (rec for rec in records) + + odps.write_table(table, records) assert texted_data == [record.values for record in odps.read_table(table, length)] assert texted_data[::2] == [ record.values for record in odps.read_table(table, length, step=2) @@ -74,24 +102,30 @@ def test_record_read_write_table(odps, use_legacy): def test_array_iter_read_write_table(odps): - test_table_name = tn('pyodps_t_tmp_read_write_table') - schema = TableSchema.from_lists(['id', 'name', 'right'], ['bigint', 'string', 'boolean']) + test_table_name = tn("pyodps_t_tmp_array_iter_read_write_table") + schema = TableSchema.from_lists( + ["id", "name", "right"], ["bigint", "string", "boolean"] + ) odps.delete_table(test_table_name, if_exists=True) assert odps.exist_table(test_table_name) is False table = odps.create_table(test_table_name, schema) - data = [[111, 'aaa', True], - [222, 'bbb', False], - [333, 'ccc', True], - [444, '中文', False]] + data = [ + [111, "aaa", True], + [222, "bbb", False], + [333, "ccc", True], + [444, "中文", False], + ] length = len(data) texted_data = [[it[0], to_text(it[1]), it[2]] for it in data] odps.write_table(table, 0, data) assert texted_data == [record.values for record in odps.read_table(table, length)] - assert texted_data[::2] == [record.values for record in odps.read_table(table, length, step=2)] + assert texted_data[::2] == [ + record.values for record in odps.read_table(table, length, step=2) + ] assert texted_data == [record.values for record in table.head(length)] @@ -106,9 +140,9 @@ def test_array_iter_read_write_table(odps): def test_read_write_partition_table(odps): - test_table_name = tn('pyodps_t_tmp_read_write_partition_table') + test_table_name = tn("pyodps_t_tmp_read_write_partition_table") schema = TableSchema.from_lists( - ['id', 'name'], ['bigint', 'string'], ['pt'], ['string'] + ["id", "name"], ["bigint", "string"], ["pt"], ["string"] ) odps.delete_table(test_table_name, if_exists=True) @@ -117,8 +151,8 @@ def test_read_write_partition_table(odps): table = odps.create_table(test_table_name, schema) table._upload_ids = 
dict() - pt1 = 'pt=20151122' - pt2 = 'pt=20151123' + pt1 = "pt=20151122" + pt2 = "pt=20151123" table.create_partition(pt1) table.create_partition(pt2) @@ -126,12 +160,12 @@ def test_read_write_partition_table(odps): assert len(list(reader)) == 0 with table.open_writer(pt1, commit=False) as writer: - record = table.new_record([1, 'name1']) + record = table.new_record([1, "name1"]) writer.write(record) record = table.new_record() record[0] = 3 - record[1] = 'name3' + record[1] = "name3" writer.write(record) assert len(table._upload_ids) == 1 @@ -141,105 +175,155 @@ def test_read_write_partition_table(odps): assert upload_id == list(table._upload_ids.values())[0] with table.open_writer(pt2) as writer: - writer.write([2, 'name2']) + writer.write([2, "name2"]) with table.open_reader(pt1, reopen=True) as reader: records = list(reader) assert len(records) == 2 assert sum(r[0] for r in records) == 4 + with table.open_reader(pt2, append_partitions=False, reopen=True) as reader: + records = list(reader) + assert len(records[0]) == 2 + assert len(records) == 1 + assert sum(r[0] for r in records) == 2 + with table.open_reader(pt2, reopen=True) as reader: records = list(reader) + assert len(records[0]) == 3 assert len(records) == 1 assert sum(r[0] for r in records) == 2 + # need to guarantee generators of + odps.write_table(table, (rec for rec in records), partition=pt2) + ret_records = list(odps.read_table(table, partition=pt2)) + assert len(ret_records) == 2 + + if pa is not None and pd is not None: + with table.open_reader(pt2, arrow=True, reopen=True) as reader: + result = reader.to_pandas() + assert len(result.dtypes) == 2 + + with table.open_reader( + pt2, arrow=True, append_partitions=True, reopen=True + ) as reader: + result = reader.to_pandas() + assert len(result.dtypes) == 3 + table.drop() def test_simple_record_read_write_table(odps): - test_table_name = tn('pyodps_t_tmp_simpe_read_write_table') - schema = TableSchema.from_lists(['num'], ['string'], ['pt'], ['string']) + test_table_name = tn("pyodps_t_tmp_simple_record_read_write_table") + schema = TableSchema.from_lists(["num"], ["string"], ["pt"], ["string"]) odps.delete_table(test_table_name, if_exists=True) table = odps.create_table(test_table_name, schema) - partition = 'pt=20151122' + partition = "pt=20151122" table.create_partition(partition) with table.open_writer(partition) as writer: record = table.new_record() - record[0] = '1' + record[0] = "1" writer.write(record) with table.open_reader(partition) as reader: assert reader.count == 1 record = next(reader) - assert record[0] == '1' - assert record.num == '1' + assert record[0] == "1" + assert record.num == "1" if pd is not None: with table.open_reader(partition, reopen=True) as reader: pd_data = reader.to_pandas() assert len(pd_data) == 1 - partition = 'pt=20151123' - pytest.raises(NoSuchObject, lambda: table.open_writer(partition, create_partition=False)) + partition = "pt=20151123" + with pytest.raises(NoSuchObject): + table.open_writer(partition, create_partition=False) with table.open_writer(partition, create_partition=True) as writer: record = table.new_record() - record[0] = '1' + record[0] = "1" writer.write(record) with table.open_reader(partition) as reader: assert reader.count == 1 record = next(reader) - assert record[0] == '1' - assert record.num == '1' + assert record[0] == "1" + assert record.num == "1" table.drop() def test_simple_array_read_write_table(odps): - test_table_name = tn('pyodps_t_tmp_simpe_read_write_table') - schema = TableSchema.from_lists(['num'], 
['string'], ['pt'], ['string']) + test_table_name = tn("pyodps_t_tmp_simple_array_read_write_table") + schema = TableSchema.from_lists(["num"], ["string"], ["pt"], ["string"]) odps.delete_table(test_table_name, if_exists=True) table = odps.create_table(test_table_name, schema) - partition = 'pt=20151122' + partition = "pt=20151122" table.create_partition(partition) with table.open_writer(partition) as writer: - writer.write(['1', ]) + writer.write(["1"]) with table.open_reader(partition) as reader: assert reader.count == 1 record = next(reader) - assert record[0] == '1' - assert record.num == '1' + assert record[0] == "1" + assert record.num == "1" with table.open_reader(partition, async_mode=True, reopen=True) as reader: assert reader.count == 1 record = next(reader) - assert record[0] == '1' - assert record.num == '1' + assert record[0] == "1" + assert record.num == "1" table.drop() def test_table_write_error(odps): - test_table_name = tn('pyodps_t_tmp_test_table_write') - schema = TableSchema.from_lists(['name'], ['string']) + test_table_name = tn("pyodps_t_tmp_test_table_write_error") + schema = TableSchema.from_lists(["name"], ["string"]) odps.delete_table(test_table_name, if_exists=True) table = odps.create_table(test_table_name, schema) try: with table.open_writer() as writer: - writer.write([['Content']]) - raise ValueError('Mock error') + writer.write([["Content"]]) + raise ValueError("Mock error") except ValueError as ex: - assert str(ex) == 'Mock error' + assert str(ex) == "Mock error" + + +@pandas_case +@pyarrow_case +def test_table_to_pandas(odps): + test_table_name = tn("pyodps_t_tmp_to_pandas") + schema = TableSchema.from_lists(["num"], ["bigint"]) + odps.delete_table(test_table_name, if_exists=True) + + table = odps.create_table(test_table_name, schema, lifecycle=1) + with table.open_writer(arrow=True) as writer: + writer.write(pd.DataFrame({"num": np.random.randint(0, 1000, 1000)})) + + pd_data = table.to_pandas(columns=["num"], start=10, count=20) + assert len(pd_data) == 20 + + pd_data = table.to_pandas(columns=["num"], start=10) + assert len(pd_data) == 990 + + batches = [] + for batch in table.iter_pandas(columns=["num"], start=10, count=30, batch_size=10): + assert len(batch) == 10 + batches.append(batch) + assert len(batches) == 3 + + table.drop() @pandas_case @@ -249,12 +333,12 @@ def test_multi_process_to_pandas(odps): if pa is None: pytest.skip("Need pyarrow to run the test.") - test_table_name = tn('pyodps_t_tmp_mproc_read_table') - schema = TableSchema.from_lists(['num'], ['bigint']) + test_table_name = tn("pyodps_t_tmp_mproc_read_table") + schema = TableSchema.from_lists(["num"], ["bigint"]) odps.delete_table(test_table_name, if_exists=True) - table = odps.create_table(test_table_name, schema) + table = odps.create_table(test_table_name, schema, lifecycle=1) with table.open_writer(arrow=True) as writer: writer.write(pd.DataFrame({"num": np.random.randint(0, 1000, 1000)})) @@ -271,7 +355,8 @@ def patched(self, start, *args, **kwargs): with pytest.raises(ValueError): with mock.patch( - "odps.tunnel.tabletunnel.TableDownloadSession.open_record_reader", new=patched + "odps.tunnel.tabletunnel.TableDownloadSession.open_record_reader", + new=patched, ): with table.open_reader() as reader: reader.to_pandas(n_process=2) @@ -280,23 +365,29 @@ def patched(self, start, *args, **kwargs): pd_data = reader.to_pandas(n_process=2) assert len(pd_data) == 1000 + table.drop() + @pandas_case def test_column_select_to_pandas(odps): if pa is None: pytest.skip("Need pyarrow to run the 
test.") - test_table_name = tn('pyodps_t_tmp_col_select_table') - schema = TableSchema.from_lists(['num1', 'num2'], ['bigint', 'bigint']) + test_table_name = tn("pyodps_t_tmp_col_select_table") + schema = TableSchema.from_lists(["num1", "num2"], ["bigint", "bigint"]) odps.delete_table(test_table_name, if_exists=True) table = odps.create_table(test_table_name, schema) with table.open_writer(arrow=True) as writer: - writer.write(pd.DataFrame({ - "num1": np.random.randint(0, 1000, 1000), - "num2": np.random.randint(0, 1000, 1000), - })) + writer.write( + pd.DataFrame( + { + "num1": np.random.randint(0, 1000, 1000), + "num2": np.random.randint(0, 1000, 1000), + } + ) + ) with table.open_reader(columns=["num1"]) as reader: pd_data = reader.to_pandas() @@ -313,9 +404,8 @@ def test_column_select_to_pandas(odps): def test_complex_type_to_pandas(odps): test_table_name = tn("pyodps_t_tmp_complex_type_to_pd") schema = TableSchema.from_lists( - ['cp1', 'cp2', 'cp3'], [ - 'array', 'map', 'struct' - ] + ["cp1", "cp2", "cp3"], + ["array", "map", "struct"], ) odps.delete_table(test_table_name, if_exists=True) @@ -342,9 +432,10 @@ def test_complex_type_to_pandas(odps): OrderedDict(pd_data.iloc[0, 2]), ] == row + @pandas_case def test_record_to_pandas_batches(odps): - test_table_name = tn('pyodps_t_read_in_batches') + test_table_name = tn("pyodps_t_read_in_batches") odps.delete_table(test_table_name, if_exists=True) rec_count = 37 @@ -371,13 +462,13 @@ def test_record_to_pandas_batches(odps): @pytest.mark.skipif(pa is None, reason="Need pyarrow to run this test") def test_simple_arrow_read_write_table(odps): - test_table_name = tn('pyodps_t_tmp_simple_arrow_read_write_table') - schema = TableSchema.from_lists(['num'], ['string'], ['pt'], ['string']) + test_table_name = tn("pyodps_t_tmp_simple_arrow_read_write_table") + schema = TableSchema.from_lists(["num"], ["string"], ["pt"], ["string"]) odps.delete_table(test_table_name, if_exists=True) table = odps.create_table(test_table_name, schema) - partition = 'pt=20151122' + partition = "pt=20151122" table.create_partition(partition) with table.open_writer(partition, arrow=True) as writer: @@ -413,51 +504,6 @@ def test_simple_arrow_read_write_table(odps): table.drop() -def test_read_with_retry(odps): - test_table_name = tn('pyodps_t_tmp_read_with_retry') - odps.delete_table(test_table_name, if_exists=True) - - table = odps.create_table(test_table_name, "col string") - - try: - data = [["str%d" % idx] for idx in range(10)] - with table.open_writer() as writer: - writer.write(data) - - from ..readers import TunnelRecordReader - original = TunnelRecordReader._open_and_iter_reader - exc_type = ConnectionResetError if six.PY3 else OSError - raised = [] - - def raise_conn_reset(): - exc = exc_type("Connection reset") - exc.errno = errno.ECONNRESET - raised.append(True) - raise exc - - def wrapped(self, start, *args, **kwargs): - for idx, rec in enumerate(original(self, start, *args, **kwargs)): - yield rec - if idx == 2: - raise_conn_reset() - - with mock.patch.object( - TunnelRecordReader, "_open_and_iter_reader", new=wrapped - ): - with table.open_reader() as reader: - assert data == sorted([rec[0]] for rec in reader) - assert len(raised) > 1 - - with pytest.raises(exc_type) as exc_info: - with table.open_reader(reopen=True) as reader: - for idx, _ in enumerate(reader): - if idx == 2: - raise_conn_reset() - assert exc_info.value.errno == errno.ECONNRESET - finally: - table.drop() - - def test_mp_block_server(): class MockWriter(object): def __init__(self): @@ 
-508,12 +554,10 @@ def _spawned_write(idx, writer, close=True): def test_multi_thread_write(odps): - test_table_name = tn('pyodps_t_tmp_multi_thread_write') + test_table_name = tn("pyodps_t_tmp_multi_thread_write") odps.delete_table(test_table_name, if_exists=True) - table = odps.create_table( - test_table_name, "col1 bigint, col2 bigint, col3 string" - ) + table = odps.create_table(test_table_name, "col1 bigint, col2 bigint, col3 string") try: pool = futures.ThreadPoolExecutor(2) @@ -528,24 +572,26 @@ def test_multi_thread_write(odps): with table.open_reader() as reader: results = sorted([rec.values for rec in reader]) assert results == [ - [0, 0, "row1"], [0, 1, "row2"], [1, 0, "row1"], [1, 1, "row2"] + [0, 0, "row1"], + [0, 1, "row2"], + [1, 0, "row1"], + [1, 1, "row2"], ] finally: table.drop() @pytest.mark.parametrize( - "ctx_name", [ - "fork", "forkserver", "spawn" - ] if sys.version_info[0] > 2 and sys.platform != "win32" else ["spawn"] + "ctx_name", + ["fork", "forkserver", "spawn"] + if sys.version_info[0] > 2 and sys.platform != "win32" + else ["spawn"], ) def test_multi_process_write(odps, ctx_name): - test_table_name = tn('pyodps_t_tmp_multi_process_write') + test_table_name = tn("pyodps_t_tmp_multi_process_write") odps.delete_table(test_table_name, if_exists=True) - table = odps.create_table( - test_table_name, "col1 bigint, col2 bigint, col3 string" - ) + table = odps.create_table(test_table_name, "col1 bigint, col2 bigint, col3 string") if sys.version_info[0] > 2: orig_ctx = multiprocessing.get_start_method() @@ -555,7 +601,9 @@ def test_multi_process_write(odps, ctx_name): with table.open_writer() as writer: procs = [] for idx in range(2): - proc = multiprocessing.Process(target=_spawned_write, args=(idx, writer)) + proc = multiprocessing.Process( + target=_spawned_write, args=(idx, writer) + ) proc.start() procs.append(proc) @@ -565,7 +613,10 @@ def test_multi_process_write(odps, ctx_name): with table.open_reader() as reader: results = sorted([rec.values for rec in reader]) assert results == [ - [0, 0, "row1"], [0, 1, "row2"], [1, 0, "row1"], [1, 1, "row2"] + [0, 0, "row1"], + [0, 1, "row2"], + [1, 0, "row1"], + [1, 1, "row2"], ] finally: if sys.version_info[0] > 2: @@ -574,12 +625,10 @@ def test_multi_process_write(odps, ctx_name): def test_multi_process_pool_write(odps): - test_table_name = tn('pyodps_t_tmp_multi_process_pool_write') + test_table_name = tn("pyodps_t_tmp_multi_process_pool_write") odps.delete_table(test_table_name, if_exists=True) - table = odps.create_table( - test_table_name, "col1 bigint, col2 bigint, col3 string" - ) + table = odps.create_table(test_table_name, "col1 bigint, col2 bigint, col3 string") try: with table.open_writer() as writer: @@ -592,7 +641,177 @@ def test_multi_process_pool_write(odps): with table.open_reader() as reader: results = sorted([rec.values for rec in reader]) assert results == [ - [0, 0, "row1"], [0, 1, "row2"], [1, 0, "row1"], [1, 1, "row2"] + [0, 0, "row1"], + [0, 1, "row2"], + [1, 0, "row1"], + [1, 1, "row2"], ] finally: table.drop() + + +@pandas_case +@pyarrow_case +@pytest.mark.parametrize("use_arrow", [False, True]) +def test_write_table_with_pandas_or_arrow(odps, use_arrow): + suffix = "arrow" if use_arrow else "pd" + test_table_name = tn("pyodps_t_tmp_write_table_pandas_arrow_" + suffix) + odps.delete_table(test_table_name, if_exists=True) + + data = pd.DataFrame( + pd.DataFrame( + [["falcon", 2, 2], ["dog", 4, 0], ["cat", 4, 0], ["ant", 6, 0]], + columns=["names", "num_legs", "num_wings"], + ) + ) + + # test write 
pandas dataframe + with pytest.raises(NoSuchObject): + odps.write_table(test_table_name, data, lifecycle=1) + + try: + if use_arrow: + data_to_write = pa.Table.from_pandas(data) + else: + data_to_write = data + + odps.write_table(test_table_name, data_to_write, create_table=True, lifecycle=1) + fetched = odps.get_table(test_table_name).to_pandas() + pd.testing.assert_frame_equal(data, fetched) + finally: + odps.delete_table(test_table_name, if_exists=True) + + +@pandas_case +@pyarrow_case +def test_write_table_with_pandas_or_arrow_parted(odps): + test_table_name = tn("pyodps_t_tmp_write_table_pandas_arrow_parted") + odps.delete_table(test_table_name, if_exists=True) + + data = pd.DataFrame( + pd.DataFrame( + [["falcon", 2, 2], ["dog", 4, 0], ["cat", 4, 0], ["ant", 6, 0]], + columns=["names", "num_legs", "num_wings"], + ) + ) + try: + odps.write_table( + test_table_name, + data, + partition=odps_types.PartitionSpec("pt=test"), + create_table=True, + create_partition=True, + lifecycle=1, + ) + fetched = odps.get_table(test_table_name).to_pandas(partition="pt=test") + pd.testing.assert_frame_equal(data, fetched) + + schema = odps.get_table(test_table_name).table_schema + assert len(schema.simple_columns) == len(data.columns) + + odps.write_table( + test_table_name, + data, + partition="pt=test2", + create_partition=True, + lifecycle=1, + ) + fetched = odps.get_table(test_table_name).to_pandas(partition="pt=test2") + pd.testing.assert_frame_equal(data, fetched) + finally: + odps.delete_table(test_table_name, if_exists=True) + + +@pandas_case +@pyarrow_case +@pytest.mark.parametrize("use_arrow", [False, True]) +def test_write_pandas_with_dynamic_parts(odps, use_arrow): + suffix = "arrow" if use_arrow else "pd" + test_table_name = tn("pyodps_t_tmp_write_pandas_dyn_parts_" + suffix) + odps.delete_table(test_table_name, if_exists=True) + + data = pd.DataFrame( + [[0, 134, "a", "a"], [1, 24, "a", "b"], [2, 131, "a", "a"], [3, 141, "a", "b"]], + columns=["a", "b", "p1", "p2"], + ) + + try: + if use_arrow: + data_to_write = pa.Table.from_pandas(data) + else: + data_to_write = data + + odps.write_table( + test_table_name, + data_to_write, + create_table=True, + partitions=["p1", "p2"], + create_partition=True, + lifecycle=1, + ) + fetched = odps.get_table(test_table_name).to_pandas(partition="p1=a,p2=a") + expected = data[data.p1 == "a"][data.p2 == "a"][["a", "b"]].reset_index( + drop=True + ) + pd.testing.assert_frame_equal(fetched, expected) + + fetched = odps.get_table(test_table_name).to_pandas(partition="p1=a,p2=b") + expected = data[data.p1 == "a"][data.p2 == "b"][["a", "b"]].reset_index( + drop=True + ) + pd.testing.assert_frame_equal(fetched, expected) + finally: + odps.delete_table(test_table_name, if_exists=True) + + +@py_and_c_deco +def test_write_record_with_dynamic_parts(odps): + test_table_name = tn("pyodps_t_tmp_write_rec_dyn_parts") + odps.delete_table(test_table_name, if_exists=True) + + data = [[0, 134, "a"], [1, 24, "b"], [2, 131, "a"], [3, 141, "b"]] + schema = odps_types.OdpsSchema( + [ + odps_types.Column("a", odps_types.bigint), + odps_types.Column("b", odps_types.bigint), + ], + [ + odps_types.Column("p1", odps_types.string), + odps_types.Column("pt", odps_types.string), + ], + ) + + try: + odps.create_table(test_table_name, schema, lifecycle=1) + + with pytest.raises(ValueError): + odps.write_table( + test_table_name, + odps, + partitions="p1", + partition="pt=test", + create_partition=True, + ) + + odps.write_table( + test_table_name, + data, + partitions="p1", + 
partition="pt=test", + create_partition=True, + ) + fetched = [ + r.values[:2] + for r in odps.read_table(test_table_name, partition="p1=a,pt=test") + ] + expected = [d[:2] for d in data if d[2:] == ["a"]] + assert fetched == expected + + fetched = [ + r.values[:2] + for r in odps.read_table(test_table_name, partition="p1=b,pt=test") + ] + expected = [d[:2] for d in data if d[2:] == ["b"]] + assert fetched == expected + finally: + odps.delete_table(test_table_name, if_exists=True) diff --git a/odps/models/tests/test_tables.py b/odps/models/tests/test_tables.py index 349f9f99..efa6c24c 100644 --- a/odps/models/tests/test_tables.py +++ b/odps/models/tests/test_tables.py @@ -33,8 +33,8 @@ from ...config import options from ...tests.core import tn from ...utils import to_text -from .. import Table, TableSchema, Column, Partition -from ..cluster_info import ClusterType, ClusterSortOrder +from .. import Column, Partition, Table, TableSchema +from ..cluster_info import ClusterSortOrder, ClusterType from ..storage_tier import StorageTier @@ -58,12 +58,12 @@ def test_tables(odps): def test_schema_pickle(): - schema = TableSchema.from_lists(['name'], ['string']) + schema = TableSchema.from_lists(["name"], ["string"]) assert schema == pickle.loads(pickle.dumps(schema)) def test_table_exists(odps): - non_exists_table = 'a_non_exists_table' + non_exists_table = "a_non_exists_table" assert odps.exist_table(non_exists_table) is False @@ -78,48 +78,50 @@ def test_table(odps): assert table is odps.get_table(table.name) - assert table._getattr('format') is None - assert table._getattr('table_schema') is None - assert table._getattr('comment') is None - assert table._getattr('owner') is not None - assert table._getattr('table_label') is None - assert table._getattr('creation_time') is None - assert table._getattr('last_data_modified_time') is None - assert table._getattr('last_meta_modified_time') is None - assert table._getattr('is_virtual_view') is None - assert table._getattr('lifecycle') is None - assert table._getattr('view_text') is None - assert table._getattr('size') is None - assert table._getattr('is_archived') is None - assert table._getattr('physical_size') is None - assert table._getattr('file_num') is None + assert table._getattr("format") is None + assert table._getattr("table_schema") is None + assert table._getattr("comment") is None + assert table._getattr("owner") is not None + assert table._getattr("table_label") is None + assert table._getattr("creation_time") is None + assert table._getattr("last_data_modified_time") is None + assert table._getattr("last_meta_modified_time") is None + assert table._getattr("is_virtual_view") is None + assert table._getattr("lifecycle") is None + assert table._getattr("view_text") is None + assert table._getattr("size") is None + assert table._getattr("is_archived") is None + assert table._getattr("physical_size") is None + assert table._getattr("file_num") is None assert table.is_loaded is False assert table._is_extend_info_loaded is False - test_table_name = tn('pyodps_t_tmp_test_table_attrs') - schema = TableSchema.from_lists(['id', 'name'], ['bigint', 'string'], ['ds', ], ['string',]) + test_table_name = tn("pyodps_t_tmp_test_table_attrs") + schema = TableSchema.from_lists( + ["id", "name"], ["bigint", "string"], ["ds"], ["string"] + ) odps.delete_table(test_table_name, if_exists=True) table = odps.create_table(test_table_name, schema, lifecycle=1) try: assert table is odps.get_table(table.name) - assert table._getattr('format') is None - assert 
table._getattr('table_schema') is None - assert table._getattr('comment') is None - assert table._getattr('owner') is None - assert table._getattr('table_label') is None - assert table._getattr('creation_time') is None - assert table._getattr('last_data_modified_time') is None - assert table._getattr('last_meta_modified_time') is None - assert table._getattr('is_virtual_view') is None - assert table._getattr('lifecycle') is None - assert table._getattr('view_text') is None - assert table._getattr('size') is None - assert table._getattr('is_archived') is None - assert table._getattr('physical_size') is None - assert table._getattr('file_num') is None + assert table._getattr("format") is None + assert table._getattr("table_schema") is None + assert table._getattr("comment") is None + assert table._getattr("owner") is None + assert table._getattr("table_label") is None + assert table._getattr("creation_time") is None + assert table._getattr("last_data_modified_time") is None + assert table._getattr("last_meta_modified_time") is None + assert table._getattr("is_virtual_view") is None + assert table._getattr("lifecycle") is None + assert table._getattr("view_text") is None + assert table._getattr("size") is None + assert table._getattr("is_archived") is None + assert table._getattr("physical_size") is None + assert table._getattr("file_num") is None assert table.is_loaded is False assert table._is_extend_info_loaded is False @@ -147,29 +149,34 @@ def test_table(odps): def test_create_table_ddl(odps): - test_table_name = tn('pyodps_t_tmp_table_ddl') + test_table_name = tn("pyodps_t_tmp_table_ddl") schema = TableSchema.from_lists( - ['id', 'name'], ['bigint', 'string'], ['ds', ], ['string',] + ["id", "name"], ["bigint", "string"], ["ds"], ["string"] ) odps.delete_table(test_table_name, if_exists=True) table = odps.create_table(test_table_name, schema, lifecycle=10) ddl = table.get_ddl() - assert 'EXTERNAL' not in ddl - assert 'NOT EXISTS' not in ddl + assert "EXTERNAL" not in ddl + assert "NOT EXISTS" not in ddl for col in table.table_schema.names: assert col in ddl ddl = table.get_ddl(if_not_exists=True) - assert 'NOT EXISTS' in ddl + assert "NOT EXISTS" in ddl ddl = Table.gen_create_table_sql( - 'test_external_table', schema, comment='TEST_COMMENT', - storage_handler='com.aliyun.odps.CsvStorageHandler', - serde_properties=OrderedDict([('name1', 'value1'), ('name2', 'value2')]), - location='oss://mock_endpoint/mock_bucket/mock_path/', + "test_external_table", + schema, + comment="TEST_COMMENT", + storage_handler="com.aliyun.odps.CsvStorageHandler", + serde_properties=OrderedDict([("name1", "value1"), ("name2", "value2")]), + location="oss://mock_endpoint/mock_bucket/mock_path/", ) - assert ddl == textwrap.dedent(""" + assert ( + ddl + == textwrap.dedent( + """ CREATE EXTERNAL TABLE `test_external_table` ( `id` BIGINT, `name` STRING @@ -184,13 +191,15 @@ def test_create_table_ddl(odps): 'name2' = 'value2' ) LOCATION 'oss://mock_endpoint/mock_bucket/mock_path/' - """).strip() + """ + ).strip() + ) def test_create_delete_table(odps): - test_table_name = tn('pyodps_t_tmp_create_table') + test_table_name = tn("pyodps_t_tmp_create_table") schema = TableSchema.from_lists( - ['id', 'name'], ['bigint', 'string'], ['ds', ], ['string',] + ["id", "name"], ["bigint", "string"], ["ds"], ["string"] ) tables = odps._project.tables @@ -200,7 +209,7 @@ def test_create_delete_table(odps): table = tables.create(test_table_name, schema, lifecycle=10) - assert table._getattr('owner') is None + assert 
table._getattr("owner") is None assert table.owner is not None assert table.name == test_table_name @@ -211,7 +220,7 @@ def test_create_delete_table(odps): tables.delete(test_table_name, if_exists=True) assert odps.exist_table(test_table_name) is False - str_schema = ('id bigint, name string', 'ds string') + str_schema = ("id bigint, name string", "ds string") table = tables.create(test_table_name, str_schema, lifecycle=10) assert table.name == test_table_name @@ -232,20 +241,21 @@ def test_create_delete_table(odps): def test_create_table_with_chinese_column(odps): - test_table_name = tn('pyodps_t_tmp_create_table_with_chinese_columns') + test_table_name = tn("pyodps_t_tmp_create_table_with_chinese_columns") columns = [ - Column(name='序列', type='bigint', comment='注释'), - Column(name=u'值', type=u'string', comment=u'注释2'), + Column(name="序列", type="bigint", comment="注释"), + Column(name=u"值", type=u"string", comment=u"注释2"), ] partitions = [ - Partition(name='ds', type='string', comment='分区注释'), - Partition(name=u'ds2', type=u'string', comment=u'分区注释2'), + Partition(name="ds", type="string", comment="分区注释"), + Partition(name=u"ds2", type=u"string", comment=u"分区注释2"), ] schema = TableSchema(columns=columns, partitions=partitions) columns_repr = "[, ]" partitions_repr = "[, ]" - schema_repr = textwrap.dedent(""" + schema_repr = textwrap.dedent( + """ odps.Schema { 序列 bigint # 注释 值 string # 注释2 @@ -254,9 +264,11 @@ def test_create_table_with_chinese_column(odps): ds string # 分区注释 ds2 string # 分区注释2 } - """).strip() + """ + ).strip() - ddl_string_comment = textwrap.dedent(u""" + ddl_string_comment = textwrap.dedent( + u""" CREATE TABLE `table_name` ( `序列` BIGINT COMMENT '注释', `值` STRING COMMENT '注释2' @@ -264,8 +276,10 @@ def test_create_table_with_chinese_column(odps): PARTITIONED BY ( `ds` STRING COMMENT '分区注释', `ds2` STRING COMMENT '分区注释2' - )""").strip() - ddl_string = textwrap.dedent(u""" + )""" + ).strip() + ddl_string = textwrap.dedent( + u""" CREATE TABLE `table_name` ( `序列` BIGINT, `值` STRING @@ -273,7 +287,8 @@ def test_create_table_with_chinese_column(odps): PARTITIONED BY ( `ds` STRING, `ds2` STRING - )""").strip() + )""" + ).strip() assert repr(columns) == columns_repr assert repr(partitions) == partitions_repr @@ -294,7 +309,8 @@ def test_create_table_with_chinese_column(odps): # test repr with not null columns schema[u"序列"].nullable = False columns_repr = "[, ]" - schema_repr = textwrap.dedent(""" + schema_repr = textwrap.dedent( + """ odps.Schema { 序列 bigint not null # 注释 值 string # 注释2 @@ -303,24 +319,24 @@ def test_create_table_with_chinese_column(odps): ds string # 分区注释 ds2 string # 分区注释2 } - """).strip() + """ + ).strip() assert repr(columns) == columns_repr assert repr(schema).strip() == schema_repr -def test_create_transactional_table(odps_daily): - test_table_name = tn('pyodps_t_tmp_transactional') - schema = TableSchema.from_lists(['key', 'value'], ['string', 'string']) +def test_create_transactional_table(odps): + test_table_name = tn("pyodps_t_tmp_transactional") + schema = TableSchema.from_lists(["key", "value"], ["string", "string"]) schema["key"].nullable = False schema["key"].comment = "comment_text" schema["value"].comment = "comment_text2" - odps = odps_daily odps.delete_table(test_table_name, if_exists=True) assert odps.exist_table(test_table_name) is False table = odps.create_table( - test_table_name, schema, transactional=True, primary_key="key" + test_table_name, schema, transactional=True, primary_key="key", lifecycle=1 ) table.reload() assert not 
table.table_schema["key"].nullable @@ -343,7 +359,7 @@ def test_create_transactional_table(odps_daily): def test_create_tier_table(odps_with_storage_tier): odps = odps_with_storage_tier - test_table_name = tn('pyodps_t_tmp_tiered') + test_table_name = tn("pyodps_t_tmp_tiered") odps.delete_table(test_table_name, if_exists=True) assert odps.exist_table(test_table_name) is False @@ -358,7 +374,7 @@ def test_create_tier_table(odps_with_storage_tier): def test_create_clustered_table(odps): - test_table_name = tn('pyodps_t_tmp_clustered') + test_table_name = tn("pyodps_t_tmp_clustered") odps.delete_table(test_table_name, if_exists=True) assert odps.exist_table(test_table_name) is False @@ -376,7 +392,7 @@ def test_create_clustered_table(odps): assert "CLUSTERED BY" in table.get_ddl() table.drop() - test_table_name = tn('pyodps_t_tmp_range_clustered') + test_table_name = tn("pyodps_t_tmp_range_clustered") odps.delete_table(test_table_name, if_exists=True) assert odps.exist_table(test_table_name) is False @@ -394,14 +410,13 @@ def test_create_clustered_table(odps): def test_create_view(odps): - test_table_name = tn('pyodps_t_tmp_view_source') + test_table_name = tn("pyodps_t_tmp_view_source") odps.delete_table(test_table_name, if_exists=True) assert odps.exist_table(test_table_name) is False - table = odps.create_table( - test_table_name, ("col string, col2 string", "pt string") - ) + table = odps.create_table(test_table_name, ("col string, col2 string", "pt string")) - test_view_name = tn('pyodps_v_tmp_view') + test_view_name = tn("pyodps_v_tmp_view") + odps.delete_view(test_view_name, if_exists=True) odps.execute_sql( "create view %s comment 'comment_text' " "as select * from %s" % (test_view_name, test_table_name) @@ -412,7 +427,8 @@ def test_create_view(odps): assert "CREATE VIEW" in view.get_ddl() view.drop() - test_view_name = tn('pyodps_v_tmp_mt_view') + test_view_name = tn("pyodps_v_tmp_mt_view") + odps.delete_materialized_view(test_view_name, if_exists=True) odps.execute_sql( "create materialized view %s " "disable rewrite " @@ -428,7 +444,7 @@ def test_create_view(odps): def test_run_sql_clear_cache(odps): - test_table_name = tn('pyodps_t_tmp_statement_cache_clear') + test_table_name = tn("pyodps_t_tmp_statement_cache_clear") odps.delete_table(test_table_name, if_exists=True) odps.create_table(test_table_name, "col string") @@ -446,7 +462,7 @@ def test_run_sql_clear_cache(odps): def test_max_partition(odps): - test_table_name = tn('pyodps_t_tmp_max_partition') + test_table_name = tn("pyodps_t_tmp_max_partition") odps.delete_table(test_table_name, if_exists=True) table = odps.create_table(test_table_name, ("col string", "pt1 string, pt2 string")) @@ -459,12 +475,11 @@ def test_max_partition(odps): table.create_partition("pt1=c,pt2=e") assert tuple(table.get_max_partition().partition_spec.values()) == ("b", "c") - assert tuple( - table.get_max_partition(skip_empty=False).partition_spec.values() - ) == ("c", "e") - assert tuple( - table.get_max_partition("pt1=a").partition_spec.values() - ) == ("a", "b") + assert tuple(table.get_max_partition(skip_empty=False).partition_spec.values()) == ( + "c", + "e", + ) + assert tuple(table.get_max_partition("pt1=a").partition_spec.values()) == ("a", "b") assert table.get_max_partition("pt1=c") is None assert table.get_max_partition("pt1=d") is None @@ -483,11 +498,13 @@ def test_schema_arg_backward_compat(odps): with pytest.deprecated_call(): from .. 
import Schema - columns = [Column(name='num', type='bigint', comment='the column'), - Column(name='num2', type='double', comment='the column2')] + columns = [ + Column(name="num", type="bigint", comment="the column"), + Column(name="num2", type="double", comment="the column2"), + ] schema = Schema(columns=columns) - table_name = tn('test_backward_compat') + table_name = tn("test_backward_compat") with pytest.deprecated_call(): table = odps.create_table(table_name, schema=schema, lifecycle=1) diff --git a/odps/models/tests/test_tenant.py b/odps/models/tests/test_tenant.py index b571032d..31187189 100644 --- a/odps/models/tests/test_tenant.py +++ b/odps/models/tests/test_tenant.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/odps/models/tests/test_volumes.py b/odps/models/tests/test_volumes.py index 8a553f07..67fdbd70 100644 --- a/odps/models/tests/test_volumes.py +++ b/odps/models/tests/test_volumes.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,7 +16,10 @@ from __future__ import print_function +import warnings + import pytest + try: import oss2 except ImportError: @@ -27,36 +30,40 @@ from ...tests.core import tn from ...utils import to_str from .. import ( - PartedVolume, ExternalVolume, ExternalVolumeDir, ExternalVolumeFile, FSVolume, FSVolumeDir, FSVolumeFile, + PartedVolume, ) -FILE_CONTENT = to_str(""" +FILE_CONTENT = to_str( + """ Four score and seven years ago our fathers brought forth, upon this continent, a new nation, conceived in liberty, and dedicated to the proposition that "all men are created equal" -""") -FILE_CONTENT2 = to_str(""" +""" +) +FILE_CONTENT2 = to_str( + """ Were it to benefit my country I would lay down my life; What then is risk to me? 
-""") -TEST_PARTED_VOLUME_NAME = tn('pyodps_test_parted_volume') -TEST_FS_VOLUME_NAME = tn('pyodps_test_fs_volume') -TEST_EXT_VOLUME_NAME = tn('pyodps_test_external_volume') +""" +) +TEST_PARTED_VOLUME_NAME = tn("pyodps_test_parted_volume") +TEST_FS_VOLUME_NAME = tn("pyodps_test_fs_volume") +TEST_EXT_VOLUME_NAME = tn("pyodps_test_external_volume") -TEST_PARTITION_NAME = 'pyodps_test_partition' -TEST_FILE_NAME = 'test_output_file' -TEST_FILE_NAME2 = 'test_output_file2' -TEST_NEW_FILE_NAME = 'test_new_output_file' +TEST_PARTITION_NAME = "pyodps_test_partition" +TEST_FILE_NAME = "test_output_file" +TEST_FILE_NAME2 = "test_output_file2" +TEST_NEW_FILE_NAME = "test_new_output_file" -TEST_DIR_NAME = 'pyodps_test_dir' +TEST_DIR_NAME = "pyodps_test_dir" @pytest.fixture @@ -70,6 +77,17 @@ def auto_teardown_volumes(odps): odps.delete_volume(TEST_FS_VOLUME_NAME) +@pytest.fixture +def check_experimental(request): + with warnings.catch_warnings(record=True) as record: + warnings.simplefilter("always", category=FutureWarning) + yield + if request.param: + assert len(record) > 0, "No experimental warnings popped" + else: + assert len(record) == 0, "Unexpected experimental warnings popped" + + def test_volumes(odps): if odps.exist_volume(TEST_PARTED_VOLUME_NAME): odps.delete_volume(TEST_PARTED_VOLUME_NAME) @@ -93,7 +111,7 @@ def test_volumes(odps): assert odps.exist_volume(TEST_PARTED_VOLUME_NAME) is True assert odps.exist_volume(TEST_FS_VOLUME_NAME) is True - assert odps.exist_volume('non_existing_volume') is False + assert odps.exist_volume("non_existing_volume") is False for vol in odps.list_volumes(): assert vol.name is not None @@ -105,7 +123,7 @@ def test_volume_partition_and_file(odps): odps.create_parted_volume(TEST_PARTED_VOLUME_NAME) vol = odps.get_volume(TEST_PARTED_VOLUME_NAME) - partition_path = '/'.join(('', TEST_PARTED_VOLUME_NAME, TEST_PARTITION_NAME)) + partition_path = "/".join(("", TEST_PARTED_VOLUME_NAME, TEST_PARTITION_NAME)) partition = vol.get_partition(TEST_PARTITION_NAME) assert partition is odps.get_volume_partition(partition_path) with partition.open_writer() as writer: @@ -116,19 +134,21 @@ def test_volume_partition_and_file(odps): assert partition.length == len(FILE_CONTENT) + len(FILE_CONTENT2) assert partition.file_number == 2 - file_path = '/'.join(('', TEST_PARTED_VOLUME_NAME, TEST_PARTITION_NAME, TEST_FILE_NAME)) + file_path = "/".join( + ("", TEST_PARTED_VOLUME_NAME, TEST_PARTITION_NAME, TEST_FILE_NAME) + ) file_obj = odps.get_volume_file(file_path) assert file_obj.name == TEST_FILE_NAME - assert odps.project + '/volumes/' + file_path.lstrip('/') == file_obj.path + assert odps.project + "/volumes/" + file_path.lstrip("/") == file_obj.path with partition.files[TEST_FILE_NAME].open_reader() as reader: out_content = reader.read() if not six.PY2: - out_content = out_content.decode('utf-8') + out_content = out_content.decode("utf-8") assert out_content == FILE_CONTENT assert vol.exist_partition(TEST_PARTITION_NAME) is True - assert vol.exist_partition('non_existing_partition') is False + assert vol.exist_partition("non_existing_partition") is False for part in odps.list_volume_partitions(TEST_PARTED_VOLUME_NAME): assert part.name is not None @@ -136,34 +156,46 @@ def test_volume_partition_and_file(odps): for f in partition.list_files(): assert f.name is not None assert len(list(odps.list_volume_files(partition_path))) == 2 - assert any(f.name == TEST_FILE_NAME for f in odps.list_volume_files(partition_path)) is True + assert ( + any(f.name == TEST_FILE_NAME for f in 
odps.list_volume_files(partition_path)) + is True + ) odps.delete_volume_partition(partition_path) assert odps.exist_volume_partition(partition_path) is False -def test_volume_fs(odps): +@pytest.mark.parametrize("check_experimental", [True], indirect=True) +def test_volume_fs(odps, check_experimental): if odps.exist_volume(TEST_FS_VOLUME_NAME): odps.delete_volume(TEST_FS_VOLUME_NAME) odps.create_fs_volume(TEST_FS_VOLUME_NAME) vol = odps.get_volume(TEST_FS_VOLUME_NAME) - odps.create_volume_directory(vol.path + '/' + TEST_DIR_NAME) + odps.create_volume_directory(vol.path + "/" + TEST_DIR_NAME) dir_obj = vol[TEST_DIR_NAME] assert isinstance(dir_obj, FSVolumeDir) - assert dir_obj is odps.get_volume_file(vol.path + '/' + TEST_DIR_NAME) - assert dir_obj.path == '/' + TEST_FS_VOLUME_NAME + '/' + TEST_DIR_NAME - assert any(f.path in (dir_obj.path, dir_obj.path + '/') - for f in odps.list_volume_files(vol.path)) is True + assert dir_obj is odps.get_volume_file(vol.path + "/" + TEST_DIR_NAME) + assert dir_obj.path == "/" + TEST_FS_VOLUME_NAME + "/" + TEST_DIR_NAME + assert ( + any( + f.path in (dir_obj.path, dir_obj.path + "/") + for f in odps.list_volume_files(vol.path) + ) + is True + ) - with odps.open_volume_writer(dir_obj.path + '/' + TEST_FILE_NAME) as writer: + with odps.open_volume_writer(dir_obj.path + "/" + TEST_FILE_NAME) as writer: writer.write(FILE_CONTENT) - assert 'non_existing_file' not in dir_obj + assert "non_existing_file" not in dir_obj assert TEST_FILE_NAME in dir_obj - assert any(f.basename == TEST_FILE_NAME - for f in odps.list_volume_files(dir_obj.path)) is True - with odps.open_volume_reader(dir_obj.path + '/' + TEST_FILE_NAME) as reader: + assert ( + any(f.basename == TEST_FILE_NAME for f in odps.list_volume_files(dir_obj.path)) + is True + ) + + with odps.open_volume_reader(dir_obj.path + "/" + TEST_FILE_NAME) as reader: content = reader.read() assert to_str(content) == FILE_CONTENT @@ -177,7 +209,7 @@ def test_volume_fs(odps): assert file_obj.replication == 5 old_dir_name = file_obj.dirname - odps.move_volume_file(file_obj.path, './/' + TEST_NEW_FILE_NAME, replication=10) + odps.move_volume_file(file_obj.path, ".//" + TEST_NEW_FILE_NAME, replication=10) assert old_dir_name == file_obj.dirname assert file_obj.basename == TEST_NEW_FILE_NAME assert file_obj.replication == 10 @@ -189,7 +221,8 @@ def test_volume_fs(odps): assert TEST_DIR_NAME not in vol -def test_external_volume(config, odps_daily): +@pytest.mark.parametrize("check_experimental", [False], indirect=True) +def test_external_volume(config, odps_daily, check_experimental): if not hasattr(config, "oss_bucket") or oss2 is None: pytest.skip("Need oss2 and config to run this test") @@ -209,11 +242,15 @@ def test_external_volume(config, odps_daily): oss_endpoint, ) = config.oss_config test_location = "oss://%s:%s@%s/%s/%s" % ( - oss_access_id, oss_secret_access_key, oss_endpoint, oss_bucket_name, test_dir_name + oss_access_id, + oss_secret_access_key, + oss_endpoint, + oss_bucket_name, + test_dir_name, ) vol = odps_daily.create_external_volume( - TEST_EXT_VOLUME_NAME, location=test_location + TEST_EXT_VOLUME_NAME, location=test_location, auto_create_dir=True ) try: assert isinstance(vol, ExternalVolume) @@ -232,9 +269,12 @@ def test_external_volume(config, odps_daily): with vol.open_writer(test_write_file_name) as writer: writer.write(FILE_CONTENT2.encode()) assert any(test_write_file_name in f.path for f in vol) - assert config.oss_bucket.get_object( - test_dir_name + "/" + test_write_file_name - ).read() == 
FILE_CONTENT2.encode() + assert ( + config.oss_bucket.get_object( + test_dir_name + "/" + test_write_file_name + ).read() + == FILE_CONTENT2.encode() + ) vol.delete(test_write_file_name) assert not any(test_write_file_name in f.path for f in vol) @@ -243,17 +283,15 @@ def test_external_volume(config, odps_daily): assert isinstance(dir_obj, ExternalVolumeDir) with dir_obj.open_writer(test_write_file_name) as writer: writer.write(FILE_CONTENT2.encode()) - assert config.oss_bucket.get_object( - "/".join([test_dir_name, test_subdir_name, test_write_file_name]) - ).read() == FILE_CONTENT2.encode() + assert ( + config.oss_bucket.get_object( + "/".join([test_dir_name, test_subdir_name, test_write_file_name]) + ).read() + == FILE_CONTENT2.encode() + ) with dir_obj.open_reader(test_write_file_name) as reader: assert reader.read() == FILE_CONTENT2.encode() dir_obj.delete(recursive=True) assert not any(test_subdir_name in f.path for f in vol) finally: - keys = [ - obj.key - for obj in oss2.ObjectIterator(config.oss_bucket, test_dir_name) - ] - config.oss_bucket.batch_delete_objects(keys) - vol.drop() + vol.drop(auto_remove_dir=True, recursive=True) diff --git a/odps/models/tests/test_xflows.py b/odps/models/tests/test_xflows.py index 5aaee06b..baea118a 100644 --- a/odps/models/tests/test_xflows.py +++ b/odps/models/tests/test_xflows.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,7 +19,7 @@ from ...utils import to_text from .. import XFlows -EXPECTED_XFLOW_INSTANCE_XML = ''' +EXPECTED_XFLOW_INSTANCE_XML = """ algo_project @@ -38,8 +38,8 @@ -''' -EXPECTED_PRIORITY_XFLOW_INSTANCE_XML = ''' +""" +EXPECTED_PRIORITY_XFLOW_INSTANCE_XML = """ algo_project @@ -59,7 +59,7 @@ -''' +""" def test_x_flows(odps): @@ -70,24 +70,31 @@ def test_x_flows(odps): def test_x_flow_instance_to_xml(odps): - xflow_name = 'pyodps_t_tmp_xflow_algo_name' - project = 'algo_project' - parameters = {'key': 'value'} - properties = {'odps.setting': 'value'} + xflow_name = "pyodps_t_tmp_xflow_algo_name" + project = "algo_project" + parameters = {"key": "value"} + properties = {"odps.setting": "value"} got_xml = odps.get_project(project).xflows._gen_xflow_instance_xml( - xflow_name=xflow_name, xflow_project=project, parameters=parameters, - properties=properties) + xflow_name=xflow_name, + xflow_project=project, + parameters=parameters, + properties=properties, + ) assert to_text(got_xml) == to_text(EXPECTED_XFLOW_INSTANCE_XML) got_xml = odps.get_project(project).xflows._gen_xflow_instance_xml( - xflow_name=xflow_name, xflow_project=project, parameters=parameters, - properties=properties, priority=1) + xflow_name=xflow_name, + xflow_project=project, + parameters=parameters, + properties=properties, + priority=1, + ) assert to_text(got_xml) == to_text(EXPECTED_PRIORITY_XFLOW_INSTANCE_XML) def test_run_x_flow_instance(odps): - xflow_name = 'test_xflow' + xflow_name = "test_xflow" if not odps.exist_xflow(xflow_name): return @@ -96,27 +103,31 @@ def test_run_x_flow_instance(odps): assert isinstance(xflow_results, dict) assert all( - map(lambda x: isinstance(x, XFlows.XFlowResult.XFlowAction), xflow_results.values())) + map( + lambda x: isinstance(x, XFlows.XFlowResult.XFlowAction), + xflow_results.values(), + ) + ) def test_iter_sub_instances(odps): - table = create_iris(odps, 
tn('test_iris_table')) - model_name = tn('test_xflow_model') + table = create_iris(odps, tn("test_iris_table")) + model_name = tn("test_xflow_model") try: odps.delete_offline_model(model_name, if_exists=True) except: pass try: xflow_inst = odps.run_xflow( - 'LogisticRegression', - 'algo_public', + "LogisticRegression", + "algo_public", dict( - featureColNames='sepal_length,sepal_width,petal_length,petal_width', - labelColName='category', + featureColNames="sepal_length,sepal_width,petal_length,petal_width", + labelColName="category", inputTableName=table.name, modelName=model_name, ), - hints={"settings": "{\"SKYNET_ID\": \"12345\"}"} + hints={"settings": "{\"SKYNET_ID\": \"12345\"}"}, ) sub_insts = dict() for k, v in odps.iter_xflow_sub_instances(xflow_inst, check=True): diff --git a/odps/models/volume_ext.py b/odps/models/volume_ext.py index 649005e2..9602f94d 100644 --- a/odps/models/volume_ext.py +++ b/odps/models/volume_ext.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,11 +18,11 @@ import requests -from . import cache_parent from .. import serializers -from ..compat import Enum +from ..compat import Enum, urlparse from ..errors import OSSSignUrlError -from .volume_fs import FSVolumeObject, FSVolumeObjects, FSVolume +from . import cache_parent +from .volume_fs import FSVolume, FSVolumeObject, FSVolumeObjects class SignUrlMethod(Enum): @@ -46,27 +46,35 @@ def _get_dir_cls(cls): def get_sign_url(self, method, seconds=None): if isinstance(method, SignUrlMethod): method = method.value - params = {'sign_url': method.lower()} + params = {"sign_url": method.lower()} if seconds: params["expire_seconds"] = seconds - headers = {'x-odps-volume-fs-path': self.path} + headers = {"x-odps-volume-fs-path": self.path} schema_name = self.volume._get_schema_name() if schema_name is not None: params["curr_schema"] = schema_name resp = self._client.get( - self.parent.resource(), action='meta', params=params, headers=headers + self.parent.resource(), action="meta", params=params, headers=headers ) self.parse(self._client, resp, obj=self) return self.sign_url def _request_sign_url(self, path, method, *args, **kw): if path: - path = self.path.rstrip('/') + '/' + path.lstrip('/') + path = self.path.rstrip("/") + "/" + path.lstrip("/") else: path = self.path - vol_rel_path = path[len(self.volume.name) + 1:] + vol_rel_path = path[len(self.volume.name) + 1 :] sign_url = self.volume.get_sign_url(vol_rel_path, method) + + replace_internal_host = kw.pop("replace_internal_host", False) + if replace_internal_host: + parsed_url = urlparse(sign_url) + if "-internal." in parsed_url.netloc: + new_netloc = parsed_url.netloc.replace("-internal.", ".") + sign_url = sign_url.replace(parsed_url.netloc, new_netloc) + if method == SignUrlMethod.PUT: resp = requests.put(sign_url, *args, **kw) else: @@ -93,8 +101,8 @@ def _delete(self, recursive=False): :param recursive: indicate whether a recursive deletion should be performed. 
""" - params = {'recursive': str(recursive).lower()} - headers = {'x-odps-volume-fs-path': self.path.rstrip("/")} + params = {"recursive": str(recursive).lower()} + headers = {"x-odps-volume-fs-path": self.path.rstrip("/")} self._del_cache(self.path) self._client.delete( self.parent.resource(), @@ -104,7 +112,7 @@ def _delete(self, recursive=False): ) @contextlib.contextmanager - def _open_reader(self, path): + def _open_reader(self, path, replace_internal_host=False): """ Open a volume file and read contents in it. @@ -115,7 +123,12 @@ def _open_reader(self, path): >>> with fs_dir.open_reader('file') as reader: >>> [print(line) for line in reader] """ - req = self._request_sign_url(path, SignUrlMethod.GET, stream=True) + req = self._request_sign_url( + path, + SignUrlMethod.GET, + stream=True, + replace_internal_host=replace_internal_host, + ) yield req.raw @contextlib.contextmanager @@ -134,9 +147,15 @@ def _open_writer(self, path, **kwargs): if kwargs.pop("replication", None) is not None: # pragma: no cover raise TypeError("External volume does not support replication argument") + replace_internal_host = kwargs.pop("replace_internal_host", False) def put_func(data): - self._request_sign_url(path, SignUrlMethod.PUT, data=data) + self._request_sign_url( + path, + SignUrlMethod.PUT, + data=data, + replace_internal_host=replace_internal_host, + ) rio = RequestsIO(put_func) try: @@ -158,7 +177,7 @@ def delete(self): """ return self._delete(False) - def open_reader(self): + def open_reader(self, replace_internal_host=False): """ Open current file and read contents in it. :return: file reader @@ -167,7 +186,7 @@ def open_reader(self): >>> with fs_file.open_reader('file') as reader: >>> [print(line) for line in reader] """ - return self._open_reader(None) + return self._open_reader(None, replace_internal_host=replace_internal_host) def open_writer(self, **kw): return self._open_writer(None, **kw) @@ -183,7 +202,7 @@ def __init__(self, **kw): def objects(self): return ExternalVolumeObjects(parent=self, client=self._client) - def create_dir(self, path): + def create_dir(self, path, replace_internal_host=False): """ Creates and returns a sub-directory under the current directory. :param str path: directory name to be created @@ -191,8 +210,15 @@ def create_dir(self, path): :rtype: :class:`odps.models.FSVolumeDir` """ path = path.strip("/") + "/" - resp = self._request_sign_url(path, SignUrlMethod.PUT, b"") - dir_object = type(self)(path=resp.volume_path, parent=self.parent, client=self._client) + resp = self._request_sign_url( + path, + SignUrlMethod.PUT, + b"", + replace_internal_host=replace_internal_host, + ) + dir_object = type(self)( + path=resp.volume_path, parent=self.parent, client=self._client + ) dir_object.reload() return dir_object @@ -213,7 +239,7 @@ def delete(self, recursive=False): """ self._delete(recursive=recursive) - def open_reader(self, path): + def open_reader(self, path, replace_internal_host=False): """ Open a volume file and read contents in it. 
@@ -224,7 +250,7 @@ def open_reader(self, path): >>> with fs_dir.open_reader('file') as reader: >>> [print(line) for line in reader] """ - return self._open_reader(path) + return self._open_reader(path, replace_internal_host=replace_internal_host) def open_writer(self, path, **kwargs): """ @@ -241,7 +267,7 @@ def open_writer(self, path, **kwargs): class ExternalVolumeObjects(FSVolumeObjects): - objects = serializers.XMLNodesReferencesField(ExternalVolumeObject, 'Item') + objects = serializers.XMLNodesReferencesField(ExternalVolumeObject, "Item") @classmethod def _get_single_object_cls(cls): diff --git a/odps/models/volume_fs.py b/odps/models/volume_fs.py index 07169cbb..93810299 100644 --- a/odps/models/volume_fs.py +++ b/odps/models/volume_fs.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,46 +14,46 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .. import serializers, errors, utils -from ..compat import six, long_type -from .core import LazyLoad, cache, Iterable +from .. import errors, serializers, utils +from ..compat import long_type, six from .cache import cache_parent +from .core import Iterable, LazyLoad, cache from .volumes import Volume class FSVolumeObject(LazyLoad): - __slots__ = '_volume_fs_tunnel', - _type_indicator = '_isdir' - _cache_name_arg = 'path' + __slots__ = ("_volume_fs_tunnel",) + _type_indicator = "_isdir" + _cache_name_arg = "path" class CreateRequestXML(serializers.XMLSerializableModel): - _root = 'Item' + _root = "Item" - type = serializers.XMLNodeField('Type') - path = serializers.XMLNodeField('Path') + type = serializers.XMLNodeField("Type") + path = serializers.XMLNodeField("Path") class UpdateRequestXML(serializers.XMLSerializableModel): - _root = 'Item' - - path = serializers.XMLNodeField('Path') - replication = serializers.XMLNodeField('Replication') - - _project = serializers.XMLNodeField('Project') - _volume = serializers.XMLNodeField('Volume') - path = serializers.XMLNodeField('Path') - _isdir = serializers.XMLNodeField('Isdir', type='bool') - permission = serializers.XMLNodeField('permission') - _replication = serializers.XMLNodeField('BlockReplications', parse_callback=int) - length = serializers.XMLNodeField('Length', parse_callback=long_type) - quota = serializers.XMLNodeField('Quota', parse_callback=long_type) - block_size = serializers.XMLNodeField('BlockSize', parse_callback=long_type) - owner = serializers.XMLNodeField('Owner') - group = serializers.XMLNodeField('Group') - creation_time = serializers.XMLNodeField('CreationTime', type='rfc822') - access_time = serializers.XMLNodeField('AccessTime', type='rfc822') - last_modified_time = serializers.XMLNodeField('ModificationTime', type='rfc822') - symlink = serializers.XMLNodeField('Symlink') - sign_url = serializers.XMLNodeField('URL') + _root = "Item" + + path = serializers.XMLNodeField("Path") + replication = serializers.XMLNodeField("Replication") + + _project = serializers.XMLNodeField("Project") + _volume = serializers.XMLNodeField("Volume") + path = serializers.XMLNodeField("Path") + _isdir = serializers.XMLNodeField("Isdir", type="bool") + permission = serializers.XMLNodeField("permission") + _replication = serializers.XMLNodeField("BlockReplications", parse_callback=int) 
+ length = serializers.XMLNodeField("Length", parse_callback=long_type) + quota = serializers.XMLNodeField("Quota", parse_callback=long_type) + block_size = serializers.XMLNodeField("BlockSize", parse_callback=long_type) + owner = serializers.XMLNodeField("Owner") + group = serializers.XMLNodeField("Group") + creation_time = serializers.XMLNodeField("CreationTime", type="rfc822") + access_time = serializers.XMLNodeField("AccessTime", type="rfc822") + last_modified_time = serializers.XMLNodeField("ModificationTime", type="rfc822") + symlink = serializers.XMLNodeField("Symlink") + sign_url = serializers.XMLNodeField("URL") @classmethod def _get_base_cls(cls): @@ -73,26 +73,35 @@ def _get_objects_cls(cls): @staticmethod def _filter_cache(_, **kwargs): - isdir = kwargs.get('_isdir') - return isdir is not None and isdir != 'UNKNOWN' + isdir = kwargs.get("_isdir") + return isdir is not None and isdir != "UNKNOWN" - @utils.experimental('Volume2 is still experimental. Usage in production environment is strongly opposed.') @cache def __new__(cls, *args, **kwargs): - isdir = kwargs.get('_isdir') + isdir = kwargs.get("_isdir") base_cls = cls._get_base_cls() if cls is not base_cls and issubclass(cls, base_cls): return object.__new__(cls) if isdir is not None: - if isdir == 'UNKNOWN': + if isdir == "UNKNOWN": return object.__new__(base_cls) return object.__new__(cls._get_dir_cls() if isdir else cls._get_file_cls()) - obj = base_cls(_isdir='UNKNOWN', **kwargs) + obj = base_cls(_isdir="UNKNOWN", **kwargs) obj.reload() return base_cls(**obj.extract()) + @utils.experimental( + "Volume2 is still experimental. Usage in production environment is strongly opposed.", + cond=lambda self, *_, **kw: ( + type(self) in (FSVolumeObject, FSVolumeDir, FSVolumeFile) + and type(kw.get("parent")) is FSVolume + ), + ) + def __init__(self, *args, **kwargs): + super(FSVolumeObject, self).__init__(*args, **kwargs) + def _name(self): return self.path @@ -100,7 +109,7 @@ def _set_state(self, name, parent, client): self.__init__(path=name, _parent=parent, _client=client) def split(self): - return self.path.rsplit('/', 1) + return self.path.rsplit("/", 1) @property def basename(self): @@ -118,7 +127,7 @@ def volume(self): @property def is_root(self): - return self.path == '/' + self.parent.name + return self.path == "/" + self.parent.name def reload(self): # check if the volume path is the root @@ -132,9 +141,9 @@ def reload(self): resp = self._client.get( self.parent.resource(), - action='meta', + action="meta", params=params, - headers={'x-odps-volume-fs-path': self.path}, + headers={"x-odps-volume-fs-path": self.path}, ) self.parse(self._client, resp, obj=self) @@ -142,33 +151,35 @@ def reload(self): @staticmethod def _normpath(path): - path = path.rstrip('/') + path = path.rstrip("/") i = 0 parts = [] start = 0 while i < len(path): - if path[i] == '/' or i == len(path) - 1: - chunk = path[start:i + 1] + if path[i] == "/" or i == len(path) - 1: + chunk = path[start : i + 1] start = i + 1 - if chunk in ['', '/', '.', './']: + if chunk in ["", "/", ".", "./"]: # do nothing pass - elif chunk in ['..', '../']: + elif chunk in ["..", "../"]: if len(parts): - parts = parts[:len(parts) - 1] + parts = parts[: len(parts) - 1] else: parts.append(chunk) else: parts.append(chunk) i += 1 - if path.startswith('/'): - return '/' + ''.join(parts) - return ''.join(parts) + if path.startswith("/"): + return "/" + "".join(parts) + return "".join(parts) def _del_cache(self, path): - root_objs = self._get_objects_cls()(parent=self.volume.root, 
client=self._client) - if not path.startswith('/'): - path = self.path + '/' + path.lstrip('/') + root_objs = self._get_objects_cls()( + parent=self.volume.root, client=self._client + ) + if not path.startswith("/"): + path = self.path + "/" + path.lstrip("/") del root_objs[path] def move(self, new_path, replication=None): @@ -178,18 +189,18 @@ def move(self, new_path, replication=None): :param new_path: target location of current file / directory :param replication: number of replication """ - if not new_path.startswith('/'): - new_path = self._normpath(self.dirname + '/' + new_path) + if not new_path.startswith("/"): + new_path = self._normpath(self.dirname + "/" + new_path) else: new_path = self._normpath(new_path) if new_path == self.path: - raise ValueError('New path should be different from the original one.') + raise ValueError("New path should be different from the original one.") update_def = self.UpdateRequestXML(path=new_path) if replication: update_def.replication = replication headers = { - 'Content-Type': 'application/xml', - 'x-odps-volume-fs-path': self.path, + "Content-Type": "application/xml", + "x-odps-volume-fs-path": self.path, } schema_name = self.parent._get_schema_name() @@ -199,7 +210,7 @@ def move(self, new_path, replication=None): self._client.put( self.parent.resource(), - action='meta', + action="meta", params=params, headers=headers, data=update_def.serialize(), @@ -239,6 +250,7 @@ class FSVolumeDir(FSVolumeObject): >>> # get a file/directory object >>> file_obj = fs_dir[file_name] """ + def __init__(self, **kw): super(FSVolumeDir, self).__init__(**kw) self._isdir = True @@ -247,16 +259,16 @@ def __init__(self, **kw): def objects(self): return self._get_objects_cls()(parent=self, client=self._client) - def create_dir(self, path): + def create_dir(self, path, **kw): """ Creates and returns a sub-directory under the current directory. :param str path: directory name to be created :return: directory object :rtype: :class:`odps.models.FSVolumeDir` """ - path = self.path + '/' + path.lstrip('/') - dir_def = self.CreateRequestXML(type='directory', path=path) - headers = {'Content-Type': 'application/xml'} + path = self.path + "/" + path.lstrip("/") + dir_def = self.CreateRequestXML(type="directory", path=path) + headers = {"Content-Type": "application/xml"} self._client.post( self.parent.resource(), headers=headers, @@ -283,8 +295,8 @@ def delete(self, recursive=False): :param recursive: indicate whether a recursive deletion should be performed. 
""" - params = {'recursive': recursive} - headers = {'x-odps-volume-fs-path': self.path} + params = {"recursive": recursive} + headers = {"x-odps-volume-fs-path": self.path} self._del_cache(self.path) self._client.delete( self.parent.resource(), @@ -309,10 +321,14 @@ def open_reader(self, path, **kw): >>> with fs_dir.open_reader('file') as reader: >>> [print(line) for line in reader] """ - endpoint = kw.pop('endpoint', None) - quota_name = kw.pop('quota_name', None) + endpoint = kw.pop("endpoint", None) + quota_name = kw.pop("quota_name", None) tunnel = self._create_volume_fs_tunnel(endpoint=endpoint, quota_name=quota_name) - path = self.path.lstrip('/')[len(self.parent.name):].lstrip('/') + '/' + path.lstrip('/') + path = ( + self.path.lstrip("/")[len(self.parent.name) :].lstrip("/") + + "/" + + path.lstrip("/") + ) return tunnel.open_reader(self.parent, path, **kw) def open_writer(self, path, replication=None, **kw): @@ -329,10 +345,14 @@ def open_writer(self, path, replication=None, **kw): >>> with fs_dir.open_writer('file') as reader: >>> writer.write('some content') """ - endpoint = kw.pop('endpoint', None) - quota_name = kw.pop('quota_name', None) + endpoint = kw.pop("endpoint", None) + quota_name = kw.pop("quota_name", None) tunnel = self._create_volume_fs_tunnel(endpoint=endpoint, quota_name=quota_name) - vol_path = self.path.lstrip('/')[len(self.parent.name):].lstrip('/') + '/' + path.lstrip('/') + vol_path = ( + self.path.lstrip("/")[len(self.parent.name) :].lstrip("/") + + "/" + + path.lstrip("/") + ) return tunnel.open_writer(self.parent, vol_path, replication=replication, **kw) @@ -353,8 +373,8 @@ def replication(self): def replication(self, value): update_def = self.UpdateRequestXML(replication=value) headers = { - 'Content-Type': 'application/xml', - 'x-odps-volume-fs-path': self.path, + "Content-Type": "application/xml", + "x-odps-volume-fs-path": self.path, } schema_name = self.parent._get_schema_name() @@ -364,7 +384,7 @@ def replication(self, value): self._client.put( self.parent.resource(), - action='meta', + action="meta", params=params, headers=headers, data=update_def.serialize(), @@ -375,8 +395,8 @@ def delete(self, **_): """ Delete current file. 
""" - params = {'recursive': False} - headers = {'x-odps-volume-fs-path': self.path} + params = {"recursive": False} + headers = {"x-odps-volume-fs-path": self.path} schema_name = self.parent._get_schema_name() if schema_name is not None: @@ -400,24 +420,24 @@ def open_reader(self, **kw): >>> with fs_file.open_reader('file') as reader: >>> [print(line) for line in reader] """ - endpoint = kw.pop('endpoint', None) - quota_name = kw.pop('quota_name', None) + endpoint = kw.pop("endpoint", None) + quota_name = kw.pop("quota_name", None) tunnel = self._create_volume_fs_tunnel(endpoint=endpoint, quota_name=quota_name) - path = self.path.lstrip('/')[len(self.parent.name):].lstrip('/') + path = self.path.lstrip("/")[len(self.parent.name) :].lstrip("/") return tunnel.open_reader(self.parent, path, **kw) def open_writer(self, replication=None, **kw): - endpoint = kw.pop('endpoint', None) - quota_name = kw.pop('quota_name', None) + endpoint = kw.pop("endpoint", None) + quota_name = kw.pop("quota_name", None) tunnel = self._create_volume_fs_tunnel(endpoint=endpoint, quota_name=quota_name) - path = self.path.lstrip('/')[len(self.parent.name):].lstrip('/') + path = self.path.lstrip("/")[len(self.parent.name) :].lstrip("/") return tunnel.open_writer(self.parent, path, replication=replication, **kw) class FSVolumeObjects(Iterable): - marker = serializers.XMLNodeField('Marker') - max_items = serializers.XMLNodeField('MaxItems', parse_callback=int) - objects = serializers.XMLNodesReferencesField(FSVolumeObject, 'Item') + marker = serializers.XMLNodeField("Marker") + max_items = serializers.XMLNodeField("MaxItems", parse_callback=int) + objects = serializers.XMLNodesReferencesField(FSVolumeObject, "Item") @property def project(self): @@ -434,8 +454,10 @@ def _get_single_object_cls(cls): def _get(self, name): path = name if not path.startswith(self.parent.path): - path = self.parent.path + '/' + name.lstrip('/') - return self._get_single_object_cls()(client=self._client, parent=self.volume, path=path) + path = self.parent.path + "/" + name.lstrip("/") + return self._get_single_object_cls()( + client=self._client, parent=self.volume, path=path + ) def __contains__(self, item): if isinstance(item, six.string_types): @@ -458,24 +480,23 @@ def __iter__(self): return self.iterate() def iterate(self): - params = {'expectmarker': 'true'} - headers = {'x-odps-volume-fs-path': self.parent.path} + params = {"expectmarker": "true"} + headers = {"x-odps-volume-fs-path": self.parent.path} schema_name = self.volume._get_schema_name() if schema_name is not None: params["curr_schema"] = schema_name def _it(): - last_marker = params.get('marker') - if 'marker' in params and \ - (last_marker is None or len(last_marker) == 0): + last_marker = params.get("marker") + if "marker" in params and (last_marker is None or len(last_marker) == 0): return url = self.volume.resource() resp = self._client.get(url, params=params, headers=headers) r = type(self).parse(self._client, resp, obj=self, parent=self.volume) - params['marker'] = r.marker + params["marker"] = r.marker return r.objects @@ -502,18 +523,19 @@ class FSVolume(Volume): >>> # get a file/directory object >>> file_obj = fs_volume[file_name] """ - __slots__ = '_root_dir', + + __slots__ = ("_root_dir",) _dir_cls = FSVolumeDir - def create_dir(self, path): + def create_dir(self, path, **kw): """ Creates and returns a directory under the current volume. 
:param str path: directory name to be created :return: directory object :rtype: :class:`odps.models.FSVolumeDir` """ - return self.root.create_dir(path) + return self.root.create_dir(path, **kw) def __contains__(self, item): return item in self.root @@ -529,7 +551,7 @@ def __getitem__(self, item): @property def path(self): - return '/' + self.name + return "/" + self.name @property def location(self): @@ -584,7 +606,9 @@ def delete(self, path, recursive=False): @property def root(self): if not self._root_dir: - self._root_dir = self._dir_cls(path='/' + self.name, parent=self, client=self._client) + self._root_dir = self._dir_cls( + path="/" + self.name, parent=self, client=self._client + ) return self._root_dir diff --git a/odps/models/volume_parted.py b/odps/models/volume_parted.py index d6139701..77cbca04 100644 --- a/odps/models/volume_parted.py +++ b/odps/models/volume_parted.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,17 +14,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .core import LazyLoad, Iterable, XMLRemoteModel -from .. import serializers, errors, utils -from ..compat import six, Enum -from .volumes import Volume +from .. import errors, serializers, utils +from ..compat import Enum, six from .cache import cache_parent +from .core import Iterable, LazyLoad, XMLRemoteModel +from .volumes import Volume class VolumeFile(serializers.XMLSerializableModel): - _root = 'VolumeFileModel' + _root = "VolumeFileModel" - name = serializers.XMLNodeField('Name') + name = serializers.XMLNodeField("Name") @property def partition(self): @@ -43,12 +43,22 @@ def open_reader(self, **kw): @property def path(self): - return '/'.join((self.project.name, 'volumes', self.volume.name, self.partition.name, self.name)) + return "/".join( + ( + self.project.name, + "volumes", + self.volume.name, + self.partition.name, + self.name, + ) + ) class VolumePartitionMeta(XMLRemoteModel): - length = serializers.XMLNodeField('Length', parse_callback=int, set_to_parent=True) - file_number = serializers.XMLNodeField('FileNumber', parse_callback=int, set_to_parent=True) + length = serializers.XMLNodeField("Length", parse_callback=int, set_to_parent=True) + file_number = serializers.XMLNodeField( + "FileNumber", parse_callback=int, set_to_parent=True + ) def load(self): url = self._parent.resource() @@ -58,7 +68,7 @@ def load(self): if schema_name is not None: params["curr_schema"] = schema_name - resp = self._client.get(url, action='meta', params=params) + resp = self._client.get(url, action="meta", params=params) self.parse(self._client, resp, obj=self) @@ -66,26 +76,32 @@ class VolumePartition(LazyLoad): """ Represents a partition in a volume. 
""" - __slots__ = '_volume_tunnel', '_id_thread_local', \ - 'length', 'file_number' # meta - - class Type(Enum): - NEW = 'NEW' - OLD = 'OLD' - - _root = 'Meta' - _type_indicator = 'type' - name = serializers.XMLNodeField('Name') - owner = serializers.XMLNodeField('Owner') - type = serializers.XMLNodeField('Type', parse_callback=lambda t: VolumePartition.Type(t.upper())) - comment = serializers.XMLNodeField('Comment') - creation_time = serializers.XMLNodeField('CreationTime', parse_callback=utils.parse_rfc822) - last_modified_time = serializers.XMLNodeField('LastModifiedTime', parse_callback=utils.parse_rfc822) - meta = serializers.XMLNodeReferenceField(VolumePartitionMeta, 'Meta') + __slots__ = "_volume_tunnel", "_id_thread_local", "length", "file_number" # meta - _download_id = utils.thread_local_attribute('_id_thread_local', lambda: None) - _upload_id = utils.thread_local_attribute('_id_thread_local', lambda: None) + class Type(Enum): + NEW = "NEW" + OLD = "OLD" + + _root = "Meta" + _type_indicator = "type" + + name = serializers.XMLNodeField("Name") + owner = serializers.XMLNodeField("Owner") + type = serializers.XMLNodeField( + "Type", parse_callback=lambda t: VolumePartition.Type(t.upper()) + ) + comment = serializers.XMLNodeField("Comment") + creation_time = serializers.XMLNodeField( + "CreationTime", parse_callback=utils.parse_rfc822 + ) + last_modified_time = serializers.XMLNodeField( + "LastModifiedTime", parse_callback=utils.parse_rfc822 + ) + meta = serializers.XMLNodeReferenceField(VolumePartitionMeta, "Meta") + + _download_id = utils.thread_local_attribute("_id_thread_local", lambda: None) + _upload_id = utils.thread_local_attribute("_id_thread_local", lambda: None) def __init__(self, **kwargs): super(VolumePartition, self).__init__(**kwargs) @@ -109,19 +125,19 @@ def volume(self): def __getattribute__(self, attr): val = object.__getattribute__(self, attr) - if val is None and attr in getattr(VolumePartitionMeta, '__fields'): + if val is None and attr in getattr(VolumePartitionMeta, "__fields"): self.meta.load() return object.__getattribute__(self, attr) return super(VolumePartition, self).__getattribute__(attr) class VolumeFiles(serializers.XMLSerializableModel): - _root = 'Items' + _root = "Items" skip_null = False - marker = serializers.XMLNodeField('Marker') - files = serializers.XMLNodesReferencesField(VolumeFile, 'Item') - max_items = serializers.XMLNodeField('MaxItems', parse_callback=int) + marker = serializers.XMLNodeField("Marker") + files = serializers.XMLNodesReferencesField(VolumeFile, "Item") + max_items = serializers.XMLNodeField("MaxItems", parse_callback=int) def __getitem__(self, item): for f in self.iterate(name=item): @@ -143,27 +159,28 @@ def iterate(self, name=None, max_items=None): :param name: the prefix of volume name name :return: """ - params = {'expectmarker': 'true', 'path': ''} + params = {"expectmarker": "true", "path": ""} if name is not None: - params['name'] = name + params["name"] = name if max_items is not None: - params['maxitems'] = max_items + params["maxitems"] = max_items schema_name = self.parent._get_schema_name() if schema_name is not None: params["curr_schema"] = schema_name def _it(): - last_marker = params.get('marker') - if 'marker' in params and \ - (last_marker is None or len(last_marker) == 0): + last_marker = params.get("marker") + if "marker" in params and ( + last_marker is None or len(last_marker) == 0 + ): return url = self.parent.resource() resp = self.parent._client.get(url, params=params) v = self.parse(resp, obj=self, 
parent=self.parent) - params['marker'] = v.marker + params["marker"] = v.marker return v.files @@ -202,7 +219,14 @@ def _create_volume_tunnel(self, endpoint=None, quota_name=None): return self._volume_tunnel def open_reader( - self, file_name, reopen=False, endpoint=None, start=None, length=None, quota_name=None, **kwargs + self, + file_name, + reopen=False, + endpoint=None, + start=None, + length=None, + quota_name=None, + **kwargs ): """ Open a volume file for read. A file-like object will be returned which can be used to read contents from @@ -224,16 +248,19 @@ def open_reader( tunnel = self._create_volume_tunnel(endpoint=endpoint, quota_name=quota_name) download_id = self._download_id if reopen else None download_session = tunnel.create_download_session( - volume=self.volume.name, partition_spec=self.name, - file_name=file_name, download_id=download_id, **kwargs + volume=self.volume.name, + partition_spec=self.name, + file_name=file_name, + download_id=download_id, + **kwargs ) self._download_id = download_session.id open_args = {} if start is not None: - open_args['start'] = start + open_args["start"] = start if length is not None: - open_args['length'] = length + open_args["length"] = length return download_session.open(**open_args) def open_writer(self, reopen=False, endpoint=None, quota_name=None, **kwargs): @@ -254,8 +281,10 @@ def open_writer(self, reopen=False, endpoint=None, quota_name=None, **kwargs): tunnel = self._create_volume_tunnel(endpoint=endpoint, quota_name=quota_name) upload_id = self._upload_id if reopen else None upload_session = tunnel.create_upload_session( - volume=self.volume.name, partition_spec=self.name, - upload_id=upload_id, **kwargs + volume=self.volume.name, + partition_spec=self.name, + upload_id=upload_id, + **kwargs ) self._upload_id = upload_session.id file_dict = dict() @@ -299,14 +328,15 @@ class PartedVolume(Volume): """ PartedVolume represents the old-fashioned partitioned volume in ODPS. 
""" + class Partitions(Iterable): - _root = 'Volume' + _root = "Volume" - marker = serializers.XMLNodeField('Marker') + marker = serializers.XMLNodeField("Marker") partitions = serializers.XMLNodesReferencesField( - VolumePartition, 'Partitions', 'Partition' + VolumePartition, "Partitions", "Partition" ) - max_items = serializers.XMLNodeField('MaxItems', parse_callback=int) + max_items = serializers.XMLNodeField("MaxItems", parse_callback=int) def _get(self, item): return VolumePartition(client=self._client, parent=self.parent, name=item) @@ -334,27 +364,28 @@ def iterate(self, name=None, owner=None): :param owner: :return: """ - params = {'expectmarker': 'true'} + params = {"expectmarker": "true"} if name is not None: - params['name'] = name + params["name"] = name if owner is not None: - params['owner'] = owner + params["owner"] = owner schema_name = self.parent._get_schema_name() if schema_name is not None: params["curr_schema"] = schema_name def _it(): - last_marker = params.get('marker') - if 'marker' in params and \ - (last_marker is None or len(last_marker) == 0): + last_marker = params.get("marker") + if "marker" in params and ( + last_marker is None or len(last_marker) == 0 + ): return url = self.parent.resource() resp = self._client.get(url, params=params) v = PartedVolume.Partitions.parse(self._client, resp, obj=self) - params['marker'] = v.marker + params["marker"] = v.marker return v.partitions @@ -414,7 +445,9 @@ def delete_partition(self, name): """ return self.partitions.delete(name) - def open_reader(self, partition, file_name, endpoint=None, start=None, length=None, **kwargs): + def open_reader( + self, partition, file_name, endpoint=None, start=None, length=None, **kwargs + ): """ Open a volume file for read. A file-like object will be returned which can be used to read contents from volume files. @@ -431,8 +464,9 @@ def open_reader(self, partition, file_name, endpoint=None, start=None, length=No >>> with volume.open_reader('part', 'file') as reader: >>> [print(line) for line in reader] """ - return self.partitions[partition].open_reader(file_name, endpoint=endpoint, start=start, length=length, - **kwargs) + return self.partitions[partition].open_reader( + file_name, endpoint=endpoint, start=start, length=length, **kwargs + ) def open_writer(self, partition, endpoint=None, **kwargs): """ diff --git a/odps/models/volumes.py b/odps/models/volumes.py index 462cf4ec..fed305fa 100644 --- a/odps/models/volumes.py +++ b/odps/models/volumes.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,8 +17,8 @@ import json import warnings -from .. import serializers, errors, utils -from ..compat import six, Enum +from .. 
import errors, serializers, utils +from ..compat import Enum, six from ..errors import InternalServerError from .cache import cache from .core import Iterable, LazyLoad @@ -33,26 +33,32 @@ class Volume(LazyLoad): EXTERNAL_VOLUME_ROLEARN_KEY = "odps.properties.rolearn" class Type(Enum): - NEW = 'NEW' - OLD = 'OLD' - EXTERNAL = 'EXTERNAL' - UNKNOWN = 'UNKNOWN' + NEW = "NEW" + OLD = "OLD" + EXTERNAL = "EXTERNAL" + UNKNOWN = "UNKNOWN" - _root = 'Meta' - _type_indicator = 'type' + _root = "Meta" + _type_indicator = "type" - name = serializers.XMLNodeField('Name') - owner = serializers.XMLNodeField('Owner') - comment = serializers.XMLNodeField('Comment') + name = serializers.XMLNodeField("Name") + owner = serializers.XMLNodeField("Owner") + comment = serializers.XMLNodeField("Comment") type = serializers.XMLNodeField( - 'Type', parse_callback=lambda t: Volume.Type(t.upper()), serialize_callback=lambda t: t.value + "Type", + parse_callback=lambda t: Volume.Type(t.upper()), + serialize_callback=lambda t: t.value, + ) + length = serializers.XMLNodeField("Length", parse_callback=int) + file_number = serializers.XMLNodeField("FileNumber", parse_callback=int) + creation_time = serializers.XMLNodeField( + "CreationTime", parse_callback=utils.parse_rfc822 + ) + last_modified_time = serializers.XMLNodeField( + "LastModifiedTime", parse_callback=utils.parse_rfc822 ) - length = serializers.XMLNodeField('Length', parse_callback=int) - file_number = serializers.XMLNodeField('FileNumber', parse_callback=int) - creation_time = serializers.XMLNodeField('CreationTime', parse_callback=utils.parse_rfc822) - last_modified_time = serializers.XMLNodeField('LastModifiedTime', parse_callback=utils.parse_rfc822) properties = serializers.XMLNodeField( - 'Properties', parse_callback=json.loads, serialize_callback=json.dumps + "Properties", parse_callback=json.loads, serialize_callback=json.dumps ) @classmethod @@ -64,27 +70,30 @@ def _get_cls(cls, typo): if typo == Volume.Type.OLD: from . import PartedVolume + return PartedVolume elif typo == Volume.Type.NEW: from . import FSVolume + return FSVolume elif typo == Volume.Type.EXTERNAL: from . 
import ExternalVolume + return ExternalVolume elif typo == Volume.Type.UNKNOWN: return Volume @staticmethod def _filter_cache(_, **kwargs): - return kwargs.get('type') is not None and kwargs['type'] != Volume.Type.UNKNOWN + return kwargs.get("type") is not None and kwargs["type"] != Volume.Type.UNKNOWN @cache def __new__(cls, *args, **kwargs): - typo = kwargs.get('type') + typo = kwargs.get("type") if typo is not None or (cls != Volume and issubclass(cls, Volume)): return object.__new__(cls._get_cls(typo)) - kwargs['type'] = Volume.Type.UNKNOWN + kwargs["type"] = Volume.Type.UNKNOWN obj = Volume(**kwargs) try: obj.reload() @@ -96,7 +105,7 @@ def __new__(cls, *args, **kwargs): return obj def __init__(self, **kwargs): - typo = kwargs.get('type') + typo = kwargs.get("type") properties = kwargs.get("properties") or {} location = kwargs.pop("location", None) @@ -109,25 +118,26 @@ def __init__(self, **kwargs): kwargs["properties"] = properties if isinstance(typo, six.string_types): - kwargs['type'] = Volume.Type(typo.upper()) + kwargs["type"] = Volume.Type(typo.upper()) super(Volume, self).__init__(**kwargs) def reload(self): params = {} schema_name = self._get_schema_name() if schema_name is not None: - params['curr_schema'] = schema_name - resp = self._client.get(self.resource(), action='meta', params=params) + params["curr_schema"] = schema_name + resp = self._client.get(self.resource(), action="meta", params=params) self.parse(self._client, resp, obj=self) - def drop(self): - return self.parent.delete(self) + def drop(self, auto_remove_dir=False, recursive=False): + return self.parent.delete( + self, auto_remove_dir=auto_remove_dir, recursive=recursive + ) class Volumes(Iterable): - - marker = serializers.XMLNodeField('Marker') - volumes = serializers.XMLNodesReferencesField(Volume, 'Volume') + marker = serializers.XMLNodeField("Marker") + volumes = serializers.XMLNodesReferencesField(Volume, "Volume") def _get(self, item): return Volume(client=self._client, parent=self, name=item) @@ -160,25 +170,24 @@ def iterate(self, name=None, owner=None): :return: """ schema_name = self._get_schema_name() - params = {'expectmarker': 'true'} + params = {"expectmarker": "true"} if name is not None: - params['name'] = name + params["name"] = name if owner is not None: - params['owner'] = owner + params["owner"] = owner if schema_name is not None: - params['curr_schema'] = schema_name + params["curr_schema"] = schema_name def _it(): - last_marker = params.get('marker') - if 'marker' in params and \ - (last_marker is None or len(last_marker) == 0): + last_marker = params.get("marker") + if "marker" in params and (last_marker is None or len(last_marker) == 0): return url = self.resource() resp = self._client.get(url, params=params) v = Volumes.parse(self._client, resp, obj=self) - params['marker'] = v.marker + params["marker"] = v.marker return v.volumes @@ -190,30 +199,40 @@ def _it(): yield volume def _create(self, obj=None, **kwargs): + auto_create_dir = kwargs.pop("auto_create_dir", False) + volume = obj or Volume(parent=self, client=self._client, **kwargs) if volume.parent is None: volume._parent = self if volume._client is None: volume._client = self._client - headers = {'Content-Type': 'application/xml'} + headers = {"Content-Type": "application/xml"} data = volume.serialize() + params = {} + if auto_create_dir: + params["autoMkDir"] = "" + self._client.post( - self.resource(), data, headers=headers, curr_schema=self._get_schema_name() + self.resource(), + data, + headers=headers, + params=params, + 
curr_schema=self._get_schema_name(), ) return self[volume.name] def create_parted(self, obj=None, **kwargs): - return self._create(obj=obj, type='old', **kwargs) + return self._create(obj=obj, type="old", **kwargs) def create_fs(self, obj=None, **kwargs): - return self._create(obj=obj, type='new', **kwargs) + return self._create(obj=obj, type="new", **kwargs) def create_external(self, obj=None, **kwargs): - return self._create(obj=obj, type='external', **kwargs) + return self._create(obj=obj, type="external", **kwargs) - def delete(self, name): + def delete(self, name, auto_remove_dir=False, recursive=False): if not isinstance(name, Volume): volume = Volume(name=name, parent=self, client=self._client) else: @@ -221,6 +240,12 @@ def delete(self, name): name = name.name del self[name] # release cache + params = {} + if auto_remove_dir: + params["autoRmDir"] = "" + if recursive: + params["recursive"] = "true" + url = volume.resource() - self._client.delete(url, curr_schema=self._get_schema_name()) + self._client.delete(url, params=params, curr_schema=self._get_schema_name()) diff --git a/odps/models/worker.py b/odps/models/worker.py index d05c5d8b..486b0833 100644 --- a/odps/models/worker.py +++ b/odps/models/worker.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,21 +14,24 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ..compat import six from .. import serializers +from ..compat import six LOG_TYPES_MAPPING = { - 'hs_err_log': 'hs_err_*.log', - 'coreinfo': 'coreinfo.tmp', + "hs_err_log": "hs_err_*.log", + "coreinfo": "coreinfo.tmp", } -LOG_TYPES_MAPPING.update({k: k for k in 'stdout stderr waterfall_summary jstack pstack'.split()}) +LOG_TYPES_MAPPING.update( + {k: k for k in "stdout stderr waterfall_summary jstack pstack".split()} +) class Worker(serializers.JSONSerializableModel): """ Worker information class for worker information and log retrieval. 
""" - __slots__ = '_client', + + __slots__ = ("_client",) @classmethod def extract_from_json(cls, json_obj, client=None, parent=None): @@ -43,21 +46,24 @@ def get_log(self, log_type, size=0): :return: log content """ return self.parent.get_worker_log(self.log_id, log_type, size=size) - get_log.__doc__ = get_log.__doc__.format(log_types=', '.join(sorted(six.iterkeys(LOG_TYPES_MAPPING)))) + + get_log.__doc__ = get_log.__doc__.format( + log_types=", ".join(sorted(six.iterkeys(LOG_TYPES_MAPPING))) + ) class WorkerDetail2(Worker): - id = serializers.JSONNodeField('id') - log_id = serializers.JSONNodeField('logId') - type = serializers.JSONNodeField('type') - start_time = serializers.JSONNodeField('startTime', parse_callback=int) - end_time = serializers.JSONNodeField('endTime', parse_callback=int) - status = serializers.JSONNodeField('status') - gbi_counter = serializers.JSONNodeField('gblCounter') - input_bytes = serializers.JSONNodeField('input_bytes', parse_callback=int) - input_records = serializers.JSONNodeField('input_records', parse_callback=int) - output_bytes = serializers.JSONNodeField('output_bytes', parse_callback=int) - output_records = serializers.JSONNodeField('output_records', parse_callback=int) + id = serializers.JSONNodeField("id") + log_id = serializers.JSONNodeField("logId") + type = serializers.JSONNodeField("type") + start_time = serializers.JSONNodeField("startTime", parse_callback=int) + end_time = serializers.JSONNodeField("endTime", parse_callback=int) + status = serializers.JSONNodeField("status") + gbi_counter = serializers.JSONNodeField("gblCounter") + input_bytes = serializers.JSONNodeField("input_bytes", parse_callback=int) + input_records = serializers.JSONNodeField("input_records", parse_callback=int) + output_bytes = serializers.JSONNodeField("output_bytes", parse_callback=int) + output_records = serializers.JSONNodeField("output_records", parse_callback=int) @classmethod def extract_from_json(cls, json_obj, client=None, parent=None): @@ -68,9 +74,9 @@ def _extract(o): for v in o: _extract(v) elif isinstance(o, dict): - worker_type = o.get('name', '') - if 'instances' in o: - for v in o['instances']: + worker_type = o.get("name", "") + if "instances" in o: + for v in o["instances"]: w = cls.parse(v, parent=parent) w.type = worker_type w._client = client diff --git a/odps/models/xflow.py b/odps/models/xflow.py index 1ae45d50..bf76a622 100644 --- a/odps/models/xflow.py +++ b/odps/models/xflow.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,8 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .core import LazyLoad from .. import serializers, utils +from .core import LazyLoad class XFlow(LazyLoad): diff --git a/odps/models/xflows.py b/odps/models/xflows.py index 38dc3851..43c13446 100644 --- a/odps/models/xflows.py +++ b/odps/models/xflows.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,18 +17,17 @@ import time from collections import OrderedDict +from .. 
import compat, errors, options, serializers +from ..compat import six from .core import Iterable, XMLRemoteModel -from .xflow import XFlow from .instance import Instance -from .. import serializers, errors, compat, options -from ..compat import six +from .xflow import XFlow class XFlows(Iterable): - - marker = serializers.XMLNodeField('Marker') - max_items = serializers.XMLNodeField('MaxItems') - xflows = serializers.XMLNodesReferencesField(XFlow, 'odpsalgo') + marker = serializers.XMLNodeField("Marker") + max_items = serializers.XMLNodeField("MaxItems") + xflows = serializers.XMLNodesReferencesField(XFlow, "odpsalgo") def _get(self, name): return XFlow(client=self._client, parent=self, name=name) @@ -53,19 +52,18 @@ def __iter__(self): def iterate(self, owner=None): params = dict() if owner is not None: - params['owner'] = owner + params["owner"] = owner def _it(): - last_marker = params.get('marker') - if 'marker' in params and \ - (last_marker is None or len(last_marker) == 0): + last_marker = params.get("marker") + if "marker" in params and (last_marker is None or len(last_marker) == 0): return url = self.resource() resp = self._client.get(url, params=params) inst = XFlows.parse(self._client, resp, obj=self) - params['marker'] = inst.marker + params["marker"] = inst.marker return inst.xflows @@ -78,7 +76,7 @@ def _it(): def create(self, xml_source): url = self.resource() - headers = {'Content-Type': 'application/xml'} + headers = {"Content-Type": "application/xml"} self._client.post(url, xml_source, headers=headers) def delete(self, name): @@ -94,32 +92,40 @@ def delete(self, name): def update(self, xflow): url = xflow.resource() - headers = {'Content-Type': 'application/xml'} + headers = {"Content-Type": "application/xml"} self._client.put(url, xflow.xml_source, headers) xflow.reload() return xflow class XFlowInstance(XMLRemoteModel): - __slots__ = 'xflow_project', 'xflow_name', 'parameters', 'priority', 'properties' - _root = 'XflowInstance' + __slots__ = ( + "xflow_project", + "xflow_name", + "parameters", + "priority", + "properties", + ) + _root = "XflowInstance" - xflow_project = serializers.XMLNodeField('Project') - xflow_name = serializers.XMLNodeField('Xflow') + xflow_project = serializers.XMLNodeField("Project") + xflow_name = serializers.XMLNodeField("Xflow") parameters = serializers.XMLNodePropertiesField( - 'Parameters', 'Parameter', key_tag='Key', value_tag='Value', required=True + "Parameters", "Parameter", key_tag="Key", value_tag="Value", required=True ) priority = serializers.XMLNodeField( - 'Priority', parse_callback=int, serialize_callback=int + "Priority", parse_callback=int, serialize_callback=int ) properties = serializers.XMLNodePropertiesField( - 'Config', 'Property', key_tag='Name', value_tag='Value' + "Config", "Property", key_tag="Name", value_tag="Value" ) class AnonymousSubmitXFlowInstance(XMLRemoteModel): - _root = 'Instance' + _root = "Instance" - instance = serializers.XMLNodeReferenceField('XFlows.XFlowInstance', 'XflowInstance') + instance = serializers.XMLNodeReferenceField( + "XFlows.XFlowInstance", "XflowInstance" + ) @staticmethod def _gen_xflow_instance_xml(xflow_instance=None, **kw): @@ -129,48 +135,55 @@ def _gen_xflow_instance_xml(xflow_instance=None, **kw): inst = XFlows.AnonymousSubmitXFlowInstance(instance=xflow_instance) return inst.serialize() - def run_xflow(self, xflow_instance=None, project=None, hints=None, parameters=None, **kw): + def run_xflow( + self, xflow_instance=None, project=None, hints=None, parameters=None, **kw + ): 
project = project or self.parent hints = hints or {} if options.ml.xflow_settings: hints.update(options.ml.xflow_settings) if hints: - kw['properties'] = hints + kw["properties"] = hints if options.biz_id: - if kw.get('properties') is None: - kw['properties'] = OrderedDict() - kw['properties']['biz_id'] = str(options.biz_id) + if kw.get("properties") is None: + kw["properties"] = OrderedDict() + kw["properties"]["biz_id"] = str(options.biz_id) if parameters: new_params = OrderedDict() for k, v in six.iteritems(parameters): - if k == 'modelName' and '/' not in v: - new_params[k] = '%s/offlinemodels/%s' % (project.name, v) - elif k in ('inputTableName', 'outputTableName') and '.' not in v: - new_params[k] = '%s.%s' % (project.name, v) + if k == "modelName" and "/" not in v: + new_params[k] = "%s/offlinemodels/%s" % (project.name, v) + elif k in ("inputTableName", "outputTableName") and "." not in v: + new_params[k] = "%s.%s" % (project.name, v) else: new_params[k] = v parameters = new_params return project.instances.create( - xml=self._gen_xflow_instance_xml(xflow_instance=xflow_instance, parameters=parameters, **kw)) + xml=self._gen_xflow_instance_xml( + xflow_instance=xflow_instance, parameters=parameters, **kw + ) + ) class XFlowResult(XMLRemoteModel): class XFlowAction(XMLRemoteModel): - node_type = serializers.XMLNodeAttributeField('.', attr='NodeType') - instance_id = serializers.XMLNodeField('InstanceId') - name = serializers.XMLNodeField('Name') - result = serializers.XMLNodeReferenceField(Instance.InstanceResult, 'Result') + node_type = serializers.XMLNodeAttributeField(".", attr="NodeType") + instance_id = serializers.XMLNodeField("InstanceId") + name = serializers.XMLNodeField("Name") + result = serializers.XMLNodeReferenceField( + Instance.InstanceResult, "Result" + ) - actions = serializers.XMLNodesReferencesField(XFlowAction, 'Actions', 'Action') + actions = serializers.XMLNodesReferencesField(XFlowAction, "Actions", "Action") def get_xflow_results(self, instance): url = instance.resource() - resp = self._client.get(url, action='xresult') + resp = self._client.get(url, action="xresult") xflow_result = XFlows.XFlowResult.parse(self._client, resp) return {action.name: action for action in xflow_result.actions} def get_xflow_source(self, instance): - return self._client.get(instance.resource(), action='xsource').content + return self._client.get(instance.resource(), action="xsource").content def get_xflow_instance(self, instance): content = self.get_xflow_source(instance) @@ -182,11 +195,13 @@ def get_xflow_instance(self, instance): def get_xflow_sub_instances(self, instance): inst_dict = OrderedDict() - for x_result in filter(lambda xr: xr.node_type != 'Local', - six.itervalues(self.get_xflow_results(instance))): - if x_result.node_type == 'Instance': + for x_result in filter( + lambda xr: xr.node_type != "Local", + six.itervalues(self.get_xflow_results(instance)), + ): + if x_result.node_type == "Instance": inst_dict[x_result.name] = self.parent.instances[x_result.instance_id] - elif x_result.node_type == 'SubWorkflow': + elif x_result.node_type == "SubWorkflow": sub_instance = self.parent.instances[x_result.instance_id] sub_inst_dict = self.get_xflow_sub_instances(sub_instance) inst_dict.update(**sub_inst_dict) diff --git a/odps/readers.py b/odps/readers.py index e77992a5..c172d7d3 100644 --- a/odps/readers.py +++ b/odps/readers.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. 
+# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,21 +14,20 @@ # See the License for the specific language governing permissions and # limitations under the License. -import csv import copy +import csv import itertools import math from collections import OrderedDict from requests import Response -from . import types, compat, utils, options +from . import compat, options, types, utils +from .compat import StringIO, six from .models.record import Record -from .compat import six, StringIO class AbstractRecordReader(object): - def __iter__(self): return self @@ -55,7 +54,7 @@ def _get_slice(cls, item): end = item.stop step = item.step or 1 else: - raise ValueError('Reader only supports index and slice operation.') + raise ValueError("Reader only supports index and slice operation.") return start, end, step @@ -64,14 +63,14 @@ def __getitem__(self, item): count = self._calc_count(start, end, step) if start < 0 or (count is not None and count <= 0) or step < 0: - raise ValueError('start, count, or step cannot be negative') + raise ValueError("start, count, or step cannot be negative") it = self._get_slice_iter(start=start, end=end, step=step) if isinstance(item, six.integer_types): try: return next(it) except StopIteration: - raise IndexError('Index out of range: %s' % item) + raise IndexError("Index out of range: %s" % item) return it def _get_slice_iter(self, start=None, end=None, step=None): @@ -122,52 +121,76 @@ def _iter(self, start=None, end=None, step=None): if end is not None and curr >= end: return - def _data_to_result_frame(self, data, unknown_as_string=True, as_type=None): + def _data_to_result_frame( + self, data, unknown_as_string=True, as_type=None, columns=None + ): from .df.backends.frame import ResultFrame - from .df.backends.odpssql.types import odps_schema_to_df_schema, odps_type_to_df_type + from .df.backends.odpssql.types import ( + odps_schema_to_df_schema, + odps_type_to_df_type, + ) kw = dict() - if getattr(self, 'schema', None) is not None: - kw['schema'] = odps_schema_to_df_schema(self.schema) - elif getattr(self, '_schema', None) is not None: + if getattr(self, "schema", None) is not None: + kw["schema"] = odps_schema_to_df_schema(self.schema) + elif getattr(self, "_schema", None) is not None: # do not remove as there might be coverage missing - kw['schema'] = odps_schema_to_df_schema(self._schema) + kw["schema"] = odps_schema_to_df_schema(self._schema) - if getattr(self, '_column_names', None) is not None: - self._columns = [self.schema[c] for c in self._column_names] - if getattr(self, '_columns', None) is not None: + column_names = columns or getattr(self, "_column_names", None) + if column_names is not None: + self._columns = [self.schema[c] for c in column_names] + if getattr(self, "_columns", None) is not None: cols = [] for col in self._columns: col = copy.copy(col) col.type = odps_type_to_df_type(col.type) cols.append(col) - kw['columns'] = cols + kw["columns"] = cols - if hasattr(self, 'raw'): + if hasattr(self, "raw"): try: import pandas as pd + from .df.backends.pd.types import pd_to_df_schema + data = pd.read_csv(StringIO(self.raw)) - kw['schema'] = pd_to_df_schema( + schema = kw["schema"] = pd_to_df_schema( data, unknown_as_string=unknown_as_string, as_type=as_type ) - kw.pop('columns', None) + columns = kw.pop("columns", None) + if columns and len(columns) < len(schema): + sel_cols = [c.name for c in self._columns] 
+ data = data[sel_cols] + kw["schema"] = types.OdpsSchema(columns) except (ImportError, ValueError): pass if not kw: - raise ValueError('Cannot convert to ResultFrame from %s.' % type(self).__name__) + raise ValueError( + "Cannot convert to ResultFrame from %s." % type(self).__name__ + ) return ResultFrame(data, **kw) def to_result_frame( - self, unknown_as_string=True, as_type=None, start=None, count=None, **iter_kw + self, + unknown_as_string=True, + as_type=None, + start=None, + count=None, + columns=None, + **iter_kw ): read_row_batch_size = options.tunnel.read_row_batch_size if "end" in iter_kw: end = iter_kw["end"] else: - end = None if count is None else (start or 0) + count * (iter_kw.get("step") or 1) + end = ( + None + if count is None + else (start or 0) + count * (iter_kw.get("step") or 1) + ) frames = [] if hasattr(self, "raw"): @@ -183,18 +206,25 @@ def to_result_frame( if offset != read_row_batch_size - 1: continue - frames.append(self._data_to_result_frame( - data, unknown_as_string=unknown_as_string, as_type=as_type - )) + frames.append( + self._data_to_result_frame( + data, unknown_as_string=unknown_as_string, as_type=as_type + ) + ) data = [None] * read_row_batch_size if len(frames) > options.tunnel.batch_merge_threshold: frames = [frames[0].concat(*frames[1:])] if not frames or data[0] is not None: data = list(itertools.takewhile(lambda x: x is not None, data)) - frames.append(self._data_to_result_frame( - data, unknown_as_string=unknown_as_string, as_type=as_type - )) + frames.append( + self._data_to_result_frame( + data, + unknown_as_string=unknown_as_string, + as_type=as_type, + columns=columns, + ) + ) return frames[0].concat(*frames[1:]) def to_pandas(self, start=None, count=None, **kw): @@ -204,12 +234,12 @@ def to_pandas(self, start=None, count=None, **kw): class CsvRecordReader(AbstractRecordReader): - NULL_TOKEN = '\\N' - BACK_SLASH_ESCAPE = '\\x%02x' % ord('\\') + NULL_TOKEN = "\\N" + BACK_SLASH_ESCAPE = "\\x%02x" % ord("\\") def __init__(self, schema, stream, **kwargs): self._schema = schema - self._columns = None + self._csv_columns = None self._fp = stream if isinstance(self._fp, Response): self.raw = self._fp.content if six.PY2 else self._fp.text @@ -221,37 +251,47 @@ def __init__(self, schema, stream, **kwargs): else: self._csv = csv.reader(six.StringIO(self._escape_csv(self.raw))) + self._filtered_col_names = ( + set(x.lower() for x in kwargs["columns"]) if "columns" in kwargs else None + ) + self._columns = None + self._filtered_col_idxes = None + @classmethod def _escape_csv(cls, s): - escaped = utils.to_text(s).encode('unicode_escape') + escaped = utils.to_text(s).encode("unicode_escape") # Make invisible chars available to `csv` library. # Note that '\n' and '\r' should be unescaped. # '\\' should be replaced with '\x5c' before unescaping # to avoid mis-escaped strings like '\\n'. - return utils.to_text(escaped) \ - .replace('\\\\', cls.BACK_SLASH_ESCAPE) \ - .replace('\\n', '\n') \ - .replace('\\r', '\r') + return ( + utils.to_text(escaped) + .replace("\\\\", cls.BACK_SLASH_ESCAPE) + .replace("\\n", "\n") + .replace("\\r", "\r") + ) @classmethod def _escape_csv_bin(cls, s): - escaped = utils.to_binary(s).decode('latin1').encode('unicode_escape') + escaped = utils.to_binary(s).decode("latin1").encode("unicode_escape") # Make invisible chars available to `csv` library. # Note that '\n' and '\r' should be unescaped. # '\\' should be replaced with '\x5c' before unescaping # to avoid mis-escaped strings like '\\n'. 
- return utils.to_text(escaped) \ - .replace('\\\\', cls.BACK_SLASH_ESCAPE) \ - .replace('\\n', '\n') \ - .replace('\\r', '\r') + return ( + utils.to_text(escaped) + .replace("\\\\", cls.BACK_SLASH_ESCAPE) + .replace("\\n", "\n") + .replace("\\r", "\r") + ) @staticmethod def _unescape_csv(s): - return s.encode('utf-8').decode('unicode_escape') + return s.encode("utf-8").decode("unicode_escape") @staticmethod def _unescape_csv_bin(s): - return s.encode('utf-8').decode('unicode_escape').encode('latin1') + return s.encode("utf-8").decode("unicode_escape").encode("latin1") def _readline(self): try: @@ -268,33 +308,39 @@ def _readline(self): value = unescape_csv(value) if value == self.NULL_TOKEN: res.append(None) - elif self._columns and self._columns[i].type == types.boolean: - if value == 'true': + elif self._csv_columns and self._csv_columns[i].type == types.boolean: + if value == "true": res.append(True) - elif value == 'false': + elif value == "false": res.append(False) else: res.append(value) - elif self._columns and isinstance(self._columns[i].type, types.Map): - col_type = self._columns[i].type - if not (value.startswith('{') and value.endswith('}')): - raise ValueError('Dict format error!') + elif self._csv_columns and isinstance( + self._csv_columns[i].type, types.Map + ): + col_type = self._csv_columns[i].type + if not (value.startswith("{") and value.endswith("}")): + raise ValueError("Dict format error!") items = [] - for kv in value[1:-1].split(','): - k, v = kv.split(':', 1) + for kv in value[1:-1].split(","): + k, v = kv.split(":", 1) k = col_type.key_type.cast_value(k.strip(), types.string) v = col_type.value_type.cast_value(v.strip(), types.string) items.append((k, v)) res.append(OrderedDict(items)) - elif self._columns and isinstance(self._columns[i].type, types.Array): - col_type = self._columns[i].type - if not (value.startswith('[') and value.endswith(']')): - raise ValueError('Array format error!') + elif self._csv_columns and isinstance( + self._csv_columns[i].type, types.Array + ): + col_type = self._csv_columns[i].type + if not (value.startswith("[") and value.endswith("]")): + raise ValueError("Array format error!") items = [] - for item in value[1:-1].split(','): - item = col_type.value_type.cast_value(item.strip(), types.string) + for item in value[1:-1].split(","): + item = col_type.value_type.cast_value( + item.strip(), types.string + ) items.append(item) res.append(items) else: @@ -309,6 +355,9 @@ def __next__(self): values = self._readline() if values is None: raise StopIteration + + if self._filtered_col_idxes: + values = [values[idx] for idx in self._filtered_col_idxes] return Record(self._columns, values=values) next = __next__ @@ -323,22 +372,36 @@ def read(self, start=None, count=None, step=None): return self._iter(start=start, end=end, step=step) def _load_columns(self): - if self._columns is not None: + if self._csv_columns is not None: return values = self._readline() - self._columns = [] + self._csv_columns = [] for value in values: if self._schema is None: - self._columns.append(types.Column(name=value, typo='string')) + self._csv_columns.append(types.Column(name=value, typo="string")) else: if self._schema.is_partition(value): - self._columns.append(self._schema.get_partition(value)) + self._csv_columns.append(self._schema.get_partition(value)) else: - self._columns.append(self._schema.get_column(value)) + self._csv_columns.append(self._schema.get_column(value)) + + if self._csv_columns is not None and self._filtered_col_names: + 
self._filtered_col_idxes = [] + self._columns = [] + for idx, col in enumerate(self._csv_columns): + if col.name.lower() in self._filtered_col_names: + self._filtered_col_idxes.append(idx) + self._columns.append(col) + else: + self._columns = self._csv_columns + + def to_pandas(self, start=None, count=None, **kw): + kw.pop("n_process", None) + return super(CsvRecordReader, self).to_pandas(start=start, count=count, **kw) def close(self): - if hasattr(self._fp, 'close'): + if hasattr(self._fp, "close"): self._fp.close() def __enter__(self): diff --git a/odps/rest.py b/odps/rest.py index 70771be0..77adc82b 100644 --- a/odps/rest.py +++ b/odps/rest.py @@ -15,14 +15,17 @@ """Restful client enhanced by URL building and request signing facilities. """ from __future__ import absolute_import + import json import logging +import os import platform import re import threading from string import Template import requests + try: from requests import ConnectTimeout except ImportError: @@ -32,22 +35,23 @@ except ImportError: requests_unixsocket = None -from . import __version__ -from . import errors, utils -from .config import options -from .utils import get_package_version, get_survey_calls, clear_survey_calls +from . import __version__, errors, utils from .compat import six, urlparse +from .config import options +from .utils import clear_survey_calls, get_package_version, get_survey_calls try: import requests.packages.urllib3.util.ssl_ - requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = 'ALL' + + requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = "ALL" requests.packages.urllib3.disable_warnings() except ImportError: pass try: import urllib3.util.ssl_ - urllib3.util.ssl_.DEFAULT_CIPHERS = 'ALL' + + urllib3.util.ssl_.DEFAULT_CIPHERS = "ALL" urllib3.disable_warnings() except ImportError: pass @@ -74,34 +78,35 @@ def default_user_agent(): py_system = platform.system() py_release = platform.release() except IOError: - py_system = 'Unknown' - py_release = 'Unknown' + py_system = "Unknown" + py_release = "Unknown" ua_template = Template( options.user_agent_pattern - or '$pyodps_version $mars_version $maxframe_version $python_version $os_version' + or os.getenv("PYODPS_USER_AGENT_PATTERN") + or "$pyodps_version $mars_version $maxframe_version $python_version $os_version" ) substitutes = dict( - pyodps_version='%s/%s' % ('pyodps', __version__), - python_version='%s/%s' % (py_implementation, py_version), - os_version='%s/%s' % (py_system, py_release), - mars_version='', - maxframe_version='', + pyodps_version="%s/%s" % ("pyodps", __version__), + python_version="%s/%s" % (py_implementation, py_version), + os_version="%s/%s" % (py_system, py_release), + mars_version="", + maxframe_version="", ) try: from mars import __version__ as mars_version - except ImportError: + except: mars_version = None if mars_version: - substitutes["mars_version"] = '%s/%s' % ('mars', mars_version) + substitutes["mars_version"] = "%s/%s" % ("mars", mars_version) try: maxframe_version = get_package_version("maxframe") except: maxframe_version = None if maxframe_version: - substitutes["maxframe_version"] = '%s/%s' % ('maxframe', maxframe_version) + substitutes["maxframe_version"] = "%s/%s" % ("maxframe", maxframe_version) _default_user_agent = ua_template.safe_substitute(**substitutes) _default_user_agent = re.sub(" +", " ", _default_user_agent).strip() @@ -120,9 +125,16 @@ class RestClient(object): _endpoints_without_v4_sign = set() def __init__( - self, account, endpoint, project=None, schema=None, user_agent=None, region_name=None, 
**kwargs + self, + account, + endpoint, + project=None, + schema=None, + user_agent=None, + region_name=None, + **kwargs ): - if endpoint.endswith('/'): + if endpoint.endswith("/"): endpoint = endpoint[:-1] self._account = account self._endpoint = endpoint @@ -130,9 +142,9 @@ def __init__( self._user_agent = user_agent or default_user_agent() self.project = project self.schema = schema - self._proxy = kwargs.get('proxy') - self._app_account = kwargs.get('app_account') - self._tag = kwargs.get('tag') + self._proxy = kwargs.get("proxy") + self._app_account = kwargs.get("app_account") + self._tag = kwargs.get("tag") if isinstance(self._proxy, six.string_types): self._proxy = dict(http=self._proxy, https=self._proxy) @@ -172,24 +184,23 @@ def session(self): if parsed_url.scheme == "http+unix": session = requests_unixsocket.Session() session.mount( - 'http+unix://', - requests_unixsocket.adapters.UnixAdapter(**adapter_options) + "http+unix://", + requests_unixsocket.adapters.UnixAdapter(**adapter_options), ) else: session = requests.Session() # mount adapters with retry times - session.mount( - 'http://', requests.adapters.HTTPAdapter(**adapter_options) - ) - session.mount( - 'https://', requests.adapters.HTTPAdapter(**adapter_options) - ) + session.mount("http://", requests.adapters.HTTPAdapter(**adapter_options)) + session.mount("https://", requests.adapters.HTTPAdapter(**adapter_options)) session_cache[self._endpoint] = session return session def request(self, url, method, stream=False, **kwargs): sign_region_name = kwargs.get("region_name") or self._region_name - if self._endpoint in self._endpoints_without_v4_sign or not options.enable_v4_sign: + if ( + self._endpoint in self._endpoints_without_v4_sign + or not options.enable_v4_sign + ): sign_region_name = None auth_expire_retried = False @@ -228,18 +239,18 @@ def _request(self, url, method, stream=False, **kwargs): region_name = kwargs.pop("region_name", None) - logger.debug('Start request.') - logger.debug('%s: %s', method.upper(), url) + logger.debug("Start request.") + logger.debug("%s: %s", method.upper(), url) if logger.getEffectiveLevel() <= logging.DEBUG: for k, v in kwargs.items(): - logger.debug('%s: %s', k, v) + logger.debug("%s: %s", k, v) # Construct user agent without handling the letter case. 
- headers = kwargs.get('headers', {}) + headers = kwargs.get("headers", {}) headers = {k: str(v) for k, v in six.iteritems(headers)} - headers['User-Agent'] = self._user_agent - kwargs['headers'] = headers - params = kwargs.setdefault('params', {}) + headers["User-Agent"] = self._user_agent + kwargs["headers"] = headers + params = kwargs.setdefault("params", {}) actions = kwargs.pop("actions", None) or kwargs.pop("action", None) or [] if isinstance(actions, six.string_types): @@ -249,22 +260,24 @@ def _request(self, url, method, stream=False, **kwargs): url += separator + "&".join(actions) curr_project = kwargs.pop("curr_project", None) or self.project - if 'curr_project' not in params and curr_project is not None: - params['curr_project'] = curr_project + if "curr_project" not in params and curr_project is not None: + params["curr_project"] = curr_project curr_schema = kwargs.pop("curr_schema", None) or self.schema - if 'curr_schema' not in params and curr_schema is not None: - params['curr_schema'] = curr_schema + if "curr_schema" not in params and curr_schema is not None: + params["curr_schema"] = curr_schema - timeout = kwargs.pop('timeout', None) + timeout = kwargs.pop("timeout", None) req = requests.Request(method, url, **kwargs) prepared_req = req.prepare() logger.debug("request url + params %s", prepared_req.path_url) - prepared_req.headers.pop('Authorization', None) - prepared_req.headers.pop('application-authentication', None) - self._account.sign_request(prepared_req, self._endpoint, region_name=region_name) - if getattr(self, '_app_account', None) is not None: + prepared_req.headers.pop("Authorization", None) + prepared_req.headers.pop("application-authentication", None) + self._account.sign_request( + prepared_req, self._endpoint, region_name=region_name + ) + if getattr(self, "_app_account", None) is not None: self._app_account.sign_request( prepared_req, self._endpoint, region_name=region_name ) @@ -278,33 +291,39 @@ def _request(self, url, method, stream=False, **kwargs): proxies=self._proxy, ) except ConnectTimeout: - raise errors.ConnectTimeout('Connecting to endpoint %s timeout.' % self._endpoint) + raise errors.ConnectTimeout( + "Connecting to endpoint %s timeout." 
% self._endpoint + ) - logger.debug('response.status_code %d', res.status_code) - logger.debug('response.headers: \n%s', res.headers) + logger.debug("response.status_code %d", res.status_code) + logger.debug("response.headers: \n%s", res.headers) if not stream: - logger.debug('response.content: %s\n', res.content) + logger.debug("response.content: %s\n", res.content) # Automatically detect error if not self.is_ok(res): errors.throw_if_parsable(res, self._endpoint, self._tag) return res def get(self, url, stream=False, **kwargs): - return self.request(url, 'get', stream=stream, **kwargs) + return self.request(url, "get", stream=stream, **kwargs) def post(self, url, data=None, **kwargs): - data = utils.to_binary(data, encoding='utf-8') if isinstance(data, six.string_types) else data - return self.request(url, 'post', data=data, **kwargs) + data = ( + utils.to_binary(data, encoding="utf-8") + if isinstance(data, six.string_types) + else data + ) + return self.request(url, "post", data=data, **kwargs) def put(self, url, data=None, **kwargs): data = utils.to_binary(data) if isinstance(data, six.string_types) else data - return self.request(url, 'put', data=data, **kwargs) + return self.request(url, "put", data=data, **kwargs) def head(self, url, **kwargs): - return self.request(url, 'head', **kwargs) + return self.request(url, "head", **kwargs) def delete(self, url, **kwargs): - return self.request(url, 'delete', **kwargs) + return self.request(url, "delete", **kwargs) def upload_survey_log(self): try: @@ -316,7 +335,9 @@ def upload_survey_log(self): return if self.project is None: return - url = '/'.join([self.endpoint, 'projects', RestModel._encode(self.project), 'logs']) + url = "/".join( + [self.endpoint, "projects", RestModel._encode(self.project), "logs"] + ) self.put(url, json.dumps(survey)) except: pass diff --git a/odps/serializers.py b/odps/serializers.py index 220c40ae..97105804 100644 --- a/odps/serializers.py +++ b/odps/serializers.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import datetime import email.header import inspect import json @@ -24,17 +25,17 @@ import requests from . 
import utils -from .compat import BytesIO, ElementTree, six +from .compat import BytesIO, ElementTree, long_type, six def _route_xml_path(root, *keys, **kw): - create_if_not_exists = kw.get('create_if_not_exists', False) + create_if_not_exists = kw.get("create_if_not_exists", False) if isinstance(root, six.string_types): root = ElementTree.fromstring(root) for key in keys: - if key == '.': + if key == ".": return root prev = root root = root.find(key) @@ -55,7 +56,7 @@ def _extract_encoded_json(content): def _route_json_path(root, *keys, **kw): - create_if_not_exists = kw.get('create_if_not_exists', False) + create_if_not_exists = kw.get("create_if_not_exists", False) if isinstance(root, six.string_types): root = _extract_encoded_json(root) @@ -74,6 +75,7 @@ def _route_json_path(root, *keys, **kw): def parse_ndarray(array): try: import numpy as np + return np.asarray(array) except ImportError: return array @@ -86,7 +88,7 @@ def serialize_ndarray(array): return array -def _wrap_with_none(func): +def none_or(func): def new_func(x): if x is None: return x @@ -96,30 +98,40 @@ def new_func(x): _serialize_types = dict() -_serialize_types['bool'] = (_wrap_with_none(utils.str_to_bool), _wrap_with_none(utils.bool_to_str)) -_serialize_types['json'] = (_wrap_with_none(json.loads), _wrap_with_none(json.dumps)) -_serialize_types['rfc822'] = (_wrap_with_none(utils.parse_rfc822), _wrap_with_none(utils.gen_rfc822)) -_serialize_types['rfc822l'] = ( - _wrap_with_none(utils.parse_rfc822), _wrap_with_none(lambda s: utils.gen_rfc822(s, localtime=True)) +_serialize_types["bool"] = (none_or(utils.str_to_bool), none_or(utils.bool_to_str)) +_serialize_types["json"] = (none_or(json.loads), none_or(json.dumps)) +_serialize_types["rfc822"] = (none_or(utils.parse_rfc822), none_or(utils.gen_rfc822)) +_serialize_types["rfc822l"] = ( + none_or(utils.parse_rfc822), + none_or(lambda s: utils.gen_rfc822(s, localtime=True)), +) +_serialize_types["ndarray"] = (none_or(parse_ndarray), none_or(serialize_ndarray)) +_serialize_types["timestamp_ms"] = ( + none_or(lambda x: datetime.datetime.fromtimestamp(long_type(x) / 1000.0)), + none_or(lambda x: long_type(x.timestamp() * 1000)), ) -_serialize_types['ndarray'] = (_wrap_with_none(parse_ndarray), _wrap_with_none(serialize_ndarray)) +_serialize_types["int"] = (none_or(long_type), none_or(str)) +_serialize_types["float"] = (none_or(float), none_or(str)) class SerializeField(object): def __init__(self, *keys, **kwargs): self._path_keys = keys - self._required = kwargs.get('required', False) # used when serialized - self._blank_if_null = kwargs.get('blank_if_null', - True if self._required else False) - self._default = kwargs.get('default') - if 'type' in kwargs: - self._parse_callback, self._serialize_callback = _serialize_types[kwargs.pop('type')] + self._required = kwargs.get("required", False) # used when serialized + self._blank_if_null = kwargs.get( + "blank_if_null", True if self._required else False + ) + self._default = kwargs.get("default") + if "type" in kwargs: + self._parse_callback, self._serialize_callback = _serialize_types[ + kwargs.pop("type") + ] else: - self._parse_callback = kwargs.get('parse_callback') - self._serialize_callback = kwargs.get('serialize_callback') + self._parse_callback = kwargs.get("parse_callback") + self._serialize_callback = kwargs.get("serialize_callback") - self.set_to_parent = kwargs.get('set_to_parent', False) + self.set_to_parent = kwargs.get("set_to_parent", False) def _to_str(self, val): if isinstance(val, six.string_types): @@ -153,7 
+165,7 @@ def _model(self): if self._model_cls is not None: return self._model_cls - models = self._model_str.split('.') + models = self._model_str.split(".") model_name = models[0] module = None @@ -161,13 +173,14 @@ def _model(self): globs = stack[0].f_globals if model_name in globs: possible_module = globs[model_name] - if inspect.isclass(possible_module) and \ - issubclass(possible_module, SerializableModel): + if inspect.isclass(possible_module) and issubclass( + possible_module, SerializableModel + ): module = possible_module break if module is None: - raise ValueError('Unknown model name: %s' % self._model_str) + raise ValueError("Unknown model name: %s" % self._model_str) res = None for model in models[1:]: @@ -180,7 +193,11 @@ def _model(self): return res -_default_name_maker = dict(capitalized=utils.underline_to_capitalized, raw=lambda v: v, camel=utils.underline_to_camel) +_default_name_maker = dict( + capitalized=utils.underline_to_capitalized, + raw=lambda v: v, + camel=utils.underline_to_camel, +) class SerializableModelMetaClass(type): @@ -188,18 +205,20 @@ def __new__(mcs, name, bases, kv): slots = [] fields = dict() for base in bases: - base_slots = list(getattr(base, '__slots__', [])) - if '__weakref__' in base_slots: - base_slots.remove('__weakref__') + base_slots = list(getattr(base, "__slots__", [])) + if "__weakref__" in base_slots: + base_slots.remove("__weakref__") slots.extend(base_slots) - fields.update(getattr(base, '__fields', dict())) - slots.extend(kv.get('__slots__', [])) - fields.update(kv.get('__fields', dict())) + fields.update(getattr(base, "__fields", dict())) + slots.extend(kv.get("__slots__", [])) + fields.update(kv.get("__fields", dict())) attrs = [] parent_attrs = [] - def_name = kv.pop('_' + name + '__default_name', 'capitalized') - for attr, field in (pair for pair in six.iteritems(kv) if not pair[0].startswith('__')): + def_name = kv.pop("_" + name + "__default_name", "capitalized") + for attr, field in ( + pair for pair in six.iteritems(kv) if not pair[0].startswith("__") + ): if inspect.isclass(field) and issubclass(field, SerializeField): field = field() if isinstance(field, SerializeField): @@ -210,13 +229,16 @@ def __new__(mcs, name, bases, kv): if field.set_to_parent: parent_attrs.append(attr) fields[attr] = field - kv['_parent_attrs'] = set(parent_attrs) + kv["_parent_attrs"] = set(parent_attrs) slots = tuple(OrderedDict.fromkeys(slots)) slots_pos = dict([(v, k) for k, v in enumerate(slots)]) fields = OrderedDict( - sorted(six.iteritems(fields), key=lambda s: slots_pos.get(s[0], float('inf')))) + sorted( + six.iteritems(fields), key=lambda s: slots_pos.get(s[0], float("inf")) + ) + ) for attr in attrs: if attr in kv: @@ -224,18 +246,18 @@ def __new__(mcs, name, bases, kv): slots = tuple(slot for slot in slots if slot not in kv) if len(slots) > 0: - kv['__slots__'] = slots + kv["__slots__"] = slots if len(fields) > 0: - kv['__fields'] = fields + kv["__fields"] = fields return type.__new__(mcs, name, bases, kv) class SerializableModel(six.with_metaclass(SerializableModelMetaClass)): - __slots__ = '_parent', '__weakref__' + __slots__ = "_parent", "__weakref__" def __init__(self, **kwargs): - slots = getattr(self, '__slots__', []) + slots = getattr(self, "__slots__", []) for k, v in six.iteritems(kwargs): if k in slots: @@ -265,7 +287,7 @@ def _setattr(cls, obj, k, v, skip_null=True): setattr(obj, k, v) return - fields = getattr(type(obj), '__fields') + fields = getattr(type(obj), "__fields") if not isinstance(fields[k], HasSubModelField): 
setattr(obj, k, v) elif isinstance(v, list): @@ -276,20 +298,21 @@ if sub_obj is None: setattr(obj, k, v) return - sub_fields = getattr(new_obj, '__fields', {}) + sub_fields = getattr(new_obj, "__fields", {}) for k in six.iterkeys(sub_fields): if sub_fields[k].set_to_parent is True: continue - cls._setattr(sub_obj, k, object.__getattribute__(new_obj, k), - skip_null=skip_null) + cls._setattr( + sub_obj, k, object.__getattribute__(new_obj, k), skip_null=skip_null + ) @classmethod def _init_obj(cls, content, obj=None, **kw): - fields = dict(getattr(cls, '__fields')) + fields = dict(getattr(cls, "__fields")) - _type = getattr(cls, '_type_indicator', None) - _name = 'name' if 'name' in fields else None - if obj is None and (_type is not None or 'name' in fields): + _type = getattr(cls, "_type_indicator", None) + _name = "name" if "name" in fields else None + if obj is None and (_type is not None or "name" in fields): kwargs = dict(kw) for field in (_name, _type): @@ -307,7 +330,7 @@ def deserial(cls, content, obj=None, **kw): obj = cls._init_obj(content, obj=obj, **kw) obj_type = type(obj) - fields = dict(getattr(obj_type, '__fields')) + fields = dict(getattr(obj_type, "__fields")) if isinstance(content, six.string_types): if issubclass(obj_type, XMLSerializableModel): @@ -322,14 +345,14 @@ if isinstance(prop, SerializeField): kwargs = dict(kw) if isinstance(prop, HasSubModelField): - kwargs['_parent'] = obj + kwargs["_parent"] = obj if not prop.set_to_parent: self_kw[attr] = prop.parse(content, **kwargs) else: parent_kw[attr] = prop.parse(content, **kwargs) for k, v in six.iteritems(self_kw): - obj_type._setattr(obj, k, v, skip_null=getattr(obj_type, 'skip_null', True)) + obj_type._setattr(obj, k, v, skip_null=getattr(obj_type, "skip_null", True)) if obj.parent is not None: for k, v in six.iteritems(parent_kw): @@ -350,7 +373,7 @@ def serial(self): else: root = OrderedDict() - for attr, prop in six.iteritems(getattr(self, '__fields')): + for attr, prop in six.iteritems(getattr(self, "__fields")): if isinstance(prop, SerializeField): try: prop.serialize(root, object.__getattribute__(self, attr)) @@ -370,12 +393,12 @@ def extract(self, **base_kw): class XMLSerializableModel(SerializableModel): - __slots__ = '_root', + __slots__ = ("_root",) @classmethod def parse(cls, response, obj=None, **kw): - if 'parent' in kw: - kw['_parent'] = kw.pop('parent') + if "parent" in kw: + kw["_parent"] = kw.pop("parent") if isinstance(response, requests.Response): # PY2 prefer bytes, while PY3 prefer str response = response.content.decode() if six.PY3 else response.content @@ -388,24 +411,30 @@ def serialize(self): ElementTree.ElementTree(root).write(sio, encoding="utf-8", xml_declaration=True) xml_content = sio.getvalue() - prettified_xml = minidom.parseString(xml_content).toprettyxml(indent=' '*2, encoding='utf-8') - prettified_xml = utils.to_text(prettified_xml, encoding='utf-8') + prettified_xml = minidom.parseString(xml_content).toprettyxml( + indent=" " * 2, encoding="utf-8" + ) + prettified_xml = utils.to_text(prettified_xml, encoding="utf-8") - cdata_re = re.compile(r'<!\[CDATA\[.*\]\]>', (re.M | re.S)) + cdata_re = re.compile(r"<!\[CDATA\[.*\]\]>", (re.M | re.S)) for src_cdata in cdata_re.finditer(prettified_xml): src_cdata = src_cdata.group(0) - dest_cdata = src_cdata.replace('&amp;', '&').replace('&lt;', '<'). 
\ - replace('&quot;', '"').replace('&gt;', '>') + dest_cdata = ( + src_cdata.replace("&amp;", "&") + .replace("&lt;", "<") + .replace("&quot;", '"') + .replace("&gt;", ">") + ) prettified_xml = prettified_xml.replace(src_cdata, dest_cdata) - return prettified_xml.replace('&quot;', '"') + return prettified_xml.replace("&quot;", '"') class JSONSerializableModel(SerializableModel): @classmethod def parse(cls, response, obj=None, **kw): - if 'parent' in kw: - kw['_parent'] = kw.pop('parent') + if "parent" in kw: + kw["_parent"] = kw.pop("parent") if isinstance(response, requests.Response): # PY2 prefer bytes, while PY3 prefer str response = response.content.decode() if six.PY3 else response.content @@ -432,7 +461,7 @@ def parse(self, root, **kwargs): return val def _set_default_keys(self, *keys): - super(XMLTagField, self)._set_default_keys('.') + super(XMLTagField, self)._set_default_keys(".") class XMLNodeField(SerializeField): @@ -453,7 +482,7 @@ def parse(self, root, **kwargs): def serialize(self, root, value): value = value if value is not None else self._default if value is None and self._blank_if_null: - value = '' + value = "" if not self._required and value is None: return @@ -467,7 +496,7 @@ def serialize(self, root, value): class XMLNodeAttributeField(SerializeField): def __init__(self, *keys, **kwargs): - self._attr = kwargs.pop('attr', None) + self._attr = kwargs.pop("attr", None) super(XMLNodeAttributeField, self).__init__(*keys, **kwargs) @@ -495,7 +524,7 @@ def serialize(self, root, value): if self._default is not None: value = self._default elif self._blank_if_null: - value = '' + value = "" if not self._required and value is None: return @@ -559,8 +588,7 @@ def parse(self, root, **kwargs): node = node.text instance = self._model.deserial(node, **kwargs) - if isinstance(instance, XMLSerializableModel) and \ - instance._root is None: + if isinstance(instance, XMLSerializableModel) and instance._root is None: instance._root = node.tag if instance is None: @@ -583,8 +611,8 @@ def serialize(self, root, value): if prev_path_keys: root = _route_xml_path(root, create_if_not_exists=True, *prev_path_keys) - if isinstance(value, XMLSerializableModel) and getattr(value, '_root') is None: - setattr(value, '_root', self._path_keys[-1]) + if isinstance(value, XMLSerializableModel) and getattr(value, "_root") is None: + setattr(value, "_root", self._path_keys[-1]) val = value.serial() if isinstance(value, JSONSerializableModel): # JSON mixed in XML @@ -608,14 +636,17 @@ def parse(self, root, **kwargs): instances = [] tag = self._path_keys[-1] - if tag == '*': + if tag == "*": nodes = list(root) else: nodes = root.findall(self._path_keys[-1]) for node in nodes: instance = self._model.deserial(node, **kwargs) - if isinstance(instance, XMLSerializableModel) and instance._root is None: + if ( + isinstance(instance, XMLSerializableModel) + and instance._root is None + ): instance._root = node.tag instances.append(instance) @@ -642,9 +673,8 @@ def serialize(self, root, value): root = _route_xml_path(root, create_if_not_exists=True, *prev_path_keys) for it in value: - if isinstance(it, XMLSerializableModel) and \ - getattr(it, '_root') is None: - setattr(it, '_root', self._path_keys[-1]) + if isinstance(it, XMLSerializableModel) and getattr(it, "_root") is None: + setattr(it, "_root", self._path_keys[-1]) val = it.serial() if isinstance(it, JSONSerializableModel): @@ -659,8 +689,13 @@ def serialize(self, root, value): class XMLNodePropertiesField(SerializeField): def __init__(self, *keys, **kwargs): super(XMLNodePropertiesField, 
self).__init__(*keys, **kwargs) - self._key_tag = kwargs['key_tag'] - self._value_tag = kwargs['value_tag'] + self._key_tag = kwargs.get("key_tag") + self._key_attr = kwargs.get("key_attr") + if not self._key_tag and not self._key_attr: + raise TypeError("Need to specify one of key_tag or key_attr") + self._value_tag = kwargs.get("value_tag") + if not self._key_attr and not self._value_tag: + raise TypeError("Need to specify key_attr when value_tag is absent") def parse(self, root, **kwargs): prev_path_keys = self._path_keys[:-1] @@ -673,10 +708,20 @@ def parse(self, root, **kwargs): results = OrderedDict() for node in root.findall(self._path_keys[-1]): - key_node = node.find(self._key_tag) - value_node = node.find(self._value_tag) - if key_node is not None and value_node is not None: - results[key_node.text] = value_node.text + if self._key_attr is not None: + key = node.attrib.get(self._key_attr) + else: + key_node = node.find(self._key_tag) + key = key_node.text if key_node is not None else None + + if self._value_tag is not None: + value_node = node.find(self._value_tag) + value = value_node.text if value_node is not None else None + else: + value = node.text + + if key is not None and value is not None: + results[key] = value if results is None: return @@ -702,12 +747,19 @@ def serialize(self, root, value): for k, v in six.iteritems(value): element = ElementTree.Element(self._path_keys[-1]) - key_node = ElementTree.Element(self._key_tag) - key_node.text = utils.to_text(k) - element.append(key_node) - value_node = ElementTree.Element(self._value_tag) - value_node.text = utils.to_text(v) - element.append(value_node) + if self._key_attr is not None: + element.set(self._key_attr, utils.to_text(k)) + else: + key_node = ElementTree.Element(self._key_tag) + key_node.text = utils.to_text(k) + element.append(key_node) + + if self._value_tag is not None: + value_node = ElementTree.Element(self._value_tag) + value_node.text = utils.to_text(v) + element.append(value_node) + else: + element.text = utils.to_text(v) root.append(element) @@ -729,7 +781,7 @@ def parse(self, root, **kwargs): def serialize(self, root, value): value = value if value is not None else self._default if value is None and self._blank_if_null: - value = '' + value = "" if not self._required and value is None: return @@ -754,8 +806,7 @@ def parse(self, root, **kwargs): values = self._default if root is not None: - values = [self._to_str(node[self._path_keys[-1]]) - for node in root] + values = [self._to_str(node[self._path_keys[-1]]) for node in root] if values is None: return @@ -794,7 +845,7 @@ class JSONNodeReferenceField(HasSubModelField): def __init__(self, model, *keys, **kwargs): super(JSONNodeReferenceField, self).__init__(model, *keys, **kwargs) - self._check_before = kwargs.get('check_before') + self._check_before = kwargs.get("check_before") def parse(self, root, **kwargs): instance = self._default @@ -837,8 +888,7 @@ def parse(self, root, **kwargs): instances = self._default if isinstance(root, list): - instances = [self._model.deserial(node, **kwargs) - for node in root] + instances = [self._model.deserial(node, **kwargs) for node in root] elif root is not None: prev_path_keys = self._path_keys[:-1] if prev_path_keys: @@ -847,8 +897,7 @@ def parse(self, root, **kwargs): if root is not None: root = root.get(self._path_keys[-1]) if root is not None: - instances = [self._model.deserial(node, **kwargs) - for node in root] + instances = [self._model.deserial(node, **kwargs) for node in root] if instances is None: return 
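A rough, untested sketch of how the extended XMLNodePropertiesField in the serializers.py hunk above might be used: with key_attr the key is read from an element attribute, and when value_tag is omitted the element text serves as the value. The PropertyBag model and its element names below are hypothetical, made up purely for illustration.

from odps import serializers


class PropertyBag(serializers.XMLSerializableModel):
    _root = "Bag"

    # maps <Property name="key">value</Property> children of <Bag> to a dict
    props = serializers.XMLNodePropertiesField("Property", key_attr="name")


bag = PropertyBag.parse('<Bag><Property name="a">1</Property></Bag>')
assert bag.props == {"a": "1"}
# expected to round-trip back to <Property name="a">1</Property> entries
print(bag.serialize())
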
diff --git a/odps/sqlalchemy_odps.py b/odps/sqlalchemy_odps.py index f157806d..d4ee9cf9 100644 --- a/odps/sqlalchemy_odps.py +++ b/odps/sqlalchemy_odps.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,30 +12,31 @@ # See the License for the specific language governing permissions and # limitations under the License. +import contextlib import itertools +import sys import threading -import contextlib +import time from sqlalchemy import types as sa_types -from sqlalchemy.engine import default, Engine +from sqlalchemy.engine import Engine, default from sqlalchemy.exc import NoSuchTableError from sqlalchemy.sql import compiler, sqltypes + try: from sqlalchemy.dialects import mysql except ImportError: # for low sqlalchemy versions from sqlalchemy.databases import mysql -from . import options -from . import types +from . import options, types from .compat import six -from .core import ODPS, DEFAULT_ENDPOINT +from .core import DEFAULT_ENDPOINT, ODPS +from .errors import BaseODPSError, InternalServerError, NoSuchObject from .models import Table from .models.session import PUBLIC_SESSION_NAME -from .errors import NoSuchObject from .utils import to_str, to_text - test_setting = threading.local() test_setting.get_tables_filter = None @@ -81,20 +82,98 @@ def update_test_setting(**kw): _sqlalchemy_global_reusable_odps = {} +_sqlalchemy_obj_list_cache = {} + + +class ObjectCache(object): + def __init__(self, expire=24 * 3600): + self._expire_time = expire + self._items = dict() + self._cache_time = dict() + + def __getitem__(self, key): + if self._cache_time[key] < time.time() - self._expire_time: + self._cache_time.pop(key, None) + self._items.pop(key, None) + raise KeyError(key) + return self._items[key] + + def __setitem__(self, key, value): + self._items[key] = value + self._cache_time[key] = time.time() + + def get(self, key, default=None): + try: + return self[key] + except KeyError: + return default + class ODPSIdentifierPreparer(compiler.IdentifierPreparer): # Just quote everything to make things simpler / easier to upgrade reserved_words = compiler.RESERVED_WORDS.copy() keywords = [ - 'ADD', 'ALL', 'ALTER', 'AND', 'AS', 'ASC', 'BETWEEN', 'BIGINT', - 'BOOLEAN', 'BY', 'CASE', 'CAST', 'COLUMN', 'COMMENT', 'CREATE', - 'DESC', 'DISTINCT', 'DISTRIBUTE', 'DOUBLE', 'DROP', 'ELSE', 'FALSE', - 'FROM', 'FULL', 'GROUP', 'IF', 'IN', 'INSERT', 'INTO', 'IS', 'JOIN', - 'LEFT', 'LIFECYCLE', 'LIKE', 'LIMIT', 'MAPJOIN', 'NOT', 'NULL', - 'ON', 'OR', 'ORDER', 'OUTER', 'OVERWRITE', 'PARTITION', 'RENAME', - 'REPLACE', 'RIGHT', 'RLIKE', 'SELECT', 'SORT', 'STRING', 'TABLE', - 'TABLESAMPLE', 'TBLPROPERTIES', 'THEN', 'TOUCH', 'TRUE', 'UNION', - 'VIEW', 'WHEN', 'WHERE' + "ADD", + "ALL", + "ALTER", + "AND", + "AS", + "ASC", + "BETWEEN", + "BIGINT", + "BOOLEAN", + "BY", + "CASE", + "CAST", + "COLUMN", + "COMMENT", + "CREATE", + "DESC", + "DISTINCT", + "DISTRIBUTE", + "DOUBLE", + "DROP", + "ELSE", + "FALSE", + "FROM", + "FULL", + "GROUP", + "IF", + "IN", + "INSERT", + "INTO", + "IS", + "JOIN", + "LEFT", + "LIFECYCLE", + "LIKE", + "LIMIT", + "MAPJOIN", + "NOT", + "NULL", + "ON", + "OR", + "ORDER", + "OUTER", + "OVERWRITE", + "PARTITION", + "RENAME", + "REPLACE", + "RIGHT", + "RLIKE", + "SELECT", + "SORT", + "STRING", + "TABLE", + "TABLESAMPLE", + "TBLPROPERTIES", + "THEN", + "TOUCH", + "TRUE", + "UNION", + "VIEW", + 
"WHEN", + "WHERE", ] reserved_words.update(keywords) reserved_words.update([s.lower() for s in keywords]) @@ -102,8 +181,8 @@ class ODPSIdentifierPreparer(compiler.IdentifierPreparer): def __init__(self, dialect): super(ODPSIdentifierPreparer, self).__init__( dialect, - initial_quote='`', - escape_quote='`', + initial_quote="`", + escape_quote="`", ) def quote(self, ident, force=None): @@ -113,16 +192,18 @@ def quote(self, ident, force=None): class ODPSCompiler(compiler.SQLCompiler): def visit_column(self, *args, **kwargs): result = super(ODPSCompiler, self).visit_column(*args, **kwargs) - dot_count = result.count('.') - assert dot_count in (0, 1, 2), "Unexpected visit_column result {}".format(result) + dot_count = result.count(".") + assert dot_count in (0, 1, 2), "Unexpected visit_column result {}".format( + result + ) if dot_count == 2: # we have something of the form schema.table.column # hive doesn't like the schema in front, so chop it out - result = result[result.index('.') + 1:] + result = result[result.index(".") + 1 :] return result def visit_char_length_func(self, fn, **kw): - return 'length{}'.format(self.function_argspec(fn, **kw)) + return "length{}".format(self.function_argspec(fn, **kw)) def __unicode__(self): return to_text(self) @@ -130,31 +211,31 @@ def __unicode__(self): class ODPSTypeCompiler(compiler.GenericTypeCompiler): def visit_INTEGER(self, type_): - return 'INT' + return "INT" def visit_NUMERIC(self, type_): - return 'DECIMAL' + return "DECIMAL" def visit_CHAR(self, type_): - return 'STRING' + return "STRING" def visit_VARCHAR(self, type_): - return 'STRING' + return "STRING" def visit_NCHAR(self, type_): - return 'STRING' + return "STRING" def visit_TEXT(self, type_): - return 'STRING' + return "STRING" def visit_CLOB(self, type_): - return 'STRING' + return "STRING" def visit_BLOB(self, type_): - return 'BINARY' + return "BINARY" def visit_TIME(self, type_): - return 'TIMESTAMP' + return "TIMESTAMP" if hasattr(sqltypes.String, "RETURNS_UNICODE"): @@ -163,9 +244,13 @@ def visit_TIME(self, type_): _return_unicode_str = True +class ODPSPingError(BaseODPSError): + pass + + class ODPSDialect(default.DefaultDialect): - name = 'odps' - driver = 'rest' + name = "odps" + driver = "rest" preparer = ODPSIdentifierPreparer statement_compiler = ODPSCompiler supports_views = True @@ -189,6 +274,7 @@ class ODPSDialect(default.DefaultDialect): @classmethod def dbapi(cls): from . 
import dbapi + return dbapi def create_connect_args(self, url): @@ -197,34 +283,38 @@ def create_connect_args(self, url): if project is None and options.default_project: project = options.default_project access_id = url.username - if access_id is None and options.account is not None: - access_id = options.account.access_id secret_access_key = url.password - if secret_access_key is None and options.account is not None: - secret_access_key = options.account.secret_access_key logview_host = options.logview_host endpoint = None session_name = None use_sqa = False reuse_odps = False - fallback_policy = '' + project_as_schema = False + fallback_policy = "" + cache_names = False + cache_seconds = 24 * 3600 hints = {} if url.query: query = dict(url.query) if endpoint is None: - endpoint = query.pop('endpoint', None) + endpoint = query.pop("endpoint", None) if logview_host is None: - logview_host = query.pop( - 'logview_host', query.pop('logview', None) - ) + logview_host = query.pop("logview_host", query.pop("logview", None)) if session_name is None: - session_name = query.pop('session', None) - if use_sqa == False: - use_sqa = (query.pop('interactive_mode', 'false') != 'false') - if reuse_odps == False: - reuse_odps = (query.pop('reuse_odps', 'false') != 'false') + session_name = query.pop("session", None) + if use_sqa is False: + use_sqa = query.pop("interactive_mode", "false").lower() != "false" + if reuse_odps is False: + reuse_odps = query.pop("reuse_odps", "false").lower() != "false" + if query.get("project_as_schema", None) is not None: + project_as_schema = ( + query.pop("project_as_schema", "false").lower() != "false" + ) if fallback_policy == "": - fallback_policy = query.pop('fallback_policy', 'default') + fallback_policy = query.pop("fallback_policy", "default") + if cache_names is False: + cache_names = query.pop("cache_names", "false").lower() != "false" + cache_seconds = int(query.pop("cache_seconds", cache_seconds)) hints = query if endpoint is None: @@ -233,24 +323,35 @@ def create_connect_args(self, url): session_name = PUBLIC_SESSION_NAME kwargs = { - 'access_id': access_id, - 'secret_access_key': secret_access_key, - 'project': project, - 'endpoint': endpoint, - 'session_name': session_name, - 'use_sqa': use_sqa, - 'fallback_policy': fallback_policy, - 'hints': hints, + "access_id": access_id, + "secret_access_key": secret_access_key, + "project": project, + "endpoint": endpoint, + "session_name": session_name, + "use_sqa": use_sqa, + "fallback_policy": fallback_policy, + "project_as_schema": project_as_schema, + "hints": hints, } + if access_id is None: + kwargs.pop("access_id", None) + kwargs.pop("secret_access_key", None) + kwargs["account"] = options.account + for k, v in six.iteritems(kwargs): if v is None: - raise ValueError('{} should be provided to create connection, ' - 'you can either specify in connection string as format: ' - '"odps://:@", ' - 'or create an ODPS object and call `.to_global()` ' - 'to set it to global'.format(k)) + raise ValueError( + "{} should be provided to create connection, " + "you can either specify in connection string as format: " + '"odps://:@", ' + "or create an ODPS object and call `.to_global()` " + "to set it to global".format(k) + ) if logview_host is not None: - kwargs['logview_host'] = logview_host + kwargs["logview_host"] = logview_host + + if cache_names: + _sqlalchemy_obj_list_cache[url_string] = ObjectCache(expire=cache_seconds) if reuse_odps: # the odps object can only be reused only if it will be identical @@ -258,9 +359,9 @@ 
def create_connect_args(self, url): url_string in _sqlalchemy_global_reusable_odps and _sqlalchemy_global_reusable_odps.get(url_string) is not None ): - kwargs['odps'] = _sqlalchemy_global_reusable_odps.get(url_string) - kwargs['access_id'] = None - kwargs['secret_access_key'] = None + kwargs["odps"] = _sqlalchemy_global_reusable_odps.get(url_string) + kwargs["access_id"] = None + kwargs["secret_access_key"] = None else: _sqlalchemy_global_reusable_odps[url_string] = ODPS( access_id=access_id, @@ -281,14 +382,29 @@ def get_odps_from_url(self, url): odps_kw.pop("use_sqa", None) odps_kw.pop("fallback_policy", None) odps_kw.pop("hints", None) + odps_kw.pop("project_as_schema", None) odps_kw["overwrite_global"] = False return ODPS(**odps_kw) + @classmethod + def get_list_cache(cls, url, key): + url = str(url) + if url not in _sqlalchemy_obj_list_cache: + return None + return _sqlalchemy_obj_list_cache[url].get(key) + + @classmethod + def put_list_cache(cls, url, key, value): + url = str(url) + if url not in _sqlalchemy_obj_list_cache: + return + _sqlalchemy_obj_list_cache[url][key] = value + def get_schema_names(self, connection, **kw): conn = self._get_dbapi_connection(connection) if getattr(conn, "_project_as_schema", False): - fields = ['owner', 'user', 'group', 'prefix'] - if (conn.odps.project is None) or (kw.pop('listall', None) is not None): + fields = ["owner", "user", "group", "prefix"] + if (conn.odps.project is None) or (kw.pop("listall", None) is not None): kwargs = {f: kw.get(f) for f in fields} return [proj.name for proj in conn.odps.list_projects(**kwargs)] else: @@ -326,13 +442,15 @@ def get_columns(self, connection, table_name, schema=None, **kw): try: for col in table.table_schema.columns: col_type = _odps_type_to_sqlalchemy_type[type(col.type)] - result.append({ - 'name': col.name, - 'type': col_type, - 'nullable': True, - 'default': None, - 'comment': col.comment, - }) + result.append( + { + "name": col.name, + "type": col_type, + "nullable": True, + "default": None, + "comment": col.comment, + } + ) except NoSuchObject as e: # convert ODPSError to SQLAlchemy NoSuchTableError raise NoSuchTableError(str(e)) @@ -351,8 +469,13 @@ def get_indexes(self, connection, table_name, schema=None, **kw): return [] def _iter_tables(self, connection, schema=None, types=None, **kw): + cache_key = ("tables", schema, tuple(types)) + cached = self.get_list_cache(connection.engine.url, cache_key) + if cached is not None: + return cached + conn = self._get_dbapi_connection(connection) - filter_ = getattr(test_setting, 'get_tables_filter', None) + filter_ = getattr(test_setting, "get_tables_filter", None) if filter_ is None: filter_ = lambda x: True schema_kw = self._get_schema_kw(connection, schema=schema) @@ -367,7 +490,9 @@ def _iter_tables(self, connection, schema=None, types=None, **kw): its.append(conn.odps.list_tables(**list_kw)) it = itertools.chain(*its) - return [t.name for t in it if filter_(t.name)] + result = [t.name for t in it if filter_(t.name)] + self.put_list_cache(connection.engine.url, cache_key, result) + return result def get_table_names(self, connection, schema=None, **kw): return self._iter_tables( @@ -389,9 +514,41 @@ def get_table_comment(self, connection, table_name, schema=None, **kw): conn = self._get_dbapi_connection(connection) schema_kw = self._get_schema_kw(connection, schema=schema) comment = conn.odps.get_table(table_name, **schema_kw).comment - return { - 'text': comment - } + return {"text": comment} + + @classmethod + def _is_stack_superset(cls, tb): + try: 
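# Illustrative usage sketch, not part of the patch itself: how the connection-string
# options parsed in create_connect_args above might be supplied through a SQLAlchemy
# URL. Values in angle brackets are placeholders; option names follow the
# query.pop() calls in the hunk above.
from sqlalchemy import create_engine

engine = create_engine(
    "odps://<access_id>:<secret_access_key>@<project>/?endpoint=<endpoint>"
    "&interactive_mode=true"    # parsed into use_sqa
    "&reuse_odps=true"          # share one ODPS entry per identical connection URL
    "&project_as_schema=true"   # expose projects (instead of ODPS schemas) as schemas
    "&fallback_policy=default"  # fallback behaviour when interactive mode is on
    "&cache_names=true"         # cache listed object names via ObjectCache
    "&cache_seconds=3600"       # cache expiry; defaults to 24 * 3600 when omitted
)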
+ cur_frame = tb.tb_frame + while cur_frame is not None: + if "superset" in cur_frame.f_code.co_filename: + return True + cur_frame = cur_frame.f_back + return False + except: # pragma: no cover + return False + + def do_ping(self, dbapi_connection): + """Stop raising RuntimeError when ping by Superset""" + try: + return super(ODPSDialect, self).do_ping(dbapi_connection) + except InternalServerError: + raise + except BaseException as ex: + _, _, tb = sys.exc_info() + if not self._is_stack_superset(tb): + raise + new_err = ODPSPingError(ex.args[0]) + for attr in ( + "request_id", + "instance_id", + "code", + "host_id", + "endpoint", + "tag", + ): + setattr(new_err, attr, getattr(ex, attr)) + six.reraise(ODPSPingError, new_err, tb) def do_rollback(self, dbapi_connection): # No transactions for ODPS diff --git a/odps/src/crc32c_c.pxd b/odps/src/crc32c_c.pxd index f3026f34..ca405027 100644 --- a/odps/src/crc32c_c.pxd +++ b/odps/src/crc32c_c.pxd @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ from libc.stdint cimport * + cdef class Crc32c: cdef uint32_t _crc @@ -23,4 +24,4 @@ cdef class Crc32c: cpdef uint32_t getvalue(self) - cpdef reset(self) \ No newline at end of file + cpdef reset(self) diff --git a/odps/src/stringstream.pxd b/odps/src/stringstream.pxd index 7aa37516..4330e72a 100644 --- a/odps/src/stringstream.pxd +++ b/odps/src/stringstream.pxd @@ -1,5 +1,6 @@ from libcpp.string cimport string + cdef extern from "" namespace "std" nogil: cdef cppclass stringstream: stringstream() except + diff --git a/odps/src/types_c.pxd b/odps/src/types_c.pxd index 977bc0e5..35400f8d 100644 --- a/odps/src/types_c.pxd +++ b/odps/src/types_c.pxd @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/odps/src/types_c.pyx b/odps/src/types_c.pyx index b71cad3f..a07dfbaf 100644 --- a/odps/src/types_c.pyx +++ b/odps/src/types_c.pyx @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,19 +13,22 @@ # limitations under the License. import sys + +from cpython.bool cimport PyBool_Check +from cpython.datetime cimport PyDateTime_Check, import_datetime from libc.stdint cimport * from libc.string cimport * -from cpython.bool cimport PyBool_Check -from cpython.datetime cimport import_datetime, PyDateTime_Check + from datetime import datetime -from .. import types, options +from .. 
import options, types + cdef int64_t bigint_min = types.bigint._bounds[0] cdef int64_t bigint_max = types.bigint._bounds[1] cdef int string_len_max = types.string._max_length cdef object pd_na_type = types.pd_na_type -cdef bint is_py3 = sys.version_info[0] == 3 +cdef bint is_py3 = sys.version_info[0] >= 3 cdef: int64_t BOOL_TYPE_ID = types.boolean._type_id @@ -46,7 +49,7 @@ cdef object _validate_bigint(object val, int64_t max_field_size): cdef int64_t i_val = val if bigint_min <= i_val <= bigint_max: return i_val - raise ValueError('InvalidData: Bigint(%s) out of range' % val) + raise ValueError("InvalidData: Bigint(%s) out of range" % val) cdef object _validate_string(object val, int64_t max_field_size): @@ -57,15 +60,15 @@ cdef object _validate_string(object val, int64_t max_field_size): if max_field_size == 0: max_field_size = string_len_max - if isinstance(val, bytes): + if type(val) is bytes or isinstance(val, bytes): s_size = len( val) - u_val = ( val).decode('utf-8') - elif isinstance(val, unicode): + u_val = ( val).decode("utf-8") + elif type(val) is unicode or isinstance(val, unicode): u_val = val s_size = 4 * len(u_val) if s_size > max_field_size: # only encode when strings are long enough - s_size = len(u_val.encode('utf-8')) + s_size = len(u_val.encode("utf-8")) else: raise TypeError("Invalid data type: expect bytes or unicode, got %s" % type(val)) @@ -85,10 +88,10 @@ cdef object _validate_binary(object val, int64_t max_field_size): if max_field_size == 0: max_field_size = string_len_max - if isinstance(val, bytes): + if type(val) is bytes or isinstance(val, bytes): bytes_val = val - elif isinstance(val, unicode): - bytes_val = ( val).encode('utf-8') + elif type(val) is unicode or isinstance(val, unicode): + bytes_val = ( val).encode("utf-8") else: raise TypeError("Invalid data type: expect bytes or unicode, got %s" % type(val)) @@ -106,7 +109,7 @@ cdef object _validate_datetime(object val, int64_t max_field_size): if PyDateTime_Check(val): return val if isinstance(val, (bytes, unicode)): - return py_strptime(val, '%Y-%m-%d %H:%M:%S') + return py_strptime(val, "%Y-%m-%d %H:%M:%S") raise TypeError("Invalid data type: expect datetime, got %s" % type(val)) @@ -121,12 +124,12 @@ cdef object _validate_timestamp(object val, int64_t max_field_size): pd_ts = pd.Timestamp pd_ts_strptime = pd_ts.strptime except (ImportError, ValueError): - raise ImportError('To use TIMESTAMP in pyodps, you need to install pandas.') + raise ImportError("To use TIMESTAMP in pyodps, you need to install pandas.") if isinstance(val, pd_ts): return val if isinstance(val, (bytes, unicode)): - return pd_ts_strptime(val, '%Y-%m-%d %H:%M:%S') + return pd_ts_strptime(val, "%Y-%m-%d %H:%M:%S") raise TypeError("Invalid data type: expect timestamp, got %s" % type(val)) @@ -250,7 +253,7 @@ cdef class SchemaSnapshot: cdef class BaseRecord: def __cinit__(self, columns=None, schema=None, values=None, max_field_size=None): - self._c_schema_snapshot = getattr(schema, '_snapshot', None) + self._c_schema_snapshot = getattr(schema, "_snapshot", None) if columns is not None: self._c_columns = columns self._c_name_indexes = {col.name: i for i, col in enumerate(self._c_columns)} @@ -261,7 +264,7 @@ cdef class BaseRecord: self._max_field_size = max_field_size or 0 if self._c_columns is None: - raise ValueError('Either columns or schema should not be provided') + raise ValueError("Either columns or schema should not be provided") self._c_values = [None] * len(self._c_columns) if values is not None: @@ -295,7 +298,7 @@ cdef class 
BaseRecord: self._c_name_indexes = value def _mode(self): - return 'c' + return "c" cdef size_t _get_non_partition_col_count(self): if self._c_schema_snapshot is not None: @@ -347,8 +350,8 @@ cdef class BaseRecord: n_values != self._get_non_partition_col_count() ): raise ValueError( - 'The values set to records are against the schema, ' - 'expect len %s, got len %s' % (len(self._c_columns), n_values) + "The values set to records are against the schema, " + "expect len %s, got len %s" % (len(self._c_columns), n_values) ) if type(values) is list: @@ -372,7 +375,7 @@ cdef class BaseRecord: self._set(key, value) def __getattr__(self, item): - if item == '_name_indexes': + if item == "_name_indexes": return self._c_name_indexes if item in self._c_name_indexes: i = self._c_name_indexes[item] diff --git a/odps/src/utils_c.pxd b/odps/src/utils_c.pxd index ff3cfd99..bbf2bc06 100644 --- a/odps/src/utils_c.pxd +++ b/odps/src/utils_c.pxd @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/odps/src/utils_c.pyx b/odps/src/utils_c.pyx index 2a1ad3e9..7de81d7d 100644 --- a/odps/src/utils_c.pyx +++ b/odps/src/utils_c.pyx @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,26 +13,30 @@ # limitations under the License. cimport cython + import sys import threading import time + from cpython.datetime cimport ( PyDateTime_DateTime, datetime, - datetime_year, - datetime_month, datetime_day, datetime_hour, - datetime_minute, - datetime_second, datetime_microsecond, + datetime_minute, + datetime_month, datetime_new, - timedelta_new, + datetime_second, + datetime_year, import_datetime, + timedelta_new, ) + from datetime import datetime + from libc.stdint cimport int64_t -from libc.time cimport time_t, tm, mktime, localtime, gmtime +from libc.time cimport gmtime, localtime, mktime, time_t, tm try: import zoneinfo @@ -46,6 +50,7 @@ except ImportError: from ..compat import utc from ..config import options + cdef extern from "timegm.c": time_t timegm(tm* t) nogil @@ -75,11 +80,15 @@ except OverflowError: _min_datetime_mills = int( (datetime.min - datetime.utcfromtimestamp(0)).total_seconds() * 1000 ) -_antique_errmsg = 'Date older than 1928/01/01 and may contain errors. ' \ - 'Ignore this error by configuring `options.allow_antique_date` to True.' -_min_datetime_errmsg = 'Date exceed range Python can handle. If you are reading data with tunnel, read '\ - 'the value as None by setting options.tunnel.overflow_date_as_none to True, ' \ - 'or convert the value into strings with SQL before processing them with Python.' +_antique_errmsg = ( + "Date older than 1928/01/01 and may contain errors. " + "Ignore this error by configuring `options.allow_antique_date` to True." +) +_min_datetime_errmsg = ( + "Date exceed range Python can handle. If you are reading data with tunnel, read " + "the value as None by setting options.tunnel.overflow_date_as_none to True, " + "or convert the value into strings with SQL before processing them with Python." 
+) cdef inline bint datetime_hastzinfo(object o): return (o).hastzinfo @@ -90,7 +99,9 @@ cdef class CMillisecondsConverter: def _get_tz(tz): if type(tz) is unicode or type(tz) is bytes: if pytz is None and zoneinfo is None: - raise ImportError('Package `pytz` is needed when specifying string-format time zone.') + raise ImportError( + "Package `pytz` is needed when specifying string-format time zone." + ) else: return zoneinfo.ZoneInfo(tz) if zoneinfo is not None else pytz.timezone(tz) else: @@ -116,7 +127,7 @@ cdef class CMillisecondsConverter: self._is_dst = is_dst self._tz = self._get_tz(self._local_tz) if not self._use_default_tz else None - self._tz_has_localize = hasattr(self._tz, 'localize') + self._tz_has_localize = hasattr(self._tz, "localize") cdef int _build_tm_struct(self, datetime dt, tm *p_tm) except? -1: p_tm.tm_year = datetime_year(dt) - 1900 diff --git a/odps/superset_odps.py b/odps/superset_odps.py index 41829b84..0f405331 100644 --- a/odps/superset_odps.py +++ b/odps/superset_odps.py @@ -1,4 +1,6 @@ +import contextlib import logging +import sys try: from sqlalchemy import Column @@ -7,9 +9,8 @@ try: from superset import sql_parse - from superset.db_engine_specs.base import BaseEngineSpec + from superset.db_engine_specs.base import BaseEngineSpec, TimestampExpression from superset.exceptions import SupersetException - from superset.extensions import cache_manager except ImportError: # import fallback for tests only sql_parse = None @@ -20,9 +21,7 @@ class BaseEngineSpec(object): @classmethod def get_engine(cls, database, schema=None, source=None): - return database.get_sqla_engine_with_context( - schema=schema, source=source - ) + return database.get_sqla_engine_with_context(schema=schema, source=source) @classmethod def get_table_names( # pylint: disable=unused-argument @@ -37,18 +36,10 @@ def get_table_names( # pylint: disable=unused-argument def get_dbapi_mapped_exception(cls, ex): return ex - class CacheManagerCls(object): - def __init__(self): - self.cache = self - - def memoize(self): - return lambda x: x - - cache_manager = CacheManagerCls() - class SupersetException(Exception): pass + try: from superset.constants import TimeGrain except ImportError: @@ -73,7 +64,8 @@ class TimeGrain: logger = logging.getLogger(__name__) -_builtin_funcs = set(""" +_builtin_funcs = set( + """ ABS ACOS ADD_MONTHS ALL_MATCH ANY_MATCH ANY_VALUE ATAN2 APPROX_DISTINCT ARG_MAX ARG_MIN ARRAY ARRAY_CONTAINS ARRAY_DISTINCT ARRAY_EXCEPT ARRAY_INTERSECT ARRAY_JOIN ARRAY_MAX ARRAY_MIN ARRAY_NORMALIZE @@ -113,7 +105,8 @@ class TimeGrain: UNBASE64 UNHEX UNIQUE_ID UNIX_TIMESTAMP URL_DECODE URL_ENCODE UUID VAR_SAMP VARIANCE/VAR_POP WEEKDAY WEEKOFYEAR WIDTH_BUCKET WM_CONCAT YEAR ZIP_WITH -""".strip().split()) +""".strip().split() +) class ODPSEngineSpec(BaseEngineSpec): @@ -123,31 +116,55 @@ class ODPSEngineSpec(BaseEngineSpec): # pylint: disable=line-too-long _time_grain_expressions = { None: "{col}", - TimeGrain.SECOND: "from_unixtime(unix_timestamp({col}), 'yyyy-MM-dd HH:mm:ss')", - TimeGrain.MINUTE: "from_unixtime(unix_timestamp({col}), 'yyyy-MM-dd HH:mm:00')", - TimeGrain.HOUR: "from_unixtime(unix_timestamp({col}), 'yyyy-MM-dd HH:00:00')", - TimeGrain.DAY: "from_unixtime(unix_timestamp({col}), 'yyyy-MM-dd 00:00:00')", - TimeGrain.WEEK: "date_format(date_sub({col}, CAST(7-from_unixtime(unix_timestamp({col}),'u') as int)), 'yyyy-MM-dd 00:00:00')", - TimeGrain.MONTH: "from_unixtime(unix_timestamp({col}), 'yyyy-MM-01 00:00:00')", - TimeGrain.QUARTER: "date_format(add_months(datetrunc({col}, 'MM'), 
-(month({col})-1)%3), 'yyyy-MM-dd 00:00:00')", - TimeGrain.YEAR: "from_unixtime(unix_timestamp({col}), 'yyyy-01-01 00:00:00')", - TimeGrain.WEEK_ENDING_SATURDAY: "date_format(date_add({col}, INT(6-from_unixtime(unix_timestamp({col}), 'u'))), 'yyyy-MM-dd 00:00:00')", - TimeGrain.WEEK_STARTING_SUNDAY: "date_format(date_add({col}, -INT(from_unixtime(unix_timestamp({col}), 'u'))), 'yyyy-MM-dd 00:00:00')", + TimeGrain.SECOND: "datetrunc({col}, 'ss')", + TimeGrain.MINUTE: "datetrunc({col}, 'mi')", + TimeGrain.HOUR: "datetrunc({col}, 'hh')", + TimeGrain.DAY: "datetrunc({col}, 'dd')", + TimeGrain.WEEK: "datetrunc(dateadd({col}, 1 - dayofweek({col}), 'dd'), 'dd')", + TimeGrain.MONTH: "datetrunc({col}, 'month')", + TimeGrain.QUARTER: "datetrunc(dateadd({col}, -3, 'mm'), 'dd')", + TimeGrain.YEAR: "datetrunc({col}, 'yyyy')", + TimeGrain.WEEK_ENDING_SATURDAY: "datetrunc(dateadd({col}, 6 - dayofweek({col}), 'dd'), 'dd')", + TimeGrain.WEEK_STARTING_SUNDAY: "datetrunc(dateadd({col}, 7 - dayofweek({col}), 'dd'), 'dd')", } + _py_format_to_odps_sql_format = [ + ("%Y", "YYYY"), + ("%m", "MM"), + ("%d", "DD"), + ("%H", "HH"), + ("%M", "MI"), + ("%S", "SS"), + ("%%", "%"), + ] @classmethod - def _get_odps_entry(cls, database): + def get_timestamp_expr(cls, col, pdf, time_grain): + time_expr = ( + super(ODPSEngineSpec, cls).get_timestamp_expr(col, pdf, time_grain).key + ) + for pat, sub in cls._py_format_to_odps_sql_format: + pdf = pdf.replace(pat, sub) + return TimestampExpression(time_expr, col, type_=col.type) + + @classmethod + @contextlib.contextmanager + def _get_database_engine(cls, database): en = cls.get_engine(database) - if hasattr(en, "__enter__"): - engine = en.__enter__() - else: - engine = en + try: + if hasattr(en, "__enter__"): + engine = en.__enter__() + else: + engine = en - odps_entry = engine.dialect.get_odps_from_url(engine.url) + yield engine + finally: + if hasattr(en, "__exit__"): + en.__exit__(*sys.exc_info()) - if hasattr(en, "__exit__"): - en.__exit__(None, None, None) - return odps_entry + @classmethod + def _get_odps_entry(cls, database): + with cls._get_database_engine(database) as engine: + return engine.dialect.get_odps_from_url(engine.url) @classmethod def get_catalog_names( # pylint: disable=unused-argument @@ -196,23 +213,29 @@ def latest_sub_partition( # type: ignore @classmethod def get_table_names(cls, database, inspector, schema): logger.info("Start listing tables for schema %s", schema) - tables = super(ODPSEngineSpec, cls).get_table_names( - database, inspector, schema - ) - return set([ - n for n in tables if not n.startswith(TEMP_TABLE_PREFIX) - ]) + tables = super(ODPSEngineSpec, cls).get_table_names(database, inspector, schema) + return set([n for n in tables if not n.startswith(TEMP_TABLE_PREFIX)]) @classmethod - @cache_manager.cache.memoize() def get_function_names(cls, database): + with cls._get_database_engine(database) as engine: + cached = engine.dialect.get_list_cache(engine.url, ("functions",)) + if cached is not None: + return cached + odps_entry = cls._get_odps_entry(database) - funcs = set([ - func.name for func in odps_entry.list_functions() - if not func.name.startswith("pyodps_") - ]) - funcs = funcs | _builtin_funcs - return sorted(funcs) + funcs = set( + [ + func.name + for func in odps_entry.list_functions() + if not func.name.startswith("pyodps_") + ] + ) + funcs = sorted(funcs | _builtin_funcs) + + with cls._get_database_engine(database) as engine: + engine.dialect.put_list_cache(engine.url, ("functions",), funcs) + return funcs @classmethod def 
execute(cls, cursor, query, **kwargs): @@ -226,12 +249,20 @@ def execute(cls, cursor, query, **kwargs): hints = { "odps.sql.jobconf.odps2": "true", } - if not getattr(cursor.connection, "_project_as_schema", True): + conn_project_as_schema = getattr( + cursor.connection, "_project_as_schema", None + ) + conn_project_as_schema = ( + True if conn_project_as_schema is None else conn_project_as_schema + ) + if not conn_project_as_schema: # sqlalchemy cursor need odps schema support - hints.update({ - "odps.sql.allow.namespace.schema": "true", - "odps.namespace.schema": "true", - }) + hints.update( + { + "odps.sql.allow.namespace.schema": "true", + "odps.namespace.schema": "true", + } + ) cursor.execute(query, hints=hints) except Exception as ex: six.raise_from(cls.get_dbapi_mapped_exception(ex), ex) diff --git a/odps/tempobj.py b/odps/tempobj.py index 98c0f883..1b6e7648 100644 --- a/odps/tempobj.py +++ b/odps/tempobj.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,7 +19,6 @@ import glob import hashlib import json -import logging import os import platform import stat @@ -30,14 +29,14 @@ import time import uuid +from . import utils from .accounts import AliyunAccount -from .compat import pickle, six, builtins, futures +from .compat import builtins, futures, pickle, six from .config import options from .errors import NoSuchObject -from . import utils -TEMP_ROOT = utils.build_pyodps_dir('tempobjs') -SESSION_KEY = '%d_%s' % (int(time.time()), uuid.uuid4()) +TEMP_ROOT = utils.build_pyodps_dir("tempobjs") +SESSION_KEY = "%d_%s" % (int(time.time()), uuid.uuid4()) CLEANER_THREADS = 100 USER_FILE_RIGHTS = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR @@ -96,7 +95,7 @@ def __init__(self, **kwargs): self.os = os self.sys = sys self._g_env = copy.copy(globals()) - self.is_windows = 'windows' in platform.platform().lower() + self.is_windows = "windows" in platform.platform().lower() self.pid = os.getpid() self.os_sep = os.sep self.executable = sys.executable @@ -106,16 +105,18 @@ def __init__(self, **kwargs): package_root = os.path.dirname(__file__) if package_root not in import_paths: import_paths.append(package_root) - self.import_path_json = utils.to_text(json.dumps(import_paths, ensure_ascii=False)) + self.import_path_json = utils.to_text( + json.dumps(import_paths, ensure_ascii=False) + ) self.builtins = builtins - self.io = __import__('io', fromlist=['']) + self.io = __import__("io", fromlist=[""]) if six.PY3: - self.conv_bytes = (lambda s: s.encode() if isinstance(s, str) else s) - self.conv_unicode = (lambda s: s if isinstance(s, str) else s.decode()) + self.conv_bytes = lambda s: s.encode() if isinstance(s, str) else s + self.conv_unicode = lambda s: s if isinstance(s, str) else s.decode() else: - self.conv_bytes = (lambda s: s.encode() if isinstance(s, unicode) else s) - self.conv_unicode = (lambda s: s if isinstance(s, unicode) else s.decode()) + self.conv_bytes = lambda s: s.encode() if isinstance(s, unicode) else s + self.conv_unicode = lambda s: s if isinstance(s, unicode) else s.decode() self.subprocess = subprocess self.temp_dir = tempfile.gettempdir() self.template = CLEANUP_SCRIPT_TMPL @@ -127,7 +128,7 @@ def __init__(self, **kwargs): class TempObject(object): __slots__ = () - _type = '' + _type = "" _priority = 0 def __init__(self, 
*args, **kwargs): @@ -154,7 +155,9 @@ def __ne__(self, other): return not self.__eq__(other) def __getstate__(self): - return {slot: getattr(self, slot) for slot in self.__slots__ if hasattr(self, slot)} + return { + slot: getattr(self, slot) for slot in self.__slots__ if hasattr(self, slot) + } def __setstate__(self, state): for slot, value in state.items(): @@ -162,52 +165,62 @@ def __setstate__(self, state): class TempTable(TempObject): - __slots__ = 'table', 'project', 'schema' - _type = 'Table' + __slots__ = "table", "project", "schema" + _type = "Table" def drop(self, odps): odps.delete_table( - self.table, if_exists=True, project=self.project, schema=self.schema, async_=True + self.table, + if_exists=True, + project=self.project, + schema=self.schema, + async_=True, ) class TempModel(TempObject): - __slots__ = 'model', 'project', 'schema' - _type = 'OfflineModel' + __slots__ = "model", "project", "schema" + _type = "OfflineModel" def drop(self, odps): try: - odps.delete_offline_model(self.model, project=self.project, schema=self.schema) + odps.delete_offline_model( + self.model, project=self.project, schema=self.schema + ) except NoSuchObject: pass class TempFunction(TempObject): - __slots__ = 'function', 'project', 'schema' - _type = 'Function' + __slots__ = "function", "project", "schema" + _type = "Function" _priority = 1 def drop(self, odps): try: - odps.delete_function(self.function, project=self.project, schema=self.schema) + odps.delete_function( + self.function, project=self.project, schema=self.schema + ) except NoSuchObject: pass class TempResource(TempObject): - __slots__ = 'resource', 'project', 'schema' - _type = 'Resource' + __slots__ = "resource", "project", "schema" + _type = "Resource" def drop(self, odps): try: - odps.delete_resource(self.resource, project=self.project, schema=self.schema) + odps.delete_resource( + self.resource, project=self.project, schema=self.schema + ) except NoSuchObject: pass class TempVolumePartition(TempObject): - __slots__ = 'volume', 'partition', 'project', 'schema' - _type = 'VolumePartition' + __slots__ = "volume", "partition", "project", "schema" + _type = "VolumePartition" def drop(self, odps): try: @@ -245,7 +258,11 @@ def _cleaner(obj): pool = futures.ThreadPoolExecutor(CLEANER_THREADS) list(pool.map(_cleaner, reversed(list(self._container)))) else: - for o in sorted(list(self._container), key=lambda ro: type(ro)._priority, reverse=True): + for o in sorted( + list(self._container), + key=lambda ro: type(ro)._priority, + reverse=True, + ): _cleaner(o) for obj in cleaned: if obj in self._container: @@ -262,7 +279,7 @@ def dump(self): if self._file_name is None: return try: - with open(self._file_name, 'wb') as outf: + with open(self._file_name, "wb") as outf: pickle.dump(list(self._container), outf, protocol=0) outf.close() except OSError: @@ -271,7 +288,7 @@ def dump(self): def load(self): try: - with open(self._file_name, 'rb') as inpf: + with open(self._file_name, "rb") as inpf: contents = pickle.load(inpf) self._container.update(contents) except (EOFError, OSError): @@ -279,7 +296,7 @@ def load(self): class ObjectRepositoryLib(dict): - biz_ids = set([options.biz_id, ]) if options.biz_id else set(['default', ]) + biz_ids = set([options.biz_id]) if options.biz_id else set(["default"]) odps_info = dict() biz_ids_json = json.dumps(list(biz_ids)) @@ -322,18 +339,26 @@ def _exec_cleanup_script(self): return env.cleaned = True - script = env.template.format(import_paths=env.import_path_json, odps_info=self.odps_info_json, - 
host_pid=env.pid, biz_ids=self.biz_ids_json) + script = env.template.format( + import_paths=env.import_path_json, + odps_info=self.odps_info_json, + host_pid=env.pid, + biz_ids=self.biz_ids_json, + ) - script_name = env.temp_dir + env.os_sep + 'tmp_' + str(env.pid) + '_cleanup_script.py' - script_file = env.io.FileIO(script_name, 'w') + script_name = ( + env.temp_dir + env.os_sep + "tmp_" + str(env.pid) + "_cleanup_script.py" + ) + script_file = env.io.FileIO(script_name, "w") script_file.write(env.conv_bytes(script)) script_file.close() try: if env.is_windows: env.os.chmod(script_name, env.file_right) else: - env.subprocess.call(['chmod', oct(env.file_right).replace('o', ''), script_name]) + env.subprocess.call( + ["chmod", oct(env.file_right).replace("o", ""), script_name] + ) except: pass @@ -341,18 +366,25 @@ def _exec_cleanup_script(self): if env.is_windows: si = subprocess.STARTUPINFO() si.dwFlags |= subprocess.STARTF_USESHOWWINDOW - kwargs['startupinfo'] = si + kwargs["startupinfo"] = si env.subprocess.call([env.executable, script_name], **kwargs) _cleaned_keys = set() -_obj_repos = ObjectRepositoryLib() # this line should be put last due to initialization dependency +_obj_repos = ( + ObjectRepositoryLib() +) # this line should be put last due to initialization dependency atexit.register(_obj_repos._exec_cleanup_script) def _is_pid_running(pid): - if 'windows' in platform.platform().lower(): - task_lines = os.popen('TASKLIST /FI "PID eq {0}" /NH'.format(pid)).read().strip().splitlines() + if "windows" in platform.platform().lower(): + task_lines = ( + os.popen('TASKLIST /FI "PID eq {0}" /NH'.format(pid)) + .read() + .strip() + .splitlines() + ) if not task_lines: return False return str(pid) in set(task_lines[0].split()) @@ -372,7 +404,7 @@ def clean_objects(odps, biz_ids=None, use_threads=False): files = [] biz_ids = biz_ids or _obj_repos.biz_ids for biz_id in biz_ids: - files.extend(glob.glob(os.path.join(TEMP_ROOT, biz_id, odps_key, '*.his'))) + files.extend(glob.glob(os.path.join(TEMP_ROOT, biz_id, odps_key, "*.his"))) for fn in files: repo = ObjectRepository(fn) @@ -392,11 +424,11 @@ def clean_stored_objects(odps): files = [] for biz_id in _obj_repos.biz_ids: - files.extend(glob.glob(os.path.join(TEMP_ROOT, biz_id, odps_key, '*.his'))) + files.extend(glob.glob(os.path.join(TEMP_ROOT, biz_id, odps_key, "*.his"))) def clean_thread(): for fn in files: - writer_pid = int(fn.rsplit('__', 1)[-1].split('.', 1)[0]) + writer_pid = int(fn.rsplit("__", 1)[-1].split(".", 1)[0]) # we do not clean running process, unless its pid equals host_pid if writer_pid != host_pid and _is_pid_running(writer_pid): @@ -416,13 +448,13 @@ def clean_thread(): def _gen_repository_key(odps): - if getattr(odps.account, 'access_id', None): + if getattr(odps.account, "access_id", None): keys = [odps.account.access_id, odps.endpoint, str(odps.project)] - elif getattr(odps.account, 'token', None): + elif getattr(odps.account, "token", None): keys = [utils.to_str(odps.account.token), odps.endpoint, str(odps.project)] else: return - return hashlib.md5(utils.to_binary('####'.join(keys))).hexdigest() + return hashlib.md5(utils.to_binary("####".join(keys))).hexdigest() def _put_objects(odps, objs): @@ -430,7 +462,7 @@ def _put_objects(odps, objs): if odps_key is None: return - biz_id = options.biz_id if options.biz_id else 'default' + biz_id = options.biz_id if options.biz_id else "default" ObjectRepositoryLib.add_biz_id(biz_id) if odps_key not in _obj_repos: if isinstance(odps.account, AliyunAccount): @@ -441,7 +473,9 
@@ def _put_objects(odps, objs): os.makedirs(file_dir) except OSError: pass - file_name = os.path.join(file_dir, 'temp_objs_{0}__{1}.his'.format(SESSION_KEY, os.getpid())) + file_name = os.path.join( + file_dir, "temp_objs_{0}__{1}.his".format(SESSION_KEY, os.getpid()) + ) _obj_repos[odps_key] = ObjectRepository(file_name) [_obj_repos[odps_key].put(o, False) for o in objs] _obj_repos[odps_key].dump() @@ -449,40 +483,67 @@ def _put_objects(odps, objs): def register_temp_table(odps, table, project=None, schema=None): if isinstance(table, six.string_types): - table = [table, ] - _put_objects(odps, [ - TempTable(t, project or odps.project, schema=schema or odps.schema) for t in table - ]) + table = [table] + _put_objects( + odps, + [ + TempTable(t, project or odps.project, schema=schema or odps.schema) + for t in table + ], + ) def register_temp_model(odps, model, project=None, schema=None): if isinstance(model, six.string_types): - model = [model, ] - _put_objects(odps, [ - TempModel(m, project or odps.project, schema=schema or odps.schema) for m in model - ]) + model = [model] + _put_objects( + odps, + [ + TempModel(m, project or odps.project, schema=schema or odps.schema) + for m in model + ], + ) def register_temp_resource(odps, resource, project=None, schema=None): if isinstance(resource, six.string_types): - resource = [resource, ] - _put_objects(odps, [ - TempResource(r, project if project else odps.project, schema=schema or odps.schema) for r in resource - ]) + resource = [resource] + _put_objects( + odps, + [ + TempResource( + r, project if project else odps.project, schema=schema or odps.schema + ) + for r in resource + ], + ) def register_temp_function(odps, func, project=None, schema=None): if isinstance(func, six.string_types): - func = [func, ] - _put_objects(odps, [ - TempFunction(f, project if project else odps.project, schema=schema or odps.schema) for f in func - ]) + func = [func] + _put_objects( + odps, + [ + TempFunction( + f, project if project else odps.project, schema=schema or odps.schema + ) + for f in func + ], + ) -def register_temp_volume_partition(odps, volume_partition_tuple, project=None, schema=None): +def register_temp_volume_partition( + odps, volume_partition_tuple, project=None, schema=None +): if isinstance(volume_partition_tuple, tuple): - volume_partition_tuple = [volume_partition_tuple, ] - _put_objects(odps, [ - TempVolumePartition(v, p, project if project else odps.project, schema=schema or odps.schema) - for v, p in volume_partition_tuple - ]) + volume_partition_tuple = [volume_partition_tuple] + _put_objects( + odps, + [ + TempVolumePartition( + v, p, project if project else odps.project, schema=schema or odps.schema + ) + for v, p in volume_partition_tuple + ], + ) diff --git a/odps/tests/core.py b/odps/tests/core.py index e6f2d074..f96652ae 100644 --- a/odps/tests/core.py +++ b/odps/tests/core.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -31,35 +31,28 @@ _raw_flaky = None from .. 
import compat, errors, options, utils -from ..compat import six, ConfigParser +from ..compat import ConfigParser, six -LOCK_FILE_NAME = os.path.join(tempfile.gettempdir(), 'pyodps_test_lock_') +LOCK_FILE_NAME = os.path.join(tempfile.gettempdir(), "pyodps_test_lock_") LOGGING_CONFIG = { - 'version': 1, + "version": 1, "filters": { - "odps": { - "name": "odps" - }, + "odps": {"name": "odps"}, }, "formatters": { - "msgonly": { - "format": "%(message)s" - }, + "msgonly": {"format": "%(message)s"}, }, "handlers": { "console": { "class": "logging.StreamHandler", - "level": 'INFO', + "level": "INFO", "formatter": "msgonly", - "filters": ["odps",], + "filters": ["odps"], }, }, - "root": { - "level": "NOTSET", - "handlers": ["console"] - }, - "disable_existing_loggers": False + "root": {"level": "NOTSET", "handlers": ["console"]}, + "disable_existing_loggers": False, } @@ -113,8 +106,12 @@ def _load_config_odps(config, section_name, overwrite_global=True): attr_name = section_name odps_entry = ODPS( - access_id, secret_access_key, project, endpoint, - schema=schema, tunnel_endpoint=tunnel_endpoint, + access_id, + secret_access_key, + project, + endpoint, + schema=schema, + tunnel_endpoint=tunnel_endpoint, seahawks_url=seahawks_url, overwrite_global=overwrite_global, ) @@ -129,10 +126,10 @@ def get_config(): if not Config.config: config = ConfigParser.ConfigParser() Config.config = config - config_path = os.path.join(os.path.dirname(__file__), 'test.conf') + config_path = os.path.join(os.path.dirname(__file__), "test.conf") if not os.path.exists(config_path): raise OSError( - 'Please configure test.conf (you can rename test.conf.template)' + "Please configure test.conf (you can rename test.conf.template)" ) config.read(config_path) @@ -140,6 +137,7 @@ def get_config(): _load_config_odps(config, "odps_with_storage_tier", overwrite_global=False) _load_config_odps(config, "odps_with_schema", overwrite_global=False) _load_config_odps(config, "odps_with_tunnel_quota", overwrite_global=False) + _load_config_odps(config, "odps_with_long_string", overwrite_global=False) # make sure main config overrides other configs _load_config_odps(config, "odps") config.tunnel = TableTunnel(config.odps, endpoint=config.odps._tunnel_endpoint) @@ -158,7 +156,10 @@ def get_config(): oss_endpoint = config.get("oss", "endpoint") config.oss_config = ( - oss_access_id, oss_secret_access_key, oss_bucket_name, oss_endpoint + oss_access_id, + oss_secret_access_key, + oss_bucket_name, + oss_endpoint, ) import oss2 @@ -168,8 +169,8 @@ def get_config(): except (ConfigParser.NoSectionError, ConfigParser.NoOptionError, ImportError): pass - logging_level = config.get('test', 'logging_level') - LOGGING_CONFIG['handlers']['console']['level'] = logging_level + logging_level = config.get("test", "logging_level") + LOGGING_CONFIG["handlers"]["console"]["level"] = logging_level else: config = Config.config @@ -182,10 +183,10 @@ def get_config(): def tn(s, limit=128): - if os.environ.get('TEST_NAME_SUFFIX') is not None: - suffix = '_' + os.environ.get('TEST_NAME_SUFFIX').lower() + if os.environ.get("TEST_NAME_SUFFIX") is not None: + suffix = "_" + os.environ.get("TEST_NAME_SUFFIX").lower() if len(s) + len(suffix) > limit: - s = s[:limit - len(suffix)] + s = s[: limit - len(suffix)] table_name = s + suffix with _test_tables_lock: _test_tables_to_drop.add(table_name) @@ -207,15 +208,16 @@ def drop_test_tables(odps): def in_coverage_mode(): - return 'COVERAGE_FILE' in os.environ or 'unittest' in sys.argv[0] + return "COVERAGE_FILE" in os.environ 
or "unittest" in sys.argv[0] def start_coverage(): if not in_coverage_mode(): return - os.environ['COVERAGE_PROCESS_START'] = '' + os.environ["COVERAGE_PROCESS_START"] = "" try: import coverage + coverage.process_startup() except ImportError: pass @@ -239,6 +241,7 @@ def ident(x): def ignore_case(case, reason): if isinstance(case, types.FunctionType) and not case.__name__.startswith("test"): + @six.wraps(case) def wrapped(*args, **kwargs): pytest.skip(reason) @@ -251,15 +254,15 @@ def wrapped(*args, **kwargs): def ci_skip_case(obj): - if 'CI_MODE' in os.environ: - return ignore_case(obj, 'Intentionally skipped in CI mode.') + if "CI_MODE" in os.environ: + return ignore_case(obj, "Intentionally skipped in CI mode.") else: return obj def module_depend_case(mod_names): if isinstance(mod_names, six.string_types): - mod_names = [mod_names, ] + mod_names = [mod_names] def _decorator(obj): for mod_name in mod_names: @@ -267,29 +270,31 @@ def _decorator(obj): if sys.version_info[0] == 2 and mod_name in sys.modules: continue try: - __import__(mod_name, fromlist=['']) + __import__(mod_name, fromlist=[""]) except ImportError: - return ignore_case(obj, 'Skipped due to absence of %s.' % mod_name) + return ignore_case(obj, "Skipped due to absence of %s." % mod_name) return obj + return _decorator -numpy_case = module_depend_case('numpy') -pandas_case = module_depend_case('pandas') -pyarrow_case = module_depend_case('pyarrow') -sqlalchemy_case = module_depend_case('sqlalchemy') +numpy_case = module_depend_case("numpy") +pandas_case = module_depend_case("pandas") +pyarrow_case = module_depend_case("pyarrow") +sqlalchemy_case = module_depend_case("sqlalchemy") def odps2_typed_case(func): @six.wraps(func) def _wrapped(*args, **kwargs): from odps import options + options.sql.use_odps2_extension = True old_settings = options.sql.settings options.sql.settings = old_settings or {} - options.sql.settings.update({'odps.sql.hive.compatible': True}) - options.sql.settings.update({'odps.sql.decimal.odps2': True}) + options.sql.settings.update({"odps.sql.hive.compatible": True}) + options.sql.settings.update({"odps.sql.decimal.odps2": True}) try: func(*args, **kwargs) finally: @@ -300,18 +305,24 @@ def _wrapped(*args, **kwargs): def global_locked(lock_key): - def _decorator(func): if callable(lock_key): - file_name = LOCK_FILE_NAME + '_' + func.__module__.replace('.', '__') + '__' + func.__name__ + '.lck' + file_name = ( + LOCK_FILE_NAME + + "_" + + func.__module__.replace(".", "__") + + "__" + + func.__name__ + + ".lck" + ) else: - file_name = LOCK_FILE_NAME + '_' + lock_key + '.lck' + file_name = LOCK_FILE_NAME + "_" + lock_key + ".lck" @six.wraps(func) def _decorated(*args, **kwargs): while os.path.exists(file_name): time.sleep(0.5) - open(file_name, 'w').close() + open(file_name, "w").close() try: return func(*args, **kwargs) finally: @@ -342,14 +353,12 @@ def wait_filled(container_fun, countdown=10): time.sleep(1) countdown -= 1 if countdown <= 0: - raise SystemError('Waiting for container content time out.') + raise SystemError("Waiting for container content time out.") def run_sub_tests_in_parallel(n_parallel, sub_tests): test_pool = compat.futures.ThreadPoolExecutor(n_parallel) - futures = [ - test_pool.submit(sub_test) for idx, sub_test in enumerate(sub_tests) - ] + futures = [test_pool.submit(sub_test) for idx, sub_test in enumerate(sub_tests)] try: first_exc = None for fut in futures: @@ -387,11 +396,12 @@ def force_drop_schema(schema): def get_result(res): from odps.df.backends.frame import ResultFrame + 
if isinstance(res, ResultFrame): res = res.values try: - import pandas as pd import numpy as np + import pandas as pd except (ImportError, ValueError): np = pd = None @@ -421,6 +431,7 @@ def conv(t): def get_code_mode(): from odps import crc as _crc + if hasattr(_crc.Crc32c, "_method"): return _crc.Crc32c._method else: @@ -435,7 +446,8 @@ def py_and_c(modules=None, reloader=None): modules.append("odps.crc") try: - import cython + import cython # noqa: F401 + has_cython = True except ImportError: has_cython = False @@ -445,10 +457,10 @@ def mod_reloader(request): if impl == "c" and not has_cython: pytest.skip("Must install cython to run this test.") - old_config = getattr(options, 'force_{0}'.format(impl)) - setattr(options, 'force_{0}'.format(impl), True) + old_config = getattr(options, "force_{0}".format(impl)) + setattr(options, "force_{0}".format(impl), True) - for mod_name in (modules or []): + for mod_name in modules or []: mod = importlib.import_module(mod_name) compat.reload_module(mod) @@ -460,12 +472,12 @@ def mod_reloader(request): try: yield finally: - setattr(options, 'force_{0}'.format(impl), old_config) + setattr(options, "force_{0}".format(impl), old_config) mod_reloader.__name__ = fixture_name def wrap_fun(fun): - func_mod = __import__(fun.__module__, fromlist=['']) + func_mod = __import__(fun.__module__, fromlist=[""]) if not hasattr(func_mod, fixture_name): setattr(func_mod, fixture_name, mod_reloader) diff --git a/odps/tests/dictconfig.py b/odps/tests/dictconfig.py index 6be259ff..8603f006 100644 --- a/odps/tests/dictconfig.py +++ b/odps/tests/dictconfig.py @@ -2,7 +2,7 @@ # reproduced with permission. It is provided here for backwards # compatibility for Python versions prior to 2.7. # -# Copyright 2009-2010 by Vinay Sajip. All Rights Reserved. +# Copyright 2009-2024 by Vinay Sajip. All Rights Reserved. # # Permission to use, copy, modify, and distribute this software and its # documentation for any purpose and without fee is hereby granted, @@ -25,32 +25,35 @@ from ..compat import six -IDENTIFIER = re.compile('^[a-z_][a-z0-9_]*$', re.I) +IDENTIFIER = re.compile("^[a-z_][a-z0-9_]*$", re.I) + def valid_ident(s): m = IDENTIFIER.match(s) if not m: - raise ValueError('Not a valid Python identifier: %r' % s) + raise ValueError("Not a valid Python identifier: %r" % s) return True + # # This function is defined in logging only in recent versions of Python # try: from logging import _checkLevel except ImportError: + def _checkLevel(level): if isinstance(level, int): rv = level elif str(level) == level: if level not in logging._levelNames: - raise ValueError('Unknown level: %r' % level) + raise ValueError("Unknown level: %r" % level) rv = logging._levelNames[level] else: - raise TypeError('Level not an integer or a ' - 'valid string: %r' % level) + raise TypeError("Level not an integer or a " "valid string: %r" % level) return rv + # The ConvertingXXX classes are wrappers around standard Python containers, # and they serve to convert any suitable values in the container. The # conversion converts base dicts, lists and tuples to their wrapped @@ -60,17 +63,17 @@ def _checkLevel(level): # Each wrapper should have a configurator attribute holding the actual # configurator to use for conversion. 
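# Illustrative sketch, not part of the patch itself: what the converting wrappers
# below make possible, assuming the constructor of DictConfigurator wraps the config
# dict as in the upstream backport. String values with an "ext://" prefix are
# resolved to Python objects when a handler is built, so the config can point at
# sys.stdout directly.
import logging

from odps.tests.dictconfig import DictConfigurator

DictConfigurator(
    {
        "version": 1,
        "formatters": {"plain": {"format": "%(levelname)s %(message)s"}},
        "handlers": {
            "console": {
                "class": "logging.StreamHandler",
                "formatter": "plain",
                "level": "INFO",
                "stream": "ext://sys.stdout",
            },
        },
        "root": {"level": "INFO", "handlers": ["console"]},
    }
).configure()

logging.getLogger(__name__).info("configured via DictConfigurator")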
+ class ConvertingDict(dict): """A converting dictionary wrapper.""" def __getitem__(self, key): value = dict.__getitem__(self, key) result = self.configurator.convert(value) - #If the converted value is different, save for next time + # If the converted value is different, save for next time if value is not result: self[key] = result - if type(result) in (ConvertingDict, ConvertingList, - ConvertingTuple): + if type(result) in (ConvertingDict, ConvertingList, ConvertingTuple): result.parent = self result.key = key return result @@ -78,11 +81,10 @@ def __getitem__(self, key): def get(self, key, default=None): value = dict.get(self, key, default) result = self.configurator.convert(value) - #If the converted value is different, save for next time + # If the converted value is different, save for next time if value is not result: self[key] = result - if type(result) in (ConvertingDict, ConvertingList, - ConvertingTuple): + if type(result) in (ConvertingDict, ConvertingList, ConvertingTuple): result.parent = self result.key = key return result @@ -91,22 +93,22 @@ def pop(self, key, default=None): value = dict.pop(self, key, default) result = self.configurator.convert(value) if value is not result: - if type(result) in (ConvertingDict, ConvertingList, - ConvertingTuple): + if type(result) in (ConvertingDict, ConvertingList, ConvertingTuple): result.parent = self result.key = key return result + class ConvertingList(list): """A converting list wrapper.""" + def __getitem__(self, key): value = list.__getitem__(self, key) result = self.configurator.convert(value) - #If the converted value is different, save for next time + # If the converted value is different, save for next time if value is not result: self[key] = result - if type(result) in (ConvertingDict, ConvertingList, - ConvertingTuple): + if type(result) in (ConvertingDict, ConvertingList, ConvertingTuple): result.parent = self result.key = key return result @@ -115,38 +117,39 @@ def pop(self, idx=-1): value = list.pop(self, idx) result = self.configurator.convert(value) if value is not result: - if type(result) in (ConvertingDict, ConvertingList, - ConvertingTuple): + if type(result) in (ConvertingDict, ConvertingList, ConvertingTuple): result.parent = self return result + class ConvertingTuple(tuple): """A converting tuple wrapper.""" + def __getitem__(self, key): value = tuple.__getitem__(self, key) result = self.configurator.convert(value) if value is not result: - if type(result) in (ConvertingDict, ConvertingList, - ConvertingTuple): + if type(result) in (ConvertingDict, ConvertingList, ConvertingTuple): result.parent = self result.key = key return result + class BaseConfigurator(object): """ The configurator base class which defines some useful defaults. """ - CONVERT_PATTERN = re.compile(r'^(?P[a-z]+)://(?P.*)$') + CONVERT_PATTERN = re.compile(r"^(?P[a-z]+)://(?P.*)$") - WORD_PATTERN = re.compile(r'^\s*(\w+)\s*') - DOT_PATTERN = re.compile(r'^\.\s*(\w+)\s*') - INDEX_PATTERN = re.compile(r'^\[\s*(\w+)\s*\]\s*') - DIGIT_PATTERN = re.compile(r'^\d+$') + WORD_PATTERN = re.compile(r"^\s*(\w+)\s*") + DOT_PATTERN = re.compile(r"^\.\s*(\w+)\s*") + INDEX_PATTERN = re.compile(r"^\[\s*(\w+)\s*\]\s*") + DIGIT_PATTERN = re.compile(r"^\d+$") value_converters = { - 'ext' : 'ext_convert', - 'cfg' : 'cfg_convert', + "ext": "ext_convert", + "cfg": "cfg_convert", } # We might want to use a different one, e.g. importlib @@ -161,12 +164,12 @@ def resolve(self, s): Resolve strings to objects using standard import and attribute syntax. 
""" - name = s.split('.') + name = s.split(".") used = name.pop(0) try: found = self.importer(used) for frag in name: - used += '.' + frag + used += "." + frag try: found = getattr(found, frag) except AttributeError: @@ -175,7 +178,7 @@ def resolve(self, s): return found except ImportError: e, tb = sys.exc_info()[1:] - v = ValueError('Cannot resolve %r: %s' % (s, e)) + v = ValueError("Cannot resolve %r: %s" % (s, e)) v.__cause__, v.__traceback__ = e, tb raise v @@ -190,9 +193,9 @@ def cfg_convert(self, value): if m is None: raise ValueError("Unable to convert %r" % value) else: - rest = rest[m.end():] + rest = rest[m.end() :] d = self.config[m.groups()[0]] - #print d, rest + # print d, rest while rest: m = self.DOT_PATTERN.match(rest) if m: @@ -205,16 +208,15 @@ def cfg_convert(self, value): d = d[idx] else: try: - n = int(idx) # try as number first (most likely) + n = int(idx) # try as number first (most likely) d = d[n] except TypeError: d = d[idx] if m: - rest = rest[m.end():] + rest = rest[m.end() :] else: - raise ValueError('Unable to convert ' - '%r at %r' % (value, rest)) - #rest should be empty + raise ValueError("Unable to convert " "%r at %r" % (value, rest)) + # rest should be empty return d def convert(self, value): @@ -229,28 +231,31 @@ def convert(self, value): elif not isinstance(value, ConvertingList) and isinstance(value, list): value = ConvertingList(value) value.configurator = self - elif not isinstance(value, ConvertingTuple) and\ - isinstance(value, tuple): + elif not isinstance(value, ConvertingTuple) and isinstance(value, tuple): value = ConvertingTuple(value) value.configurator = self - elif isinstance(value, six.string_types): # str for py3k + elif isinstance(value, six.string_types): # str for py3k m = self.CONVERT_PATTERN.match(value) if m: d = m.groupdict() - prefix = d['prefix'] + prefix = d["prefix"] converter = self.value_converters.get(prefix, None) if converter: - suffix = d['suffix'] + suffix = d["suffix"] converter = getattr(self, converter) value = converter(suffix) return value def configure_custom(self, config): """Configure an object with a user-supplied factory.""" - c = config.pop('()') - if not hasattr(c, '__call__') and hasattr(types, 'ClassType') and type(c) != types.ClassType: + c = config.pop("()") + if ( + not hasattr(c, "__call__") + and hasattr(types, "ClassType") + and type(c) != types.ClassType + ): c = self.resolve(c) - props = config.pop('.', None) + props = config.pop(".", None) # Check for valid identifiers kwargs = dict([(k, config[k]) for k in config if valid_ident(k)]) result = c(**kwargs) @@ -265,6 +270,7 @@ def as_tuple(self, value): value = tuple(value) return value + class DictConfigurator(BaseConfigurator): """ Configure logging using a dictionary-like object to describe the @@ -275,128 +281,130 @@ def configure(self): """Do the configuration.""" config = self.config - if 'version' not in config: + if "version" not in config: raise ValueError("dictionary doesn't specify a version") - if config['version'] != 1: - raise ValueError("Unsupported version: %s" % config['version']) - incremental = config.pop('incremental', False) + if config["version"] != 1: + raise ValueError("Unsupported version: %s" % config["version"]) + incremental = config.pop("incremental", False) EMPTY_DICT = {} logging._acquireLock() try: if incremental: - handlers = config.get('handlers', EMPTY_DICT) + handlers = config.get("handlers", EMPTY_DICT) # incremental handler config only if handler name # ties in to logging._handlers (Python 2.7) if 
sys.version_info[:2] == (2, 7): for name in handlers: if name not in logging._handlers: - raise ValueError('No handler found with ' - 'name %r' % name) + raise ValueError("No handler found with " "name %r" % name) else: try: handler = logging._handlers[name] handler_config = handlers[name] - level = handler_config.get('level', None) + level = handler_config.get("level", None) if level: handler.setLevel(_checkLevel(level)) except Exception as e: - raise ValueError('Unable to configure handler ' - '%r: %s' % (name, e)) - loggers = config.get('loggers', EMPTY_DICT) + raise ValueError( + "Unable to configure handler " "%r: %s" % (name, e) + ) + loggers = config.get("loggers", EMPTY_DICT) for name in loggers: try: self.configure_logger(name, loggers[name], True) except Exception as e: - raise ValueError('Unable to configure logger ' - '%r: %s' % (name, e)) - root = config.get('root', None) + raise ValueError( + "Unable to configure logger " "%r: %s" % (name, e) + ) + root = config.get("root", None) if root: try: self.configure_root(root, True) except Exception as e: - raise ValueError('Unable to configure root ' - 'logger: %s' % e) + raise ValueError("Unable to configure root " "logger: %s" % e) else: - disable_existing = config.pop('disable_existing_loggers', True) + disable_existing = config.pop("disable_existing_loggers", True) logging._handlers.clear() del logging._handlerList[:] # Do formatters first - they don't refer to anything else - formatters = config.get('formatters', EMPTY_DICT) + formatters = config.get("formatters", EMPTY_DICT) for name in formatters: try: - formatters[name] = self.configure_formatter( - formatters[name]) + formatters[name] = self.configure_formatter(formatters[name]) except Exception as e: - raise ValueError('Unable to configure ' - 'formatter %r: %s' % (name, e)) + raise ValueError( + "Unable to configure " "formatter %r: %s" % (name, e) + ) # Next, do filters - they don't refer to anything else, either - filters = config.get('filters', EMPTY_DICT) + filters = config.get("filters", EMPTY_DICT) for name in filters: try: filters[name] = self.configure_filter(filters[name]) except Exception as e: - raise ValueError('Unable to configure ' - 'filter %r: %s' % (name, e)) + raise ValueError( + "Unable to configure " "filter %r: %s" % (name, e) + ) # Next, do handlers - they refer to formatters and filters # As handlers can refer to other handlers, sort the keys # to allow a deterministic order of configuration - handlers = config.get('handlers', EMPTY_DICT) + handlers = config.get("handlers", EMPTY_DICT) for name in sorted(handlers): try: handler = self.configure_handler(handlers[name]) handler.name = name handlers[name] = handler except Exception as e: - raise ValueError('Unable to configure handler ' - '%r: %s' % (name, e)) + raise ValueError( + "Unable to configure handler " "%r: %s" % (name, e) + ) # Next, do loggers - they refer to handlers and filters - #we don't want to lose the existing loggers, - #since other threads may have pointers to them. - #existing is set to contain all existing loggers, - #and as we go through the new configuration we - #remove any which are configured. At the end, - #what's left in existing is the set of loggers - #which were in the previous configuration but - #which are not in the new configuration. + # we don't want to lose the existing loggers, + # since other threads may have pointers to them. 
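# Illustrative sketch, not part of the patch itself: the incremental branch handled
# above only adjusts levels of already-registered handlers and loggers instead of
# rebuilding them. The stdlib equivalent of this backport behaves the same way.
import logging.config

logging.config.dictConfig(
    {
        "version": 1,
        "incremental": True,
        "loggers": {"odps": {"level": "DEBUG"}},
    }
)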
+ # existing is set to contain all existing loggers, + # and as we go through the new configuration we + # remove any which are configured. At the end, + # what's left in existing is the set of loggers + # which were in the previous configuration but + # which are not in the new configuration. root = logging.root existing = list(root.manager.loggerDict) - #The list needs to be sorted so that we can - #avoid disabling child loggers of explicitly - #named loggers. With a sorted list it is easier - #to find the child loggers. + # The list needs to be sorted so that we can + # avoid disabling child loggers of explicitly + # named loggers. With a sorted list it is easier + # to find the child loggers. existing.sort() - #We'll keep the list of existing loggers - #which are children of named loggers here... + # We'll keep the list of existing loggers + # which are children of named loggers here... child_loggers = [] - #now set up the new ones... - loggers = config.get('loggers', EMPTY_DICT) + # now set up the new ones... + loggers = config.get("loggers", EMPTY_DICT) for name in loggers: if name in existing: i = existing.index(name) prefixed = name + "." pflen = len(prefixed) num_existing = len(existing) - i = i + 1 # look at the entry after name - while (i < num_existing) and\ - (existing[i][:pflen] == prefixed): + i = i + 1 # look at the entry after name + while (i < num_existing) and (existing[i][:pflen] == prefixed): child_loggers.append(existing[i]) i = i + 1 existing.remove(name) try: self.configure_logger(name, loggers[name]) except Exception as e: - raise ValueError('Unable to configure logger ' - '%r: %s' % (name, e)) - - #Disable any old loggers. There's no point deleting - #them as other threads may continue to hold references - #and by disabling them, you stop them doing any logging. - #However, don't disable children of named loggers, as that's - #probably not what was intended by the user. + raise ValueError( + "Unable to configure logger " "%r: %s" % (name, e) + ) + + # Disable any old loggers. There's no point deleting + # them as other threads may continue to hold references + # and by disabling them, you stop them doing any logging. + # However, don't disable children of named loggers, as that's + # probably not what was intended by the user. for log in existing: logger = root.manager.loggerDict[log] if log in child_loggers: @@ -407,44 +415,43 @@ def configure(self): logger.disabled = True # And finally, do the root logger - root = config.get('root', None) + root = config.get("root", None) if root: try: self.configure_root(root) except Exception as e: - raise ValueError('Unable to configure root ' - 'logger: %s' % e) + raise ValueError("Unable to configure root " "logger: %s" % e) finally: logging._releaseLock() def configure_formatter(self, config): """Configure a formatter from a dictionary.""" - if '()' in config: - factory = config['()'] # for use in exception handler + if "()" in config: + factory = config["()"] # for use in exception handler try: result = self.configure_custom(config) except TypeError as te: if "'format'" not in str(te): raise - #Name of parameter changed from fmt to format. - #Retry with old name. - #This is so that code can be used with older Python versions - #(e.g. by Django) - config['fmt'] = config.pop('format') - config['()'] = factory + # Name of parameter changed from fmt to format. + # Retry with old name. + # This is so that code can be used with older Python versions + # (e.g. 
by Django) + config["fmt"] = config.pop("format") + config["()"] = factory result = self.configure_custom(config) else: - fmt = config.get('format', None) - dfmt = config.get('datefmt', None) + fmt = config.get("format", None) + dfmt = config.get("datefmt", None) result = logging.Formatter(fmt, dfmt) return result def configure_filter(self, config): """Configure a filter from a dictionary.""" - if '()' in config: + if "()" in config: result = self.configure_custom(config) else: - name = config.get('name', '') + name = config.get("name", "") result = logging.Filter(name) return result @@ -452,42 +459,48 @@ def add_filters(self, filterer, filters): """Add filters to a filterer from a list of names.""" for f in filters: try: - filterer.addFilter(self.config['filters'][f]) + filterer.addFilter(self.config["filters"][f]) except Exception as e: - raise ValueError('Unable to add filter %r: %s' % (f, e)) + raise ValueError("Unable to add filter %r: %s" % (f, e)) def configure_handler(self, config): """Configure a handler from a dictionary.""" - formatter = config.pop('formatter', None) + formatter = config.pop("formatter", None) if formatter: try: - formatter = self.config['formatters'][formatter] + formatter = self.config["formatters"][formatter] except Exception as e: - raise ValueError('Unable to set formatter ' - '%r: %s' % (formatter, e)) - level = config.pop('level', None) - filters = config.pop('filters', None) - if '()' in config: - c = config.pop('()') - if not hasattr(c, '__call__') and hasattr(types, 'ClassType') and type(c) != types.ClassType: + raise ValueError("Unable to set formatter " "%r: %s" % (formatter, e)) + level = config.pop("level", None) + filters = config.pop("filters", None) + if "()" in config: + c = config.pop("()") + if ( + not hasattr(c, "__call__") + and hasattr(types, "ClassType") + and type(c) != types.ClassType + ): c = self.resolve(c) factory = c else: - klass = self.resolve(config.pop('class')) - #Special case for handler which refers to another handler - if issubclass(klass, logging.handlers.MemoryHandler) and\ - 'target' in config: + klass = self.resolve(config.pop("class")) + # Special case for handler which refers to another handler + if issubclass(klass, logging.handlers.MemoryHandler) and "target" in config: try: - config['target'] = self.config['handlers'][config['target']] + config["target"] = self.config["handlers"][config["target"]] except Exception as e: - raise ValueError('Unable to set target handler ' - '%r: %s' % (config['target'], e)) - elif issubclass(klass, logging.handlers.SMTPHandler) and\ - 'mailhost' in config: - config['mailhost'] = self.as_tuple(config['mailhost']) - elif issubclass(klass, logging.handlers.SysLogHandler) and\ - 'address' in config: - config['address'] = self.as_tuple(config['address']) + raise ValueError( + "Unable to set target handler " "%r: %s" % (config["target"], e) + ) + elif ( + issubclass(klass, logging.handlers.SMTPHandler) and "mailhost" in config + ): + config["mailhost"] = self.as_tuple(config["mailhost"]) + elif ( + issubclass(klass, logging.handlers.SysLogHandler) + and "address" in config + ): + config["address"] = self.as_tuple(config["address"]) factory = klass kwargs = dict([(k, config[k]) for k in config if valid_ident(k)]) try: @@ -495,11 +508,11 @@ def configure_handler(self, config): except TypeError as te: if "'stream'" not in str(te): raise - #The argument name changed from strm to stream - #Retry with old name. - #This is so that code can be used with older Python versions - #(e.g. 
by Django) - kwargs['strm'] = kwargs.pop('stream') + # The argument name changed from strm to stream + # Retry with old name. + # This is so that code can be used with older Python versions + # (e.g. by Django) + kwargs["strm"] = kwargs.pop("stream") result = factory(**kwargs) if formatter: result.setFormatter(formatter) @@ -513,25 +526,25 @@ def add_handlers(self, logger, handlers): """Add handlers to a logger from a list of names.""" for h in handlers: try: - logger.addHandler(self.config['handlers'][h]) + logger.addHandler(self.config["handlers"][h]) except Exception as e: - raise ValueError('Unable to add handler %r: %s' % (h, e)) + raise ValueError("Unable to add handler %r: %s" % (h, e)) def common_logger_config(self, logger, config, incremental=False): """ Perform configuration which is common to root and non-root loggers. """ - level = config.get('level', None) + level = config.get("level", None) if level is not None: logger.setLevel(_checkLevel(level)) if not incremental: - #Remove any existing handlers + # Remove any existing handlers for h in logger.handlers[:]: logger.removeHandler(h) - handlers = config.get('handlers', None) + handlers = config.get("handlers", None) if handlers: self.add_handlers(logger, handlers) - filters = config.get('filters', None) + filters = config.get("filters", None) if filters: self.add_filters(logger, filters) @@ -539,7 +552,7 @@ def configure_logger(self, name, config, incremental=False): """Configure a non-root logger from a dictionary.""" logger = logging.getLogger(name) self.common_logger_config(logger, config, incremental) - propagate = config.get('propagate', None) + propagate = config.get("propagate", None) if propagate is not None: logger.propagate = propagate @@ -548,6 +561,7 @@ def configure_root(self, config, incremental=False): root = logging.getLogger() self.common_logger_config(root, config, incremental) + dictConfigClass = DictConfigurator diff --git a/odps/tests/test_accounts.py b/odps/tests/test_accounts.py index 422f935c..4a66d0b3 100644 --- a/odps/tests/test_accounts.py +++ b/odps/tests/test_accounts.py @@ -59,12 +59,14 @@ def test_sign_server_account(odps): server = SignServer() server.accounts[odps.account.access_id] = odps.account.secret_access_key try: - server.start(('127.0.0.1', 0)) - account = SignServerAccount(odps.account.access_id, server.server.server_address) + server.start(("127.0.0.1", 0)) + account = SignServerAccount( + odps.account.access_id, server.server.server_address + ) odps = odps.as_account(account=account) - odps.delete_table(tn('test_sign_account_table'), if_exists=True) - t = odps.create_table(tn('test_sign_account_table'), 'col string', lifecycle=1) - assert odps.exist_table(tn('test_sign_account_table')) is True + odps.delete_table(tn("test_sign_account_table"), if_exists=True) + t = odps.create_table(tn("test_sign_account_table"), "col string", lifecycle=1) + assert odps.exist_table(tn("test_sign_account_table")) is True t.drop(async_=True) finally: server.stop() @@ -74,16 +76,23 @@ def test_tokenized_sign_server_account(odps): server = SignServer(token=str(uuid.uuid4())) server.accounts[odps.account.access_id] = odps.account.secret_access_key try: - server.start(('127.0.0.1', 0)) - account = SignServerAccount(odps.account.access_id, server.server.server_address) + server.start(("127.0.0.1", 0)) + account = SignServerAccount( + odps.account.access_id, server.server.server_address + ) odps = ODPS(None, None, odps.project, odps.endpoint, account=account) - pytest.raises(SignServerError, lambda: 
odps.delete_table(tn('test_sign_account_table'), if_exists=True)) + pytest.raises( + SignServerError, + lambda: odps.delete_table(tn("test_sign_account_table"), if_exists=True), + ) - account = SignServerAccount(odps.account.access_id, server.server.server_address, token=server.token) + account = SignServerAccount( + odps.account.access_id, server.server.server_address, token=server.token + ) odps = ODPS(None, None, odps.project, odps.endpoint, account=account) - odps.delete_table(tn('test_sign_account_table'), if_exists=True) - t = odps.create_table(tn('test_sign_account_table'), 'col string', lifecycle=1) - assert odps.exist_table(tn('test_sign_account_table')) is True + odps.delete_table(tn("test_sign_account_table"), if_exists=True) + t = odps.create_table(tn("test_sign_account_table"), "col string", lifecycle=1) + assert odps.exist_table(tn("test_sign_account_table")) is True t.drop(async_=True) finally: server.stop() @@ -150,23 +159,28 @@ def test_bearer_token_account(odps): task_name = inst.get_task_names()[0] logview_address = inst.get_logview_address() - token = logview_address[logview_address.find('token=') + len('token='):] + token = logview_address[logview_address.find("token=") + len("token=") :] bearer_token_account = BearerTokenAccount(token=token) bearer_token_odps = ODPS( None, None, odps.project, odps.endpoint, account=bearer_token_account ) bearer_token_instance = bearer_token_odps.get_instance(inst.id) - assert inst.get_task_result(task_name) == bearer_token_instance.get_task_result(task_name) - assert inst.get_task_summary(task_name) == bearer_token_instance.get_task_summary(task_name) + assert inst.get_task_result(task_name) == bearer_token_instance.get_task_result( + task_name + ) + assert inst.get_task_summary(task_name) == bearer_token_instance.get_task_summary( + task_name + ) with pytest.raises(errors.NoPermission): - bearer_token_odps.create_table(tn('test_bearer_token_account_table_test1'), - 'col string', lifecycle=1) + bearer_token_odps.create_table( + tn("test_bearer_token_account_table_test1"), "col string", lifecycle=1 + ) def test_fake_bearer_token(odps): - fake_token_account = BearerTokenAccount(token='fake-token') + fake_token_account = BearerTokenAccount(token="fake-token") bearer_token_odps = ODPS( None, None, @@ -177,8 +191,9 @@ def test_fake_bearer_token(odps): ) with pytest.raises(errors.ODPSError): - bearer_token_odps.create_table(tn('test_bearer_token_account_table_test2'), - 'col string', lifecycle=1) + bearer_token_odps.create_table( + tn("test_bearer_token_account_table_test2"), "col string", lifecycle=1 + ) def test_bearer_token_load_and_update(odps): @@ -197,7 +212,9 @@ def test_bearer_token_load_and_update(odps): env_odps = ODPS(project=odps.project, endpoint=odps.endpoint) assert isinstance(env_odps.account, BearerTokenAccount) assert env_odps.account.token == token - assert env_odps.account._last_modified_time > datetime.datetime.fromtimestamp(create_timestamp) + assert env_odps.account._last_modified_time > datetime.datetime.fromtimestamp( + create_timestamp + ) last_timestamp = env_odps.account._last_modified_time env_odps.account.reload() @@ -242,7 +259,9 @@ def _new_is_ok2(self, resp): def _new_is_ok3(self, resp): if odps.endpoint not in self._endpoints_without_v4_sign: - raise errors.Unauthorized("The request authorization header is invalid or missing.") + raise errors.Unauthorized( + "The request authorization header is invalid or missing." 
+ ) return resp.ok old_enable_v4_sign = options.enable_v4_sign @@ -250,17 +269,17 @@ def _new_is_ok3(self, resp): options.enable_v4_sign = True RestClient._endpoints_without_v4_sign.clear() with mock.patch("odps.rest.RestClient.is_ok", new=_new_is_ok): - odps.delete_table(tn('test_sign_account_table'), if_exists=True) + odps.delete_table(tn("test_sign_account_table"), if_exists=True) assert odps.endpoint in RestClient._endpoints_without_v4_sign RestClient._endpoints_without_v4_sign.clear() with mock.patch("odps.rest.RestClient.is_ok", new=_new_is_ok2): - odps.delete_table(tn('test_sign_account_table'), if_exists=True) + odps.delete_table(tn("test_sign_account_table"), if_exists=True) assert odps.endpoint in RestClient._endpoints_without_v4_sign RestClient._endpoints_without_v4_sign.clear() with mock.patch("odps.rest.RestClient.is_ok", new=_new_is_ok3): - odps.delete_table(tn('test_sign_account_table'), if_exists=True) + odps.delete_table(tn("test_sign_account_table"), if_exists=True) assert odps.endpoint in RestClient._endpoints_without_v4_sign finally: RestClient._endpoints_without_v4_sign.difference_update([odps.endpoint]) @@ -340,15 +359,13 @@ def get_credential(self): ) def test_credential_provider_account(odps, provider_cls): account = CredentialProviderAccount(provider_cls(odps)) - cred_odps = ODPS( - account, None, odps.project, odps.endpoint - ) + cred_odps = ODPS(account, None, odps.project, odps.endpoint) - table_name = tn('test_bearer_token_account_table') + table_name = tn("test_bearer_token_account_table") cred_odps.delete_table(table_name, if_exists=True) - t = cred_odps.create_table(table_name, 'col string', lifecycle=1) + t = cred_odps.create_table(table_name, "col string", lifecycle=1) with t.open_writer() as writer: - records = [['val1'], ['val2'], ['val3']] + records = [["val1"], ["val2"], ["val3"]] writer.write(records) cred_odps.delete_table(table_name) diff --git a/odps/tests/test_config.py b/odps/tests/test_config.py index 1985ea91..b84a26f6 100644 --- a/odps/tests/test_config.py +++ b/odps/tests/test_config.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
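The bearer-token tests above boil down to one pattern: obtain a short-lived token for a project, wrap it in BearerTokenAccount, and build a separate ODPS entry object that authenticates with the token instead of an access-key pair. A minimal sketch of that flow, assuming an existing, fully authorized `odps` entry object; the policy dict and the arguments to generate_auth_token mirror the fixture in this patch, and the trailing query is only illustrative:

from odps import ODPS
from odps.accounts import BearerTokenAccount

# Policy granting all actions on the project, as used by the bearer_token_odps
# fixture in this patch; the third argument to generate_auth_token is the
# expiry value the fixture passes (units not restated here).
policy = {
    "Statement": [{"Action": ["*"], "Effect": "Allow", "Resource": "acs:odps:*:*"}],
    "Version": "1",
}
token = odps.get_project().generate_auth_token(policy, "bearer", 2)

# A new entry object that carries only the bearer token; project and endpoint
# are reused from the original entry object.
bearer_odps = ODPS(
    None,
    None,
    odps.project,
    odps.endpoint,
    account=BearerTokenAccount(token=token),
)

# Any operation permitted by the policy can now run through bearer_odps,
# e.g. a simple read-only query (illustrative only).
bearer_odps.execute_sql("select 1")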
@@ -21,12 +21,12 @@ from ..accounts import AliyunAccount from ..config import ( Config, - options, - option_context, + OptionError, + any_validator, is_integer, is_null, - any_validator, - OptionError, + option_context, + options, ) @@ -38,7 +38,10 @@ def test_options(): assert options.account == old_config.account else: assert options.account.access_id == old_config.account.access_id - assert options.account.secret_access_key == old_config.account.secret_access_key + assert ( + options.account.secret_access_key + == old_config.account.secret_access_key + ) assert options.endpoint == old_config.endpoint assert options.default_project == old_config.default_project assert local_options.tunnel.endpoint is None @@ -48,15 +51,18 @@ def test_options(): assert local_options.console.max_lines is None assert local_options.console.max_width is None - local_options.account = AliyunAccount('test', '') - assert local_options.account.access_id == 'test' + local_options.account = AliyunAccount("test", "") + assert local_options.account.access_id == "test" - local_options.register_option('nest.inner.value', 50, - validator=any_validator(is_null, is_integer)) + local_options.register_option( + "nest.inner.value", 50, validator=any_validator(is_null, is_integer) + ) assert local_options.nest.inner.value == 50 + def set(val): local_options.nest.inner.value = val - pytest.raises(ValueError, lambda: set('test')) + + pytest.raises(ValueError, lambda: set("test")) set(None) assert local_options.nest.inner.value is None set(30) @@ -84,25 +90,26 @@ def set(val): def set_notexist(): options.display.val = 3 + pytest.raises(OptionError, set_notexist) def test_redirection(): local_config = Config() - local_config.register_option('test.redirect_src', 10) - local_config.redirect_option('test.redirect_redir', 'test.redirect_src') + local_config.register_option("test.redirect_src", 10) + local_config.redirect_option("test.redirect_redir", "test.redirect_src") - assert 'test' in dir(local_config) - assert 'redirect_redir' in dir(local_config.test) + assert "test" in dir(local_config) + assert "redirect_redir" in dir(local_config.test) local_config.test.redirect_redir = 20 assert local_config.test.redirect_src == 20 local_config.test.redirect_src = 10 assert local_config.test.redirect_redir == 10 - local_config.unregister_option('test.redirect_redir') - local_config.unregister_option('test.redirect_src') + local_config.unregister_option("test.redirect_redir") + local_config.unregister_option("test.redirect_src") pytest.raises(AttributeError, lambda: local_config.test.redirect_redir) pytest.raises(AttributeError, lambda: local_config.test.redirect_src) @@ -112,11 +119,12 @@ def test_set_display_option(): options.display.unicode.ambiguous_as_wide = True assert options.display.max_rows == 10 assert options.display.unicode.ambiguous_as_wide is True - options.register_pandas('display.non_exist', True) + options.register_pandas("display.non_exist", True) assert options.display.non_exist try: import pandas as pd + assert pd.options.display.max_rows == 10 assert pd.options.display.unicode.ambiguous_as_wide is True except ImportError: @@ -126,12 +134,12 @@ def test_set_display_option(): def test_dump_and_load(): with option_context() as local_options: local_options.register_option( - 'test.value', 50, validator=any_validator(is_null, is_integer) + "test.value", 50, validator=any_validator(is_null, is_integer) ) d = local_options.dumps() - assert d['test.value'] == 50 + assert d["test.value"] == 50 - d['test.value'] = 100 + 
d["test.value"] = 100 local_options.loads(d) assert local_options.test.value == 100 @@ -139,7 +147,7 @@ def test_dump_and_load(): def test_add_validator(): with option_context() as local_options: local_options.register_option( - 'test.value', 50, validator=any_validator(is_null, is_integer) + "test.value", 50, validator=any_validator(is_null, is_integer) ) with pytest.raises(ValueError): local_options.test.value = "abcd" diff --git a/odps/tests/test_crc.py b/odps/tests/test_crc.py index 8d986dd0..3e2961ec 100644 --- a/odps/tests/test_crc.py +++ b/odps/tests/test_crc.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -22,9 +22,9 @@ def test_crc32c(): crc_obj = crc.Crc32c() assert 0 == crc_obj.getvalue() - buf = bytearray(b'abc') + buf = bytearray(b"abc") crc_obj.update(buf) assert 910901175 == crc_obj.getvalue() - buf = bytearray(b'1111111111111111111') + buf = bytearray(b"1111111111111111111") crc_obj.update(buf) assert 2917307201 == crc_obj.getvalue() diff --git a/odps/tests/test_dag.py b/odps/tests/test_dag.py index 6013dfc9..b3e6ae05 100644 --- a/odps/tests/test_dag.py +++ b/odps/tests/test_dag.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -22,12 +22,12 @@ def test_dag(): dag = DAG() - labels = tuple('abcde') + labels = tuple("abcde") node1, node2, node3, node4, node5 = labels for i in range(1, 6): - dag.add_node(locals()['node%d' % i]) + dag.add_node(locals()["node%d" % i]) dag.add_edge(node1, node3) dag.add_edge(node2, node4) @@ -35,32 +35,32 @@ def test_dag(): dag.add_edge(node4, node5) loc_vars = locals() - assert sorted([loc_vars['node%d' % i] for i in range(1, 6)]) == sorted(dag.nodes()) + assert sorted([loc_vars["node%d" % i] for i in range(1, 6)]) == sorted(dag.nodes()) assert dag.contains_node(node1) is True assert dag.contains_edge(node2, node4) is True assert dag.contains_edge(node2, node5) is False try: - assert list('bacde') == dag.topological_sort() + assert list("bacde") == dag.topological_sort() except AssertionError: - assert list('abcde') == dag.topological_sort() - assert list('bca') == dag.ancestors([node4, ]) - assert list('de') == dag.descendants([node3, ]) + assert list("abcde") == dag.topological_sort() + assert list("bca") == dag.ancestors([node4]) + assert list("de") == dag.descendants([node3]) dag.add_edge(node2, node1) - assert list('bacde') == dag.topological_sort() - assert list('ab') == dag.ancestors([node3, ]) - assert list('daec') == dag.descendants([node2, ]) + assert list("bacde") == dag.topological_sort() + assert list("ab") == dag.ancestors([node3]) + assert list("daec") == dag.descendants([node2]) pytest.raises(DAGValidationError, lambda: dag.add_edge(node4, node2)) pytest.raises(DAGValidationError, lambda: dag.add_edge(node4, node1)) - assert dag.successors(node2) == list('da') - assert dag.predecessors(node4) == list('bc') + assert dag.successors(node2) == list("da") + assert dag.predecessors(node4) == list("bc") dag.remove_node(node4) - assert ''.join(dag.topological_sort()) in set(['beac', 'ebac']) + assert "".join(dag.topological_sort()) in 
set(["beac", "ebac"]) assert dag.contains_node(node4) is False pytest.raises(KeyError, lambda: dag.remove_node(node4)) assert dag.contains_edge(node4, node5) is False @@ -68,16 +68,16 @@ def test_dag(): pytest.raises(KeyError, lambda: dag.remove_edge(node4, node5)) pytest.raises(KeyError, lambda: dag.successors(node4)) - assert list('ab') == dag.ancestors([node3, ]) - assert list('') == dag.ancestors([node5, ]) - assert list('ac') == dag.descendants([node2, ]) - assert list('') == dag.descendants([node5, ]) + assert list("ab") == dag.ancestors([node3]) + assert list("") == dag.ancestors([node5]) + assert list("ac") == dag.descendants([node2]) + assert list("") == dag.descendants([node5]) dag.remove_edge(node2, node1) assert dag.contains_edge(node2, node1) is False - assert list('a') == dag.ancestors([node3, ]) - assert list('c') == dag.descendants([node1, ]) - assert set('abe') == set(dag.indep_nodes()) + assert list("a") == dag.ancestors([node3]) + assert list("c") == dag.descendants([node1]) + assert set("abe") == set(dag.indep_nodes()) dag.reset_graph() assert len(dag.nodes()) == 0 @@ -86,12 +86,12 @@ def test_dag(): def test_reversed_dag(): dag = DAG(reverse=True) - labels = tuple('abcde') + labels = tuple("abcde") node1, node2, node3, node4, node5 = labels for i in range(1, 6): - dag.add_node(locals()['node%d' % i]) + dag.add_node(locals()["node%d" % i]) dag.add_edge(node1, node3) dag.add_edge(node2, node4) @@ -99,29 +99,29 @@ def test_reversed_dag(): dag.add_edge(node4, node5) loc_vars = locals() - assert sorted([loc_vars['node%d' % i] for i in range(1, 6)]) == sorted(dag.nodes()) + assert sorted([loc_vars["node%d" % i] for i in range(1, 6)]) == sorted(dag.nodes()) assert dag.contains_node(node1) is True assert dag.contains_edge(node2, node4) is True assert dag.contains_edge(node2, node5) is False assert list(labels) == dag.topological_sort() - assert list('bca') == dag.ancestors([node4, ]) - assert list('de') == dag.descendants([node3, ]) + assert list("bca") == dag.ancestors([node4]) + assert list("de") == dag.descendants([node3]) dag.add_edge(node2, node1) - assert list('bacde') == dag.topological_sort() - assert list('ab') == dag.ancestors([node3, ]) - assert list('daec') == dag.descendants([node2, ]) + assert list("bacde") == dag.topological_sort() + assert list("ab") == dag.ancestors([node3]) + assert list("daec") == dag.descendants([node2]) pytest.raises(DAGValidationError, lambda: dag.add_edge(node4, node2)) pytest.raises(DAGValidationError, lambda: dag.add_edge(node4, node1)) - assert dag.successors(node2) == list('da') - assert dag.predecessors(node4) == list('bc') + assert dag.successors(node2) == list("da") + assert dag.predecessors(node4) == list("bc") dag.remove_node(node4) - assert ''.join(dag.topological_sort()) in set(['beac', 'ebac']) + assert "".join(dag.topological_sort()) in set(["beac", "ebac"]) assert dag.contains_node(node4) is False pytest.raises(KeyError, lambda: dag.remove_node(node4)) assert dag.contains_edge(node4, node5) is False @@ -129,16 +129,16 @@ def test_reversed_dag(): pytest.raises(KeyError, lambda: dag.remove_edge(node4, node5)) pytest.raises(KeyError, lambda: dag.successors(node4)) - assert list('ab') == dag.ancestors([node3, ]) - assert list('') == dag.ancestors([node5, ]) - assert list('ac') == dag.descendants([node2, ]) - assert list('') == dag.descendants([node5, ]) + assert list("ab") == dag.ancestors([node3]) + assert list("") == dag.ancestors([node5]) + assert list("ac") == dag.descendants([node2]) + assert list("") == 
dag.descendants([node5]) dag.remove_edge(node2, node1) assert dag.contains_edge(node2, node1) is False - assert list('a') == dag.ancestors([node3, ]) - assert list('c') == dag.descendants([node1, ]) - assert set('abe') == set(dag.indep_nodes()) + assert list("a") == dag.ancestors([node3]) + assert list("c") == dag.descendants([node1]) + assert set("abe") == set(dag.indep_nodes()) dag.reset_graph() assert len(dag.nodes()) == 0 diff --git a/odps/tests/test_errors.py b/odps/tests/test_errors.py index 2c600e7b..e280054d 100644 --- a/odps/tests/test_errors.py +++ b/odps/tests/test_errors.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,11 +16,17 @@ from collections import namedtuple -from ..errors import ODPSError, InternalServerError, ScriptError, \ - RequestTimeTooSkewed, BadGatewayError, \ - parse_instance_error, parse_response +from ..errors import ( + BadGatewayError, + InternalServerError, + ODPSError, + RequestTimeTooSkewed, + ScriptError, + parse_instance_error, + parse_response, +) -_PseudoResponse = namedtuple('PseudoResponse', 'content text headers status_code') +_PseudoResponse = namedtuple("PseudoResponse", "content text headers status_code") def test_xml_parse_response(): @@ -34,10 +40,10 @@ def test_xml_parse_response(): """ exc = parse_response(_PseudoResponse(xml_response, None, {}, 500)) assert isinstance(exc, InternalServerError) - assert exc.code == 'InternalServerError' - assert exc.args[0] == 'System internal error' - assert exc.request_id == 'REQ_ID' - assert exc.host_id == 'host' + assert exc.code == "InternalServerError" + assert exc.args[0] == "System internal error" + assert exc.request_id == "REQ_ID" + assert exc.host_id == "host" def test_json_parse_response(): @@ -48,13 +54,16 @@ def test_json_parse_response(): "HostId": "host" } """ - exc = parse_response(_PseudoResponse(json_response.encode(), json_response, - {'x-odps-request-id': 'REQ_ID'}, 500)) + exc = parse_response( + _PseudoResponse( + json_response.encode(), json_response, {"x-odps-request-id": "REQ_ID"}, 500 + ) + ) assert isinstance(exc, InternalServerError) - assert exc.code == 'InternalServerError' - assert exc.args[0] == 'System internal error' - assert exc.request_id == 'REQ_ID' - assert exc.host_id == 'host' + assert exc.code == "InternalServerError" + assert exc.args[0] == "System internal error" + assert exc.request_id == "REQ_ID" + assert exc.host_id == "host" def test_nginx_gateway_error(): @@ -116,13 +125,17 @@ def test_instance_error(): def test_parse_request_time_skew(): import time from datetime import datetime + from ..compat import utc def get_timestamp(dt): if dt.tzinfo: delta = dt.astimezone(utc) - datetime(1970, 1, 1, tzinfo=utc) - return (delta.microseconds + 0.0 + - (delta.seconds + delta.days * 24 * 3600) * 10 ** 6) / 10 ** 6 + return ( + delta.microseconds + + 0.0 + + (delta.seconds + delta.days * 24 * 3600) * 10**6 + ) / 10**6 else: return time.mktime(dt.timetuple()) + dt.microsecond / 1000000.0 @@ -137,8 +150,12 @@ def get_timestamp(dt): exc = parse_response(_PseudoResponse(xml_response, None, {}, 500)) assert isinstance(exc, RequestTimeTooSkewed) assert exc.max_interval_date == 900000 - assert get_timestamp(exc.expire_date) == get_timestamp(datetime(2018, 1, 20, 16, 20, 17, 12000, tzinfo=utc)) - assert 
get_timestamp(exc.now_date) == get_timestamp(datetime(2018, 1, 20, 19, 20, 9, 34000, tzinfo=utc)) + assert get_timestamp(exc.expire_date) == get_timestamp( + datetime(2018, 1, 20, 16, 20, 17, 12000, tzinfo=utc) + ) + assert get_timestamp(exc.now_date) == get_timestamp( + datetime(2018, 1, 20, 19, 20, 9, 34000, tzinfo=utc) + ) xml_response = """ diff --git a/odps/tests/test_inter.py b/odps/tests/test_inter.py index 02a71ba1..a758f316 100644 --- a/odps/tests/test_inter.py +++ b/odps/tests/test_inter.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -30,18 +30,18 @@ @pytest.fixture(autouse=True) def install_cloud_unpickler(): - cloudpickle.CloudUnpickler(StringIO('abcdefg')) + cloudpickle.CloudUnpickler(StringIO("abcdefg")) def test_room(): from ..inter import enter, setup, teardown - access_id = 'test_access_id' - access_key = 'test_access_key' - project = 'test_default_project' - endpoint = 'test_endpoint' + access_id = "test_access_id" + access_key = "test_access_key" + project = "test_default_project" + endpoint = "test_endpoint" - test_room = '__test' + test_room = "__test" teardown(test_room) @@ -54,8 +54,9 @@ def test_room(): assert endpoint == options.endpoint assert options.tunnel.endpoint is None - pytest.raises(InteractiveError, - lambda: setup(access_id, access_key, project, room=test_room)) + pytest.raises( + InteractiveError, lambda: setup(access_id, access_key, project, room=test_room) + ) assert test_room in list_rooms() @@ -68,34 +69,34 @@ class FakeRoom(Room): def _init(self): return - room = FakeRoom('__test') + room = FakeRoom("__test") room._room_dir = tempfile.mkdtemp() try: - s = TableSchema.from_lists(['name', 'id'], ['string', 'bigint']) - table_name = tn('pyodps_test_room_stores') + s = TableSchema.from_lists(["name", "id"], ["string", "bigint"]) + table_name = tn("pyodps_test_room_stores") odps.delete_table(table_name, if_exists=True) t = odps.create_table(table_name, s) - data = [['name1', 1], ['name2', 2]] + data = [["name1", 1], ["name2", 2]] with t.open_writer() as writer: writer.write(data) del t t = odps.get_table(table_name) - assert t.table_schema.names == ['name', 'id'] + assert t.table_schema.names == ["name", "id"] try: - room.store('table', t) + room.store("table", t) - t2 = room['table'] + t2 = room["table"] assert t2.name == table_name with t2.open_reader() as reader: values = [r.values for r in reader] assert data == values - assert room.list_stores() == [['table', None]] + assert room.list_stores() == [["table", None]] finally: t.drop() finally: diff --git a/odps/tests/test_serializers.py b/odps/tests/test_serializers.py index e6d95ea3..2133a646 100644 --- a/odps/tests/test_serializers.py +++ b/odps/tests/test_serializers.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,15 +14,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +import datetime import email.header import textwrap import time -from datetime import datetime -from ..serializers import * from .. 
import utils +from ..serializers import * -expected_xml_template = ''' +expected_xml_template = """ example 1 %s @@ -51,88 +51,128 @@ {"label": "json", "tags": [{"tag": "t1"}, {"tag": "t2"}], "nest": {"name": "n"}, "nests": {"nest": [{"name": "n1"}, {"name": "n2"}]}} false true + + + false + + + + test-val + -''' +""" -LIST_OBJ_TMPL = ''' +LIST_OBJ_TMPL = """ %s %s -''' +""" -LIST_OBJ_LAST_TMPL = ''' +LIST_OBJ_LAST_TMPL = """ %s -''' +""" class Example(XMLSerializableModel): - __slots__ = 'name', 'type', 'date', 'lessons', 'teacher', 'student',\ - 'professors', 'properties', 'jsn', 'bool_false', 'bool_true' - - _root = 'Example' + __slots__ = ( + "name", + "type", + "date", + "lessons", + "teacher", + "student", + "professors", + "properties", + "jsn", + "bool_false", + "bool_true", + ) + + _root = "Example" class Teacher(XMLSerializableModel): - - name = XMLNodeField('Name') - tag = XMLTagField('.') + name = XMLNodeField("Name") + tag = XMLTagField(".") def __eq__(self, other): - return isinstance(other, Example.Teacher) and \ - self.name == other.name + return isinstance(other, Example.Teacher) and self.name == other.name class Student(XMLSerializableModel): - name = XMLNodeAttributeField(attr='name') - content = XMLNodeField('.') + name = XMLNodeAttributeField(attr="name") + content = XMLNodeField(".") def __eq__(self, other): - return isinstance(other, Example.Student) and \ - self.name == other.name and \ - self.content == other.content + return ( + isinstance(other, Example.Student) + and self.name == other.name + and self.content == other.content + ) class Json(JSONSerializableModel): - - __slots__ = 'label', 'tags', 'nest', 'nests' + __slots__ = "label", "tags", "nest", "nests" class Nest(JSONSerializableModel): - name = JSONNodeField('name') + name = JSONNodeField("name") def __eq__(self, other): - return isinstance(other, Example.Json.Nest) and \ - self.name == other.name - - label = JSONNodeField('label') - tags = JSONNodesField('tags', 'tag') - nest = JSONNodeReferenceField(Nest, 'nest') - nests = JSONNodesReferencesField(Nest, 'nests', 'nest') - - name = XMLNodeField('Name') - type = XMLNodeAttributeField('.', attr='type') - date = XMLNodeField('Created', type='rfc822l') - bool_true = XMLNodeField('Enabled', type='bool') - bool_false = XMLNodeField('Disabled', type='bool') - lessons = XMLNodesField('Lessons', 'Lesson') - teacher = XMLNodeReferenceField(Teacher, 'Teacher') - student = XMLNodeReferenceField(Student, 'Student') - professors = XMLNodesReferencesField(Teacher, 'Professors', 'Professor') - properties = XMLNodePropertiesField('Config', 'Property', key_tag='Name', value_tag='Value') - jsn = XMLNodeReferenceField(Json, 'json') + return isinstance(other, Example.Json.Nest) and self.name == other.name + + label = JSONNodeField("label") + tags = JSONNodesField("tags", "tag") + nest = JSONNodeReferenceField(Nest, "nest") + nests = JSONNodesReferencesField(Nest, "nests", "nest") + + name = XMLNodeField("Name") + type = XMLNodeAttributeField(".", attr="type") + date = XMLNodeField("Created", type="rfc822l") + bool_true = XMLNodeField("Enabled", type="bool") + bool_false = XMLNodeField("Disabled", type="bool") + lessons = XMLNodesField("Lessons", "Lesson") + teacher = XMLNodeReferenceField(Teacher, "Teacher") + student = XMLNodeReferenceField(Student, "Student") + professors = XMLNodesReferencesField(Teacher, "Professors", "Professor") + properties = XMLNodePropertiesField( + "Config", "Property", key_tag="Name", value_tag="Value" + ) + properties2 = XMLNodePropertiesField( + 
"Config2", "Property", key_attr="name", value_tag="Value" + ) + properties3 = XMLNodePropertiesField("Config3", "Property", key_attr="name") + jsn = XMLNodeReferenceField(Json, "json") def test_serializers(): - teacher = Example.Teacher(name='t1') - student = Example.Student(name='s1', content='s1_content') - professors = [Example.Teacher(name='p1'), Example.Teacher(name='p2')] - jsn = Example.Json(label='json', tags=['t1', 't2'], - nest=Example.Json.Nest(name='n'), - nests=[Example.Json.Nest(name='n1'), Example.Json.Nest(name='n2')]) - - dt = datetime.fromtimestamp(time.mktime(datetime.now().timetuple())) - example = Example(name='example 1', type='ex', date=dt, bool_true=True, bool_false=False, - lessons=['less1', 'less2'], teacher=teacher, student=student, - professors=professors, properties={'test': 'true'}, jsn=jsn) + teacher = Example.Teacher(name="t1") + student = Example.Student(name="s1", content="s1_content") + professors = [Example.Teacher(name="p1"), Example.Teacher(name="p2")] + jsn = Example.Json( + label="json", + tags=["t1", "t2"], + nest=Example.Json.Nest(name="n"), + nests=[Example.Json.Nest(name="n1"), Example.Json.Nest(name="n2")], + ) + + dt = datetime.datetime.fromtimestamp( + time.mktime(datetime.datetime.now().timetuple()) + ) + example = Example( + name="example 1", + type="ex", + date=dt, + bool_true=True, + bool_false=False, + lessons=["less1", "less2"], + teacher=teacher, + student=student, + professors=professors, + properties={"test": "true"}, + properties2={"test2": "false"}, + properties3={"test3": "test-val"}, + jsn=jsn, + ) sel = example.serialize() assert utils.to_str( @@ -150,9 +190,18 @@ def test_serializers(): assert example.teacher == parsed_example.teacher assert example.student == parsed_example.student assert list(example.professors) == list(parsed_example.professors) - assert len(example.properties) == len(parsed_example.properties) and \ - (any(example.properties[it] == parsed_example.properties[it]) - for it in example.properties) + assert len(example.properties) == len(parsed_example.properties) and ( + any(example.properties[it] == parsed_example.properties[it]) + for it in example.properties + ) + assert len(example.properties2) == len(parsed_example.properties2) and ( + any(example.properties2[it] == parsed_example.properties2[it]) + for it in example.properties2 + ) + assert len(example.properties3) == len(parsed_example.properties3) and ( + any(example.properties3[it] == parsed_example.properties3[it]) + for it in example.properties3 + ) assert example.jsn.label == parsed_example.jsn.label assert example.jsn.tags == parsed_example.jsn.tags assert example.jsn.nest == parsed_example.jsn.nest @@ -160,16 +209,25 @@ def test_serializers(): def test_coded_json(): - parsed_example = Example.parse(expected_xml_template % utils.gen_rfc822(datetime.now(), localtime=True)) + parsed_example = Example.parse( + expected_xml_template + % utils.gen_rfc822(datetime.datetime.now(), localtime=True) + ) json_bytes = parsed_example.jsn.serialize().encode("iso-8859-1") coded_json = email.header.Header(json_bytes, "iso-8859-1").encode() - coded = textwrap.dedent(''' + coded = ( + textwrap.dedent( + """ {JSON_CODED} - ''').strip().replace("{JSON_CODED}", coded_json) + """ + ) + .strip() + .replace("{JSON_CODED}", coded_json) + ) parsed = Example.parse(coded) assert list(parsed.jsn.nests) == list(parsed_example.jsn.nests) @@ -186,8 +244,8 @@ def gen_objs(marker): class Objs(XMLSerializableModel): skip_null = False - marker = XMLNodeField('marker') - obj = 
XMLNodeField('obj') + marker = XMLNodeField("marker") + obj = XMLNodeField("obj") objs = Objs() i = 1 diff --git a/odps/tests/test_sqlalchemy_odps.py b/odps/tests/test_sqlalchemy_odps.py index d03fb651..d85b0d75 100644 --- a/odps/tests/test_sqlalchemy_odps.py +++ b/odps/tests/test_sqlalchemy_odps.py @@ -1,6 +1,5 @@ -#!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -23,7 +22,9 @@ import mock import pytest -from .. import options +from .. import ODPS, errors, options +from ..accounts import BearerTokenAccount +from ..dbapi import connect as dbapi_connect from ..df import DataFrame from ..utils import to_str @@ -37,7 +38,7 @@ from sqlalchemy.schema import Column, MetaData, Table from sqlalchemy.sql import expression, text - from ..sqlalchemy_odps import update_test_setting + from ..sqlalchemy_odps import ODPSPingError, update_test_setting _ONE_ROW_COMPLEX_CONTENTS = [ True, @@ -47,13 +48,13 @@ 9223372036854775807, 0.5, 0.25, - 'a string', + "a string", pd.Timestamp(1970, 1, 1, 8), - b'123', + b"123", [1, 2], OrderedDict({1: 2, 3: 4}), OrderedDict({"a": 1, "b": 2}), - decimal.Decimal('0.1'), + decimal.Decimal("0.1"), ] except ImportError: dependency_installed = False @@ -62,7 +63,7 @@ def create_one_row(o): - table = 'one_row' + table = "one_row" if not o.exist_table(table): o.execute_sql("CREATE TABLE one_row (number_of_rows INT);") o.execute_sql("INSERT INTO TABLE one_row VALUES (1);") @@ -70,7 +71,7 @@ def create_one_row(o): def create_one_row_complex(o): need_writes = [False] * 2 - for i, table in enumerate(['one_row_complex', 'one_row_complex_null']): + for i, table in enumerate(["one_row_complex", "one_row_complex_null"]): if o.exist_table(table): continue @@ -91,12 +92,15 @@ def create_one_row_complex(o): `struct` STRUCT, `decimal` DECIMAL(10, 1) ); - """.format(table) + """.format( + table + ) o.execute_sql(ddl) need_writes[i] = True if need_writes[0]: - o.execute_sql(""" + o.execute_sql( + """ INSERT OVERWRITE TABLE one_row_complex SELECT true, CAST(127 AS TINYINT), @@ -113,10 +117,12 @@ def create_one_row_complex(o): named_struct('a', 1, 'b', 2), 0.1 FROM one_row; - """) + """ + ) if need_writes[1]: - o.execute_sql(""" + o.execute_sql( + """ INSERT OVERWRITE TABLE one_row_complex_null SELECT null, null, @@ -133,39 +139,42 @@ def create_one_row_complex(o): CAST(null AS STRUCT), null FROM one_row; - """) + """ + ) def create_many_rows(o): - table = 'many_rows' + table = "many_rows" if not o.exist_table(table): - df = pd.DataFrame({'a': np.arange(10000, dtype=np.int32)}) - o.execute_sql(""" + df = pd.DataFrame({"a": np.arange(10000, dtype=np.int32)}) + o.execute_sql( + """ CREATE TABLE many_rows ( a INT ) PARTITIONED BY ( b STRING ) - """) + """ + ) DataFrame(df).persist("many_rows", partition="b='blah'", odps=o) def create_test(o): - table = 'dummy_table' + table = "dummy_table" if not o.exist_table(table): - o.create_table(table, 'a int') + o.create_table(table, "a int") @pytest.fixture def engine(odps, request): - engine_url = 'odps://{}:{}@{}/?endpoint={}&SKYNET_PYODPS_HINT=hint'.format( + engine_url = "odps://{}:{}@{}/?endpoint={}&SKYNET_PYODPS_HINT=hint".format( odps.account.access_id, odps.account.secret_access_key, odps.project, odps.endpoint, ) - if getattr(request, "param", False): - engine_url += "&reuse_odps=true" + if getattr(request, "param", 
None): + engine_url += "&" + request.param # create an engine to enable cache create_engine(engine_url) @@ -175,6 +184,7 @@ def engine(odps, request): from .. import sqlalchemy_odps sqlalchemy_odps._sqlalchemy_global_reusable_odps.clear() + sqlalchemy_odps._sqlalchemy_obj_list_cache.clear() @pytest.fixture @@ -189,12 +199,10 @@ def connection(engine): @pytest.fixture(autouse=True) def setup(odps): if not dependency_installed: - pytest.skip('dependency for sqlalchemy_odps not installed') + pytest.skip("dependency for sqlalchemy_odps not installed") options.sql.use_odps2_extension = True - options.sql.settings = { - 'odps.sql.decimal.odps2': True - } + options.sql.settings = {"odps.sql.decimal.odps2": True} # create test tables create_many_rows(odps) @@ -213,6 +221,27 @@ def setup(odps): options.sql.settings = None +@pytest.fixture +def bearer_token_odps(odps): + policy = { + "Statement": [{"Action": ["*"], "Effect": "Allow", "Resource": "acs:odps:*:*"}], + "Version": "1", + } + token = odps.get_project().generate_auth_token(policy, "bearer", 2) + bearer_token_account = BearerTokenAccount(token=token) + try: + yield ODPS( + None, + None, + odps.project, + odps.endpoint, + account=bearer_token_account, + overwrite_global=False, + ) + finally: + options.account = options.default_project = options.endpoint = None + + def _get_sa_table(table_name, engine, *args, **kw): try: # sqlalchemy 1.x @@ -224,9 +253,34 @@ def _get_sa_table(table_name, engine, *args, **kw): return Table(table_name, metadata, *args, **kw) -@pytest.mark.parametrize("engine", [False, True], indirect=True) +def test_dbapi_bearer_token(bearer_token_odps, setup): + conn = dbapi_connect(bearer_token_odps) + cursor = conn.cursor() + cursor.execute("select * from one_row") + rows = list(cursor) + assert len(rows) == 1 + assert len(rows[0]) == 1 + + +def test_query_with_bearer_token(bearer_token_odps, setup): + bearer_token_odps.to_global() + engine = create_engine("odps://") + connection = engine.connect() + result = connection.execute(text("SELECT * FROM one_row")) + rows = result.fetchall() + assert len(rows) == 1 + assert rows[0].number_of_rows == 1 # number_of_rows is the column name + assert len(rows[0]) == 1 + + +@pytest.mark.parametrize("engine", ["project_as_schema=false"], indirect=True) +def test_query_param(engine, connection): + assert connection.connection.dbapi_connection._project_as_schema is False + + +@pytest.mark.parametrize("engine", [None, "reuse_odps=true"], indirect=True) def test_basic_query(engine, connection): - result = connection.execute(text('SELECT * FROM one_row')) + result = connection.execute(text("SELECT * FROM one_row")) instance = result.cursor._instance rows = result.fetchall() @@ -239,7 +293,7 @@ def test_basic_query(engine, connection): def test_one_row_complex_null(engine, connection): - one_row_complex_null = _get_sa_table('one_row_complex_null', engine, autoload=True) + one_row_complex_null = _get_sa_table("one_row_complex_null", engine, autoload=True) rows = connection.execute(one_row_complex_null.select()).fetchall() assert len(rows) == 1 assert list(rows[0]) == [None] * len(rows[0]) @@ -247,52 +301,50 @@ def test_one_row_complex_null(engine, connection): def test_reflect_no_such_table(engine, connection): """reflecttable should throw an exception on an invalid table""" - pytest.raises(NoSuchTableError, - lambda: _get_sa_table('this_does_not_exist', engine, autoload=True) + pytest.raises( + NoSuchTableError, + lambda: _get_sa_table("this_does_not_exist", engine, autoload=True), ) - 
pytest.raises(NoSuchTableError, + pytest.raises( + NoSuchTableError, lambda: _get_sa_table( - 'this_does_not_exist', engine, schema='also_does_not_exist', autoload=True - ) + "this_does_not_exist", engine, schema="also_does_not_exist", autoload=True + ), ) def test_reflect_include_columns(engine, connection): """When passed include_columns, reflecttable should filter out other columns""" - one_row_complex = _get_sa_table('one_row_complex', engine) + one_row_complex = _get_sa_table("one_row_complex", engine) insp = reflection.Inspector.from_engine(engine) - insp.reflect_table(one_row_complex, include_columns=['int'], exclude_columns=[]) + insp.reflect_table(one_row_complex, include_columns=["int"], exclude_columns=[]) assert len(one_row_complex.c) == 1 assert one_row_complex.c.int is not None pytest.raises(AttributeError, lambda: one_row_complex.c.tinyint) def test_reflect_with_schema(odps, engine, connection): - dummy = _get_sa_table( - 'dummy_table', engine, schema=odps.project, autoload=True - ) + dummy = _get_sa_table("dummy_table", engine, schema=odps.project, autoload=True) assert len(dummy.c) == 1 assert dummy.c.a is not None @pytest.mark.filterwarnings( - "default:Omitting:sqlalchemy.exc.SAWarning" - if dependency_installed - else "default" + "default:Omitting:sqlalchemy.exc.SAWarning" if dependency_installed else "default" ) def test_reflect_partitions(engine, connection): """reflecttable should get the partition column as an index""" - many_rows = _get_sa_table('many_rows', engine, autoload=True) + many_rows = _get_sa_table("many_rows", engine, autoload=True) assert len(many_rows.c) == 2 insp = reflection.Inspector.from_engine(engine) - many_rows = _get_sa_table('many_rows', engine) - insp.reflect_table(many_rows, include_columns=['a'], exclude_columns=[]) + many_rows = _get_sa_table("many_rows", engine) + insp.reflect_table(many_rows, include_columns=["a"], exclude_columns=[]) assert len(many_rows.c) == 1 - many_rows = _get_sa_table('many_rows', engine) - insp.reflect_table(many_rows, include_columns=['b'], exclude_columns=[]) + many_rows = _get_sa_table("many_rows", engine) + insp.reflect_table(many_rows, include_columns=["b"], exclude_columns=[]) assert len(many_rows.c) == 1 @@ -300,65 +352,67 @@ def test_reflect_partitions(engine, connection): def test_unicode(engine, connection): """Verify that unicode strings make it through SQLAlchemy and the backend""" unicode_str = "中文" - one_row = _get_sa_table('one_row', engine) - returned_str = connection.execute(sqlalchemy.select( - expression.bindparam("好", unicode_str) - ).select_from(one_row)).scalar() + one_row = _get_sa_table("one_row", engine) + returned_str = connection.execute( + sqlalchemy.select(expression.bindparam("好", unicode_str)).select_from(one_row) + ).scalar() assert to_str(returned_str) == unicode_str +@pytest.mark.parametrize("engine", ["project_as_schema=true"], indirect=True) def test_reflect_schemas_with_project(odps, engine, connection): - try: - options.sqlalchemy.project_as_schema = True - insp = sqlalchemy.inspect(engine) - schemas = insp.get_schema_names() - assert odps.project in schemas - finally: - options.sqlalchemy.project_as_schema = False + insp = sqlalchemy.inspect(engine) + schemas = insp.get_schema_names() + assert odps.project in schemas def test_reflect_schemas(odps, engine, connection): insp = sqlalchemy.inspect(engine) schemas = insp.get_schema_names() - assert 'default' in schemas + assert "default" in schemas +@pytest.mark.parametrize("engine", [None, "cache_names=true"], indirect=True) def 
test_get_table_names(odps, engine, connection): def _new_list_tables(*_, **__): - yield odps.get_table('one_row') - yield odps.get_table('one_row_complex') - yield odps.get_table('dummy_table') - - with mock.patch("odps.core.ODPS.list_tables", new=_new_list_tables), \ - update_test_setting( - get_tables_filter=lambda x: x.startswith('one_row') or - x.startswith('dummy_table')): + yield odps.get_table("one_row") + yield odps.get_table("one_row_complex") + yield odps.get_table("dummy_table") + + with mock.patch( + "odps.core.ODPS.list_tables", new=_new_list_tables + ), update_test_setting( + get_tables_filter=lambda x: x.startswith("one_row") + or x.startswith("dummy_table") + ): meta = MetaData() meta.reflect(bind=engine) - assert 'one_row' in meta.tables - assert 'one_row_complex' in meta.tables + assert "one_row" in meta.tables + assert "one_row_complex" in meta.tables insp = sqlalchemy.inspect(engine) - assert 'dummy_table' in insp.get_table_names(schema=odps.project) + assert "dummy_table" in insp.get_table_names(schema=odps.project) + # make sure cache works well + assert "dummy_table" in insp.get_table_names(schema=odps.project) def test_has_table(engine, connection): insp = reflection.Inspector.from_engine(engine) - assert insp.has_table('one_row') is True - assert insp.has_table('this_table_does_not_exist') is False + assert insp.has_table("one_row") is True + assert insp.has_table("this_table_does_not_exist") is False def test_char_length(engine, connection): - one_row_complex = _get_sa_table('one_row_complex', engine, autoload=True) + one_row_complex = _get_sa_table("one_row_complex", engine, autoload=True) result = connection.execute( sqlalchemy.select(sqlalchemy.func.char_length(one_row_complex.c.string)) ).scalar() - assert result == len('a string') + assert result == len("a string") def test_reflect_select(engine, connection): """reflecttable should be able to fill in a table from the name""" - one_row_complex = _get_sa_table('one_row_complex', engine, autoload=True) + one_row_complex = _get_sa_table("one_row_complex", engine, autoload=True) assert len(one_row_complex.c) == 14 assert isinstance(one_row_complex.c.string, Column) row = connection.execute(one_row_complex.select()).fetchone() @@ -383,17 +437,21 @@ def test_reflect_select(engine, connection): def test_type_map(engine, connection): """sqlalchemy should use the dbapi_type_map to infer types from raw queries""" - row = connection.execute(text('SELECT * FROM one_row_complex')).fetchone() + row = connection.execute(text("SELECT * FROM one_row_complex")).fetchone() assert list(row) == _ONE_ROW_COMPLEX_CONTENTS def test_reserved_words(engine, connection): """odps uses backticks""" # Use keywords for the table/column name - fake_table = _get_sa_table('select', engine, Column('sort', sqlalchemy.types.String)) - query = str(fake_table.select().where(fake_table.c.sort == 'a').compile(bind=engine)) - assert '`select`' in query - assert '`sort`' in query + fake_table = _get_sa_table( + "select", engine, Column("sort", sqlalchemy.types.String) + ) + query = str( + fake_table.select().where(fake_table.c.sort == "a").compile(bind=engine) + ) + assert "`select`" in query + assert "`sort`" in query assert '"select"' not in query assert '"sort"' not in query @@ -401,28 +459,51 @@ def test_reserved_words(engine, connection): def test_lots_of_types(engine, connection): # take type list from sqlalchemy.types types = [ - 'INT', 'CHAR', 'VARCHAR', 'NCHAR', 'TEXT', 'Text', 'FLOAT', - 'NUMERIC', 'DECIMAL', 'TIMESTAMP', 'DATETIME', 'CLOB', 
'BLOB', - 'BOOLEAN', 'SMALLINT', 'DATE', 'TIME', - 'String', 'Integer', 'SmallInteger', - 'Numeric', 'Float', 'DateTime', 'Date', 'Time', 'LargeBinary', - 'Boolean', 'Unicode', 'UnicodeText', + "INT", + "CHAR", + "VARCHAR", + "NCHAR", + "TEXT", + "Text", + "FLOAT", + "NUMERIC", + "DECIMAL", + "TIMESTAMP", + "DATETIME", + "CLOB", + "BLOB", + "BOOLEAN", + "SMALLINT", + "DATE", + "TIME", + "String", + "Integer", + "SmallInteger", + "Numeric", + "Float", + "DateTime", + "Date", + "Time", + "LargeBinary", + "Boolean", + "Unicode", + "UnicodeText", ] cols = [] for i, t in enumerate(types): cols.append(Column(str(i), getattr(sqlalchemy.types, t))) - table = _get_sa_table('test_table', engine, *cols) + table = _get_sa_table("test_table", engine, *cols) table.drop(bind=engine, checkfirst=True) table.create(bind=engine) table.drop(bind=engine) def test_insert_select(engine, connection): - one_row = _get_sa_table('one_row', engine, autoload=True) - table = _get_sa_table('insert_test', engine, Column('a', sqlalchemy.types.Integer)) + one_row = _get_sa_table("one_row", engine, autoload=True) + table = _get_sa_table("insert_test", engine, Column("a", sqlalchemy.types.Integer)) table.drop(bind=engine, checkfirst=True) table.create(bind=engine) - connection.execute(table.insert().from_select(['a'], one_row.select())) + connection.execute(table.insert().from_select(["a"], one_row.select())) result = connection.execute(table.select()).fetchall() expected = [(1,)] @@ -430,12 +511,10 @@ def test_insert_select(engine, connection): def test_insert_values(engine, connection): - table = _get_sa_table( - 'insert_test', engine, Column('a', sqlalchemy.types.Integer) - ) + table = _get_sa_table("insert_test", engine, Column("a", sqlalchemy.types.Integer)) table.drop(bind=engine, checkfirst=True) table.create(bind=engine) - connection.execute(table.insert().values([{'a': 1}, {'a': 2}])) + connection.execute(table.insert().values([{"a": 1}, {"a": 2}])) result = connection.execute(table.select()).fetchall() expected = [(1,), (2,)] @@ -447,7 +526,7 @@ def test_supports_san_rowcount(engine, connection): def test_desc_sql(engine, connection): - sql = 'desc one_row' + sql = "desc one_row" result = connection.execute(text(sql)).fetchall() assert len(result) == 1 assert len(result[0]) == 1 @@ -455,4 +534,30 @@ def test_desc_sql(engine, connection): def test_table_comment(engine, connection): insp = sqlalchemy.inspect(engine) - assert insp.get_table_comment('one_row')['text'] == '' + assert insp.get_table_comment("one_row")["text"] == "" + + +def test_do_ping(engine, connection): + engine.dialect.do_ping(engine.raw_connection()) + err_data = None + + def _patched_do_ping(*_, **__): + raise err_data + + with mock.patch( + "sqlalchemy.engine.default.DefaultDialect.do_ping", new=_patched_do_ping + ): + err_data = errors.InternalServerError("InternalServerError") + with pytest.raises(errors.InternalServerError): + engine.dialect.do_ping(engine.raw_connection()) + + err_data = errors.NoSuchObject("no_such_obj") + with pytest.raises(errors.NoSuchObject): + engine.dialect.do_ping(engine.raw_connection()) + + with pytest.raises(ODPSPingError) as ex_data, mock.patch( + "odps.sqlalchemy_odps.ODPSDialect._is_stack_superset", + new=lambda *_: True, + ): + engine.dialect.do_ping(engine.raw_connection()) + assert not isinstance(ex_data.value, RuntimeError) diff --git a/odps/tests/test_superset_odps.py b/odps/tests/test_superset_odps.py index 96be9a3d..87cb8fa8 100644 --- a/odps/tests/test_superset_odps.py +++ 
b/odps/tests/test_superset_odps.py @@ -16,14 +16,14 @@ def ss_db_inspector(odps, request): import sqlalchemy as sa - engine_url = 'odps://{}:{}@{}/?endpoint={}&SKYNET_PYODPS_HINT=hint'.format( + engine_url = "odps://{}:{}@{}/?endpoint={}&SKYNET_PYODPS_HINT=hint".format( odps.account.access_id, odps.account.secret_access_key, odps.project, odps.endpoint, ) - if getattr(request, "param", False): - engine_url += "&reuse_odps=true" + if getattr(request, "param", None): + engine_url += "&" + request.param # create an engine to enable cache sa.create_engine(engine_url) sa_engine = sa.create_engine(engine_url) @@ -38,6 +38,7 @@ def ss_db_inspector(odps, request): from .. import sqlalchemy_odps sqlalchemy_odps._sqlalchemy_global_reusable_odps.clear() + sqlalchemy_odps._sqlalchemy_obj_list_cache.clear() def test_get_table_names(ss_db_inspector): @@ -56,13 +57,18 @@ def _new_list_tables(*args, **kwargs): assert tables[0] is not None -@pytest.mark.parametrize("ss_db_inspector", [False, True], indirect=True) +@pytest.mark.parametrize( + "ss_db_inspector", [None, "reuse_odps=true&cache_names=true"], indirect=True +) def test_get_function_names(ss_db_inspector): db, inspector = ss_db_inspector spec = ODPSEngineSpec() functions = list(spec.get_function_names(db)) assert len(functions) > 0 + # make sure cache works + functions = list(spec.get_function_names(db)) + assert len(functions) > 0 def test_get_catalog_names(odps, ss_db_inspector): @@ -82,8 +88,10 @@ def test_execute_sql(odps): odps.delete_table(table_name, if_exists=True) conn = dbapi.connect( - odps.account.access_id, odps.account.secret_access_key, - odps.project, odps.endpoint + odps.account.access_id, + odps.account.secret_access_key, + odps.project, + odps.endpoint, ) cursor = conn.cursor() @@ -129,7 +137,7 @@ def test_latest_partition(odps, ss_db_inspector): def test_df_to_sql(odps, ss_db_inspector): - db, inspector = ss_db_inspector + db, _inspector = ss_db_inspector SSTable = namedtuple("SSTable", "table schema") spec = ODPSEngineSpec() diff --git a/odps/tests/test_tempobjs.py b/odps/tests/test_tempobjs.py index 997f1336..fed7e585 100644 --- a/odps/tests/test_tempobjs.py +++ b/odps/tests/test_tempobjs.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,19 +15,19 @@ # limitations under the License. import copy +import json import os -import sys import subprocess +import sys import tempfile -import json from time import sleep import pytest from .. 
import tempobj, utils -from .core import tn, in_coverage_mode +from .core import in_coverage_mode, tn -TEMP_TABLE_NAME = tn('pyodps_test_tempobj_cleanup') +TEMP_TABLE_NAME = tn("pyodps_test_tempobj_cleanup") SECONDARY_PROCESS_CODE = """ #-*- coding:utf-8 -*- @@ -139,52 +139,63 @@ def evaluate(self, arg0): fun = odps.create_function(function_name, class_type=resource_name + '.TempFun', resources=[res, ]) """ + @pytest.fixture(autouse=True) def pack_cleanup_script(): if in_coverage_mode(): # pack cleanup script - tempobj.CLEANUP_SCRIPT_TMPL = 'from odps.tests.core import start_coverage\nstart_coverage()\n\n' + \ - tempobj.CLEANUP_SCRIPT_TMPL + tempobj.CLEANUP_SCRIPT_TMPL = ( + "from odps.tests.core import start_coverage\nstart_coverage()\n\n" + + tempobj.CLEANUP_SCRIPT_TMPL + ) tempobj._obj_repos = tempobj.ObjectRepositoryLib() def _get_odps_json(odps): - return json.dumps(dict(access_id=odps.account.access_id, - secret_access_key=odps.account.secret_access_key, - project=odps.project, - endpoint=odps.endpoint)) + return json.dumps( + dict( + access_id=odps.account.access_id, + secret_access_key=odps.account.secret_access_key, + project=odps.project, + endpoint=odps.endpoint, + ) + ) def test_temp_object(): class TestTempObject(tempobj.TempObject): - _type = 'Temp' - __slots__ = 'param1', 'param2' + _type = "Temp" + __slots__ = "param1", "param2" class TestTempObject2(TestTempObject): - _type = 'Temp2' + _type = "Temp2" - obj1 = TestTempObject('v1', param2='v2') - assert (obj1.param1, obj1.param2) == ('v1', 'v2') - obj2 = TestTempObject('v1', 'v2') + obj1 = TestTempObject("v1", param2="v2") + assert (obj1.param1, obj1.param2) == ("v1", "v2") + obj2 = TestTempObject("v1", "v2") assert obj1 == obj2 - assert obj1 != 'String' + assert obj1 != "String" assert hash(obj1) == hash(obj2) - assert obj1 != TestTempObject2('v1', 'v2') + assert obj1 != TestTempObject2("v1", "v2") def test_drop(odps): - tempobj.register_temp_table(odps, 'non_exist_table') - tempobj.register_temp_model(odps, 'non_exist_model') - tempobj.register_temp_function(odps, 'non_exist_function') - tempobj.register_temp_resource(odps, 'non_exist_resource') - tempobj.register_temp_volume_partition(odps, ('non_exist_vol', 'non_exist_vol_part')) + tempobj.register_temp_table(odps, "non_exist_table") + tempobj.register_temp_model(odps, "non_exist_model") + tempobj.register_temp_function(odps, "non_exist_function") + tempobj.register_temp_resource(odps, "non_exist_resource") + tempobj.register_temp_volume_partition( + odps, ("non_exist_vol", "non_exist_vol_part") + ) tempobj.clean_stored_objects(odps) def test_cleanup(odps): - odps.execute_sql('drop table if exists {0}'.format(TEMP_TABLE_NAME)) - odps.execute_sql('create table {0} (col1 string) lifecycle 1'.format(TEMP_TABLE_NAME)) + odps.execute_sql("drop table if exists {0}".format(TEMP_TABLE_NAME)) + odps.execute_sql( + "create table {0} (col1 string) lifecycle 1".format(TEMP_TABLE_NAME) + ) tempobj.register_temp_table(odps, TEMP_TABLE_NAME) tempobj.clean_objects(odps, use_threads=True) sleep(10) @@ -192,8 +203,10 @@ def test_cleanup(odps): def test_cleanup_script(odps): - odps.execute_sql('drop table if exists {0}'.format(TEMP_TABLE_NAME)) - odps.execute_sql('create table {0} (col1 string) lifecycle 1'.format(TEMP_TABLE_NAME)) + odps.execute_sql("drop table if exists {0}".format(TEMP_TABLE_NAME)) + odps.execute_sql( + "create table {0} (col1 string) lifecycle 1".format(TEMP_TABLE_NAME) + ) tempobj.register_temp_table(odps, TEMP_TABLE_NAME) tempobj._obj_repos._exec_cleanup_script() @@ 
-202,29 +215,42 @@ def test_cleanup_script(odps): def test_multi_process(odps): - odps.execute_sql('drop table if exists {0}'.format(TEMP_TABLE_NAME)) + odps.execute_sql("drop table if exists {0}".format(TEMP_TABLE_NAME)) - odps.execute_sql('create table {0} (col1 string) lifecycle 1'.format(TEMP_TABLE_NAME)) + odps.execute_sql( + "create table {0} (col1 string) lifecycle 1".format(TEMP_TABLE_NAME) + ) tempobj.register_temp_table(odps, TEMP_TABLE_NAME) - script = SECONDARY_PROCESS_CODE.format(odps_info=_get_odps_json(odps), import_paths=json.dumps(sys.path)) + script = SECONDARY_PROCESS_CODE.format( + odps_info=_get_odps_json(odps), import_paths=json.dumps(sys.path) + ) - script_name = tempfile.gettempdir() + os.sep + 'tmp_' + str(os.getpid()) + '_secondary_script.py' - with open(script_name, 'w') as script_file: + script_name = ( + tempfile.gettempdir() + + os.sep + + "tmp_" + + str(os.getpid()) + + "_secondary_script.py" + ) + with open(script_name, "w") as script_file: script_file.write(script) script_file.close() env = copy.deepcopy(os.environ) - env.update({'WAIT_CLEANUP': '1'}) + env.update({"WAIT_CLEANUP": "1"}) subprocess.call([sys.executable, script_name], close_fds=True, env=env) sleep(10) assert odps.exist_table(TEMP_TABLE_NAME) - odps.run_sql('drop table {0}'.format(TEMP_TABLE_NAME)) + odps.run_sql("drop table {0}".format(TEMP_TABLE_NAME)) def test_plenty_create(odps): - del_insts = [odps.run_sql('drop table {0}'.format(tn('tmp_pyodps_create_temp_%d' % n))) for n in range(10)] + del_insts = [ + odps.run_sql("drop table {0}".format(tn("tmp_pyodps_create_temp_%d" % n))) + for n in range(10) + ] [inst.wait_for_completion() for inst in del_insts] script = PLENTY_CREATE_CODE.format( @@ -232,17 +258,21 @@ def test_plenty_create(odps): import_paths=utils.to_text(json.dumps(sys.path)), ) - script_name = tempfile.gettempdir() + os.sep + 'tmp_' + str(os.getpid()) + '_plenty_script.py' - with open(script_name, 'wb') as script_file: + script_name = ( + tempfile.gettempdir() + os.sep + "tmp_" + str(os.getpid()) + "_plenty_script.py" + ) + with open(script_name, "wb") as script_file: script_file.write(script.encode()) script_file.close() env = copy.deepcopy(os.environ) - env.update({'WAIT_CLEANUP': '1'}) + env.update({"WAIT_CLEANUP": "1"}) subprocess.call([sys.executable, script_name], close_fds=True, env=env) sleep(5) trial = 4 - case = lambda: all(not odps.exist_table(tn('tmp_pyodps_create_temp_%d' % tid)) for tid in range(10)) + case = lambda: all( + not odps.exist_table(tn("tmp_pyodps_create_temp_%d" % tid)) for tid in range(10) + ) while not case(): trial -= 1 sleep(5) @@ -251,20 +281,28 @@ def test_plenty_create(odps): def test_temp_functions(odps): - resource_name = tn('pyodps_test_tempobj_temp_resource') + '.py' - function_name = tn('pyodps_test_tempobj_temp_function') + resource_name = tn("pyodps_test_tempobj_temp_resource") + ".py" + function_name = tn("pyodps_test_tempobj_temp_function") script = TEMP_FUNCTION_CODE.format( - odps_info=_get_odps_json(odps), import_paths=json.dumps(sys.path), - resource_name=resource_name, function_name=function_name, + odps_info=_get_odps_json(odps), + import_paths=json.dumps(sys.path), + resource_name=resource_name, + function_name=function_name, ) - script_name = tempfile.gettempdir() + os.sep + 'tmp_' + str(os.getpid()) + '_temp_functions.py' - with open(script_name, 'w') as script_file: + script_name = ( + tempfile.gettempdir() + + os.sep + + "tmp_" + + str(os.getpid()) + + "_temp_functions.py" + ) + with open(script_name, "w") as script_file: 
script_file.write(script) script_file.close() env = copy.deepcopy(os.environ) - env.update({'WAIT_CLEANUP': '1'}) + env.update({"WAIT_CLEANUP": "1"}) subprocess.call([sys.executable, script_name], close_fds=True, env=env) sleep(10) diff --git a/odps/tests/test_types.py b/odps/tests/test_types.py index 21d1f35b..57e30746 100644 --- a/odps/tests/test_types.py +++ b/odps/tests/test_types.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ # limitations under the License. import copy +import datetime import decimal as _decimal from collections import OrderedDict # noqa: F401 @@ -25,10 +26,9 @@ except ImportError: pd = None -from ..types import * -from ..tests.core import py_and_c, pandas_case - -from datetime import datetime +from .. import options +from .. import types as odps_types +from ..tests.core import pandas_case, py_and_c def _reloader(): @@ -44,15 +44,32 @@ def _reloader(): @py_and_c_deco def test_nullable_record(): - col_types = ['tinyint', 'smallint', 'int', 'bigint', 'float', 'double', - 'string', 'datetime', 'boolean', 'decimal', 'binary', 'decimal(10, 2)', - 'interval_year_month', 'json', 'char(20)', 'varchar(20)', - 'array', 'map', 'struct>'] + col_types = [ + "tinyint", + "smallint", + "int", + "bigint", + "float", + "double", + "string", + "datetime", + "boolean", + "decimal", + "binary", + "decimal(10, 2)", + "interval_year_month", + "json", + "char(20)", + "varchar(20)", + "array", + "map", + "struct>", + ] if pd is not None: - col_types.extend(['interval_day_time', 'timestamp', 'timestamp_ntz']) + col_types.extend(["interval_day_time", "timestamp", "timestamp_ntz"]) s = TableSchema.from_lists( - ['col%s' % i for i in range(len(col_types))], + ["col%s" % i for i in range(len(col_types))], col_types, ) r = Record(schema=s, values=[None] * len(col_types)) @@ -69,17 +86,32 @@ def test_record_max_field_size(): r["col"] = "e" * 1025 r = Record(schema=s) - r["col"] = "e" * String._max_length + r["col"] = "e" * odps_types.String._max_length with pytest.raises(ValueError): - r["col"] = "e" * (String._max_length + 1) + r["col"] = "e" * (odps_types.String._max_length + 1) + + r = Record(schema=s, max_field_size=0) + r["col"] = "e" * odps_types.String._max_length + with pytest.raises(ValueError): + r["col"] = "e" * (odps_types.String._max_length + 1) @py_and_c_deco def test_record_set_and_get_by_index(): s = TableSchema.from_lists( - ['col%s' % i for i in range(9)], - ['bigint', 'double', 'string', 'datetime', 'boolean', 'decimal', 'json', - 'array', 'map']) + ["col%s" % i for i in range(9)], + [ + "bigint", + "double", + "string", + "datetime", + "boolean", + "decimal", + "json", + "array", + "map", + ], + ) s.build_snapshot() if options.force_py: assert s._snapshot is None @@ -89,84 +121,108 @@ def test_record_set_and_get_by_index(): r = Record(schema=s) r[0] = 1 r[1] = 1.2 - r[2] = 'abc' - r[3] = datetime(2016, 1, 1) + r[2] = "abc" + r[3] = datetime.datetime(2016, 1, 1) r[4] = True - r[5] = _decimal.Decimal('1.111') + r[5] = _decimal.Decimal("1.111") r[6] = {"root": {"key": "value"}} - r[7] = ['a', 'b'] - r[8] = OrderedDict({'a': 1}) + r[7] = ["a", "b"] + r[8] = OrderedDict({"a": 1}) assert list(r.values) == [ - 1, 1.2, 'abc', datetime(2016, 1, 1), True, _decimal.Decimal('1.111'), - {"root": {"key": "value"}}, ['a', 
'b'], OrderedDict({'a': 1}), + 1, + 1.2, + "abc", + datetime.datetime(2016, 1, 1), + True, + _decimal.Decimal("1.111"), + {"root": {"key": "value"}}, + ["a", "b"], + OrderedDict({"a": 1}), ] assert 1 == r[0] assert 1.2 == r[1] - assert 'abc' == r[2] - assert datetime(2016, 1, 1) == r[3] + assert "abc" == r[2] + assert datetime.datetime(2016, 1, 1) == r[3] assert r[4] is True - assert _decimal.Decimal('1.111') == r[5] + assert _decimal.Decimal("1.111") == r[5] assert {"root": {"key": "value"}} == r[6] - assert ['a', 'b'] == r[7] - assert OrderedDict({'a': 1}) == r[8] + assert ["a", "b"] == r[7] + assert OrderedDict({"a": 1}) == r[8] assert [1, 1.2] == r[:2] @py_and_c_deco def test_record_set_and_get_by_name(): s = TableSchema.from_lists( - ['col%s' % i for i in range(9)], - ['bigint', 'double', 'string', 'datetime', 'boolean', 'decimal', 'json', - 'array', 'map']) + ["col%s" % i for i in range(9)], + [ + "bigint", + "double", + "string", + "datetime", + "boolean", + "decimal", + "json", + "array", + "map", + ], + ) r = Record(schema=s) - r['col0'] = 1 - r['col1'] = 1.2 - r['col2'] = 'abc' - r['col3'] = datetime(2016, 1, 1) - r['col4'] = True - r['col5'] = _decimal.Decimal('1.111') - r['col6'] = {"root": {"key": "value"}} - r['col7'] = ['a', 'b'] - r['col8'] = OrderedDict({'a': 1}) + r["col0"] = 1 + r["col1"] = 1.2 + r["col2"] = "abc" + r["col3"] = datetime.datetime(2016, 1, 1) + r["col4"] = True + r["col5"] = _decimal.Decimal("1.111") + r["col6"] = {"root": {"key": "value"}} + r["col7"] = ["a", "b"] + r["col8"] = OrderedDict({"a": 1}) assert list(r.values) == [ - 1, 1.2, 'abc', datetime(2016, 1, 1), True, _decimal.Decimal('1.111'), - {"root": {"key": "value"}}, ['a', 'b'], OrderedDict({'a': 1}) + 1, + 1.2, + "abc", + datetime.datetime(2016, 1, 1), + True, + _decimal.Decimal("1.111"), + {"root": {"key": "value"}}, + ["a", "b"], + OrderedDict({"a": 1}), ] - assert 1 == r['col0'] - assert 1.2 == r['col1'] - assert 'abc' == r['col2'] - assert datetime(2016, 1, 1) == r['col3'] - assert r['col4'] is True - assert _decimal.Decimal('1.111') == r['col5'] - assert {"root": {"key": "value"}} == r['col6'] - assert ['a', 'b'] == r['col7'] - assert OrderedDict({'a': 1}) == r['col8'] + assert 1 == r["col0"] + assert 1.2 == r["col1"] + assert "abc" == r["col2"] + assert datetime.datetime(2016, 1, 1) == r["col3"] + assert r["col4"] is True + assert _decimal.Decimal("1.111") == r["col5"] + assert {"root": {"key": "value"}} == r["col6"] + assert ["a", "b"] == r["col7"] + assert OrderedDict({"a": 1}) == r["col8"] def test_implicit_cast(): - tinyint = Tinyint() - smallint = Smallint() - int_ = Int() - bigint = Bigint() - float = Float() - double = Double() - datetime = Datetime() - bool = Boolean() - decimal = Decimal() - string = String() - json = Json() + tinyint = odps_types.Tinyint() + smallint = odps_types.Smallint() + int_ = odps_types.Int() + bigint = odps_types.Bigint() + float = odps_types.Float() + double = odps_types.Double() + datetime_ = odps_types.Datetime() + bool = odps_types.Boolean() + decimal = odps_types.Decimal() + string = odps_types.String() + json = odps_types.Json() assert double.can_implicit_cast(bigint) assert string.can_implicit_cast(bigint) assert decimal.can_implicit_cast(bigint) assert not bool.can_implicit_cast(bigint) - assert not datetime.can_implicit_cast(bigint) + assert not datetime_.can_implicit_cast(bigint) assert bigint.can_implicit_cast(double) assert string.can_implicit_cast(double) assert decimal.can_implicit_cast(double) assert not bool.can_implicit_cast(double) - 
assert not datetime.can_implicit_cast(double) + assert not datetime_.can_implicit_cast(double) assert smallint.can_implicit_cast(tinyint) assert int_.can_implicit_cast(tinyint) @@ -190,80 +246,83 @@ def test_implicit_cast(): def test_composite_types(): - comp_type = validate_data_type('decimal') - assert isinstance(comp_type, Decimal) + comp_type = odps_types.validate_data_type("decimal") + assert isinstance(comp_type, odps_types.Decimal) - comp_type = validate_data_type('decimal(10)') - assert isinstance(comp_type, Decimal) + comp_type = odps_types.validate_data_type("decimal(10)") + assert isinstance(comp_type, odps_types.Decimal) assert comp_type.precision == 10 - comp_type = validate_data_type('decimal(10, 2)') - assert isinstance(comp_type, Decimal) + comp_type = odps_types.validate_data_type("decimal(10, 2)") + assert isinstance(comp_type, odps_types.Decimal) assert comp_type.precision == 10 assert comp_type.scale == 2 - comp_type = validate_data_type('varchar(10)') - assert isinstance(comp_type, Varchar) + comp_type = odps_types.validate_data_type("varchar(10)") + assert isinstance(comp_type, odps_types.Varchar) assert comp_type.size_limit == 10 - comp_type = validate_data_type('char(20)') - assert isinstance(comp_type, Char) + comp_type = odps_types.validate_data_type("char(20)") + assert isinstance(comp_type, odps_types.Char) assert comp_type.size_limit == 20 with pytest.raises(ValueError) as ex_info: - validate_data_type('array') - assert 'ARRAY' in str(ex_info.value) + odps_types.validate_data_type("array") + assert "ARRAY" in str(ex_info.value) - comp_type = validate_data_type('array') - assert isinstance(comp_type, Array) - assert isinstance(comp_type.value_type, Bigint) + comp_type = odps_types.validate_data_type("array") + assert isinstance(comp_type, odps_types.Array) + assert isinstance(comp_type.value_type, odps_types.Bigint) with pytest.raises(ValueError) as ex_info: - validate_data_type('map') - assert 'MAP' in str(ex_info.value) + odps_types.validate_data_type("map") + assert "MAP" in str(ex_info.value) - comp_type = validate_data_type('map') - assert isinstance(comp_type, Map) - assert isinstance(comp_type.key_type, Bigint) - assert isinstance(comp_type.value_type, String) + comp_type = odps_types.validate_data_type("map") + assert isinstance(comp_type, odps_types.Map) + assert isinstance(comp_type.key_type, odps_types.Bigint) + assert isinstance(comp_type.value_type, odps_types.String) - comp_type = validate_data_type('struct') - assert isinstance(comp_type, Struct) + comp_type = odps_types.validate_data_type("struct") + assert isinstance(comp_type, odps_types.Struct) assert len(comp_type.field_types) == 2 - assert isinstance(comp_type.field_types['abc'], Int) - assert isinstance(comp_type.field_types['def'], String) + assert isinstance(comp_type.field_types["abc"], odps_types.Int) + assert isinstance(comp_type.field_types["def"], odps_types.String) - comp_type = validate_data_type('struct, ghi:string>') - assert isinstance(comp_type, Struct) + comp_type = odps_types.validate_data_type( + "struct, ghi:string>" + ) + assert isinstance(comp_type, odps_types.Struct) assert len(comp_type.field_types) == 3 - assert isinstance(comp_type.field_types['abc'], Int) - assert isinstance(comp_type.field_types['def'], Map) - assert isinstance(comp_type.field_types['def'].key_type, Bigint) - assert isinstance(comp_type.field_types['def'].value_type, String) - assert isinstance(comp_type.field_types['ghi'], String) + assert isinstance(comp_type.field_types["abc"], odps_types.Int) + 
assert isinstance(comp_type.field_types["def"], odps_types.Map) + assert isinstance(comp_type.field_types["def"].key_type, odps_types.Bigint) + assert isinstance(comp_type.field_types["def"].value_type, odps_types.String) + assert isinstance(comp_type.field_types["ghi"], odps_types.String) @py_and_c_deco def test_set_with_cast(): s = TableSchema.from_lists( - ['bigint', 'double', 'string', 'datetime', 'boolean', 'decimal'], - ['bigint', 'double', 'string', 'datetime', 'boolean', 'decimal']) + ["bigint", "double", "string", "datetime", "boolean", "decimal"], + ["bigint", "double", "string", "datetime", "boolean", "decimal"], + ) r = Record(schema=s) - r['double'] = 1 - assert 1.0 == r['double'] - r['double'] = '1.33' - assert 1.33 == r['double'] - r['bigint'] = 1.1 - assert 1 == r['bigint'] - r['datetime'] = '2016-01-01 0:0:0' - assert datetime(2016, 1, 1) == r['datetime'] + r["double"] = 1 + assert 1.0 == r["double"] + r["double"] = "1.33" + assert 1.33 == r["double"] + r["bigint"] = 1.1 + assert 1 == r["bigint"] + r["datetime"] = "2016-01-01 0:0:0" + assert datetime.datetime(2016, 1, 1) == r["datetime"] @py_and_c_deco def test_record_copy(): - s = TableSchema.from_lists(['col1'], ['string']) + s = TableSchema.from_lists(["col1"], ["string"]) r = Record(schema=s) - r.col1 = 'a' + r.col1 = "a" cr = copy.copy(r) assert cr.col1 == r.col1 @@ -271,55 +330,58 @@ def test_record_copy(): @py_and_c_deco def test_record_set_field(): - s = TableSchema.from_lists(['col1'], ['string']) + s = TableSchema.from_lists(["col1"], ["string"]) r = Record(schema=s) - r.col1 = 'a' - assert r.col1 == 'a' + r.col1 = "a" + assert r.col1 == "a" - r['col1'] = 'b' - assert r['col1'] == 'b' + r["col1"] = "b" + assert r["col1"] == "b" - r[0] = 'c' - assert r[0] == 'c' - assert r['col1'] == 'c' + r[0] = "c" + assert r[0] == "c" + assert r["col1"] == "c" @py_and_c_deco def test_duplicate_names(): - pytest.raises(ValueError, lambda: TableSchema.from_lists(['col1', 'col1'], ['string', 'string'])) + pytest.raises( + ValueError, + lambda: TableSchema.from_lists(["col1", "col1"], ["string", "string"]), + ) try: - TableSchema.from_lists(['col1', 'col1'], ['string', 'string']) + TableSchema.from_lists(["col1", "col1"], ["string", "string"]) except ValueError as e: - assert 'col1' in str(e) + assert "col1" in str(e) @py_and_c_deco def test_chinese_schema(): - s = TableSchema.from_lists([u'用户'], ['string'], ['分区'], ['bigint']) - assert '用户' in s - assert s.get_column('用户').type.name == 'string' - assert s.get_partition(u'分区').type.name == 'bigint' - assert s['用户'].type.name == 'string' - assert s[u'分区'].type.name == 'bigint' - - s2 = TableSchema.from_lists(['用户'], ['string'], [u'分区'], ['bigint']) + s = TableSchema.from_lists([u"用户"], ["string"], ["分区"], ["bigint"]) + assert "用户" in s + assert s.get_column("用户").type.name == "string" + assert s.get_partition(u"分区").type.name == "bigint" + assert s["用户"].type.name == "string" + assert s[u"分区"].type.name == "bigint" + + s2 = TableSchema.from_lists(["用户"], ["string"], [u"分区"], ["bigint"]) assert s == s2 @py_and_c_deco def test_record_multi_fields(): - s = TableSchema.from_lists(['col1', 'col2'], ['string', 'bigint']) + s = TableSchema.from_lists(["col1", "col2"], ["string", "bigint"]) r = Record(values=[1, 2], schema=s) - assert r['col1', 'col2'] == ['1', 2] + assert r["col1", "col2"] == ["1", 2] - pytest.raises(KeyError, lambda: r['col3']) - pytest.raises(KeyError, lambda: r['col3', ]) + pytest.raises(KeyError, lambda: r["col3"]) + pytest.raises(KeyError, lambda: r["col3"]) 
@py_and_c_deco def test_bizarre_repr(): - s = TableSchema.from_lists(['逗比 " \t'], ['string'], ['正常'], ['bigint']) + s = TableSchema.from_lists(['逗比 " \t'], ["string"], ["正常"], ["bigint"]) s_repr = repr(s) assert '"逗比 \\" \\t"' in s_repr assert '"正常"' not in s_repr @@ -329,17 +391,17 @@ def test_bizarre_repr(): def test_string_as_binary(): try: options.tunnel.string_as_binary = True - s = TableSchema.from_lists(['col1', 'col2'], ['string', 'bigint']) + s = TableSchema.from_lists(["col1", "col2"], ["string", "bigint"]) r = Record(values=[1, 2], schema=s) - assert r['col1', 'col2'] == [b'1', 2] + assert r["col1", "col2"] == [b"1", 2] assert isinstance(r[0], bytes) - r[0] = u'junk' - assert r[0] == b'junk' + r[0] = u"junk" + assert r[0] == b"junk" assert isinstance(r[0], bytes) - r[0] = b'junk' - assert r[0] == b'junk' + r[0] = b"junk" + assert r[0] == b"junk" assert isinstance(r[0], bytes) finally: options.tunnel.string_as_binary = False @@ -348,54 +410,54 @@ def test_string_as_binary(): def test_validate_struct(): try: options.struct_as_dict = True - struct_type = validate_data_type('struct') - assert validate_value(None, struct_type) is None + struct_type = odps_types.validate_data_type("struct") + assert odps_types.validate_value(None, struct_type) is None - vl = validate_value((10, "uvwxyz"), struct_type) + vl = odps_types.validate_value((10, "uvwxyz"), struct_type) assert isinstance(vl, dict) assert vl["abc"] == 10 assert vl["def"] == "uvwxyz" - vl = validate_value({"abc": 10, "def": "uvwxyz"}, struct_type) + vl = odps_types.validate_value({"abc": 10, "def": "uvwxyz"}, struct_type) assert isinstance(vl, dict) assert vl["abc"] == 10 assert vl["def"] == "uvwxyz" with pytest.raises(ValueError): - validate_value({"abcd", "efgh"}, struct_type) + odps_types.validate_value({"abcd", "efgh"}, struct_type) options.struct_as_dict = False - struct_type = validate_data_type('struct') - vl = validate_value((10, "uvwxyz"), struct_type) + struct_type = odps_types.validate_data_type("struct") + vl = odps_types.validate_value((10, "uvwxyz"), struct_type) assert isinstance(vl, tuple) assert vl == (10, "uvwxyz") - vl = validate_value({"def": "uvwxyz", "abc": 10}, struct_type) + vl = odps_types.validate_value({"def": "uvwxyz", "abc": 10}, struct_type) assert isinstance(vl, tuple) assert vl == (10, "uvwxyz") with pytest.raises(ValueError): - validate_value({"abcd", "efgh"}, struct_type) + odps_types.validate_value({"abcd", "efgh"}, struct_type) finally: options.struct_as_dict = False def test_validate_decimal(): with pytest.raises(ValueError): - Decimal(1024) + odps_types.Decimal(1024) with pytest.raises(ValueError): - Decimal(32, 60) + odps_types.Decimal(32, 60) with pytest.raises(ValueError): - Decimal(None, 10) + odps_types.Decimal(None, 10) - assert repr(Decimal()) == "decimal" - assert repr(Decimal(20)) == "decimal(20)" - assert repr(Decimal(20, 10)) == "decimal(20,10)" + assert repr(odps_types.Decimal()) == "decimal" + assert repr(odps_types.Decimal(20)) == "decimal(20)" + assert repr(odps_types.Decimal(20, 10)) == "decimal(20,10)" - assert Decimal(20, 5) == Decimal(20, 5) - assert Decimal(10) == "decimal(10)" + assert odps_types.Decimal(20, 5) == odps_types.Decimal(20, 5) + assert odps_types.Decimal(10) == "decimal(10)" - decimal_type = Decimal(10, 5) + decimal_type = odps_types.Decimal(10, 5) decimal_type.validate_value(None) decimal_type.validate_value(_decimal.Decimal("123456789.1")) decimal_type.validate_value(_decimal.Decimal("123456789.12345")) @@ -406,11 +468,11 @@ def test_validate_decimal(): 
@pandas_case def test_validate_timestamp(): with pytest.raises(ValueError): - validate_value("abcdef", timestamp) + odps_types.validate_value("abcdef", odps_types.timestamp) - vl = validate_value("2023-12-19 14:24:31", timestamp) + vl = odps_types.validate_value("2023-12-19 14:24:31", odps_types.timestamp) assert vl == pd.Timestamp("2023-12-19 14:24:31") ts = pd.Timestamp("2023-12-19 14:24:31") - vl = validate_value(ts, timestamp) + vl = odps_types.validate_value(ts, odps_types.timestamp) assert vl == ts diff --git a/odps/tests/test_unixsocket.py b/odps/tests/test_unixsocket.py index 40dcfc67..720ef608 100644 --- a/odps/tests/test_unixsocket.py +++ b/odps/tests/test_unixsocket.py @@ -132,7 +132,8 @@ def test_unixsocket_access(odps): proxy_obj.start() local_endpoint = "http+unix://%s%s" % ( - quote_plus(sock_name), parsed_endpoint.path + quote_plus(sock_name), + parsed_endpoint.path, ) unix_odps = ODPS( account=odps.account, project=odps.project, endpoint=local_endpoint diff --git a/odps/tests/test_utils.py b/odps/tests/test_utils.py index 5573267d..054e91de 100644 --- a/odps/tests/test_utils.py +++ b/odps/tests/test_utils.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -27,89 +27,99 @@ from .. import utils from ..compat import long_type -from .core import module_depend_case + try: from ..src.utils_c import CMillisecondsConverter except ImportError: CMillisecondsConverter = None mytimetuple = namedtuple( - 'TimeTuple', - [s for s in dir(datetime.datetime.now().timetuple()) if s.startswith('tm_')] + "TimeTuple", + [s for s in dir(datetime.datetime.now().timetuple()) if s.startswith("tm_")], ) def test_replace_sql_parameters(): cases = [ { - 'ns': { - 'dss': ['20180101', '20180102', '20180101'], + "ns": { + "dss": ["20180101", "20180102", "20180101"], }, - 'sql': 'select * from dual where id in :dss', - 'expected': ["select * from dual where id in ('20180101', '20180102', '20180101')"] + "sql": "select * from dual where id in :dss", + "expected": [ + "select * from dual where id in ('20180101', '20180102', '20180101')" + ], }, { - 'ns': { - 'dss': set(['20180101', '20180102', '20180101']), + "ns": { + "dss": set(["20180101", "20180102", "20180101"]), }, - 'sql': 'select * from dual where id in :dss', - 'expected': [ + "sql": "select * from dual where id in :dss", + "expected": [ "select * from dual where id in ('20180101', '20180102')", - "select * from dual where id in ('20180102', '20180101')" - ] + "select * from dual where id in ('20180102', '20180101')", + ], }, { - 'ns': {'test1': 'new_test1', 'test3': 'new_\'test3\''}, - 'sql': 'select :test1 from dual where :test2 > 0 and f=:test3.abc', - 'expected': [r"select 'new_test1' from dual where :test2 > 0 and f='new_\'test3\''.abc"] + "ns": {"test1": "new_test1", "test3": 'new_\'test3\''}, + "sql": "select :test1 from dual where :test2 > 0 and f=:test3.abc", + "expected": [ + r"select 'new_test1' from dual where :test2 > 0 and f='new_\'test3\''.abc" + ], }, { - 'ns': { - 'dss': ('20180101', '20180102', 20180101), + "ns": { + "dss": ("20180101", "20180102", 20180101), }, - 'sql': 'select * from dual where id in :dss', - 'expected': ["select * from dual where id in ('20180101', '20180102', 20180101)"] + "sql": "select * from dual where id in :dss", + "expected": [ + "select * from dual where 
id in ('20180101', '20180102', 20180101)" + ], }, { - 'ns': { - 'ds': '20180101', - 'dss': ('20180101', 20180101), - 'id': 21312, - 'price': 6.4, - 'prices': (123, '123', 6.4) + "ns": { + "ds": "20180101", + "dss": ("20180101", 20180101), + "id": 21312, + "price": 6.4, + "prices": (123, "123", 6.4), }, - 'sql': 'select * from dual where ds = :ds or ds in :dss and id = :id ' - 'and price > :price or price in :prices', - 'expected': ["select * from dual where ds = '20180101' or ds in ('20180101', 20180101) " - "and id = 21312 and price > {0!r} or price in (123, '123', {0!r})".format(6.4)] - } + "sql": "select * from dual where ds = :ds or ds in :dss and id = :id " + "and price > :price or price in :prices", + "expected": [ + "select * from dual where ds = '20180101' or ds in ('20180101', 20180101) " + "and id = 21312 and price > {0!r} or price in (123, '123', {0!r})".format( + 6.4 + ) + ], + }, ] for case in cases: - assert utils.replace_sql_parameters(case['sql'], case['ns']) in case['expected'] + assert utils.replace_sql_parameters(case["sql"], case["ns"]) in case["expected"] def test_split_sql(): cases = [ { - 'sql': " select * from pyodps_iris", - 'parts': ["select * from pyodps_iris"], + "sql": " select * from pyodps_iris", + "parts": ["select * from pyodps_iris"], }, { - 'sql': """ + "sql": """ @val1 = select * from pyodps_iris; ; -- first stmt; select *, ';' as semicolon from @val1; """, - 'parts': [ + "parts": [ """ @val1 = select * from pyodps_iris; """, - "select *, ';' as semicolon from @val1;" + "select *, ';' as semicolon from @val1;", ], }, { - 'sql': r""" + "sql": r""" @val1 = (select category as `category;`, /* omitting stuff; */ from pyodps_iris) union @@ -117,35 +127,37 @@ def test_split_sql(): select *, '\';' as semicolon from @val1; /* blank line */ ; """, - 'parts': [ + "parts": [ """ @val1 = (select category as `category;`, from pyodps_iris) union (select category2 as `category;` from pyodps_iris2); """, - r"select *, '\';' as semicolon from @val1;" + r"select *, '\';' as semicolon from @val1;", ], }, ] for case in cases: - sql = textwrap.dedent(case['sql']) - parts = [textwrap.dedent(p).strip() for p in case['parts']] + sql = textwrap.dedent(case["sql"]) + parts = [textwrap.dedent(p).strip() for p in case["parts"]] assert utils.split_sql_by_semicolon(sql) == parts def test_experimental(): - @utils.experimental('Experimental method') + @utils.experimental("Experimental method") def fun(): pass try: - os.environ['PYODPS_EXPERIMENTAL'] = 'false' + os.environ["PYODPS_EXPERIMENTAL"] = "false" pytest.raises(utils.ExperimentalNotAllowed, fun) finally: - del os.environ['PYODPS_EXPERIMENTAL'] + del os.environ["PYODPS_EXPERIMENTAL"] -@pytest.mark.parametrize("force_py", (False, True) if CMillisecondsConverter else (True,)) +@pytest.mark.parametrize( + "force_py", (False, True) if CMillisecondsConverter else (True,) +) def test_time_convert_native(force_py): class GMT8(datetime.tzinfo): def utcoffset(self, dt): @@ -171,7 +183,9 @@ def tzname(self, dt): to_datetime = functools.partial(utils.to_datetime, force_py=force_py) base_time = datetime.datetime.now().replace(microsecond=0) - base_time_utc = datetime.datetime.utcfromtimestamp(time.mktime(base_time.timetuple())) + base_time_utc = datetime.datetime.utcfromtimestamp( + time.mktime(base_time.timetuple()) + ) milliseconds = long_type(time.mktime(base_time.timetuple())) * 1000 assert milliseconds == to_milliseconds(base_time, local_tz=True) @@ -181,7 +195,9 @@ def tzname(self, dt): assert to_datetime(milliseconds, local_tz=False) 
== base_time_utc base_time = datetime.datetime.now(tz=GMT8()).replace(microsecond=0) - milliseconds = long_type(calendar.timegm(base_time.astimezone(UTC()).timetuple())) * 1000 + milliseconds = ( + long_type(calendar.timegm(base_time.astimezone(UTC()).timetuple())) * 1000 + ) assert milliseconds == to_milliseconds(base_time, local_tz=True) assert milliseconds == to_milliseconds(base_time, local_tz=False) @@ -214,57 +230,63 @@ def tzname(self, dt): @pytest.mark.parametrize( - 'force_py,zone_func', + "force_py,zone_func", list(itertools.product(_force_py_args, _zone_funcs)), ) def test_time_convert_with_tz(force_py, zone_func): to_milliseconds = functools.partial(utils.to_milliseconds, force_py=force_py) to_datetime = functools.partial(utils.to_datetime, force_py=force_py) - base_time = datetime.datetime.now(tz=zone_func('Etc/GMT-8')).replace(microsecond=0) - milliseconds = long_type(calendar.timegm(base_time.astimezone(zone_func("UTC")).timetuple())) * 1000 + base_time = datetime.datetime.now(tz=zone_func("Etc/GMT-8")).replace(microsecond=0) + milliseconds = ( + long_type(calendar.timegm(base_time.astimezone(zone_func("UTC")).timetuple())) + * 1000 + ) - assert to_datetime(milliseconds, local_tz='Etc/GMT-8') == base_time + assert to_datetime(milliseconds, local_tz="Etc/GMT-8") == base_time base_time = base_time.replace(tzinfo=None) - assert milliseconds == to_milliseconds(base_time, local_tz='Etc/GMT-8') - assert milliseconds == to_milliseconds(base_time, local_tz=zone_func('Etc/GMT-8')) + assert milliseconds == to_milliseconds(base_time, local_tz="Etc/GMT-8") + assert milliseconds == to_milliseconds(base_time, local_tz=zone_func("Etc/GMT-8")) - base_time = datetime.datetime.now(tz=zone_func('Etc/GMT-8')).replace(microsecond=0) + base_time = datetime.datetime.now(tz=zone_func("Etc/GMT-8")).replace(microsecond=0) milliseconds = time.mktime(base_time.timetuple()) * 1000 assert milliseconds == to_milliseconds(base_time, local_tz=True) assert milliseconds == to_milliseconds(base_time, local_tz=False) - assert milliseconds == to_milliseconds(base_time, local_tz='Etc/GMT-1') - assert milliseconds == to_milliseconds(base_time, local_tz=zone_func('Etc/GMT-1')) + assert milliseconds == to_milliseconds(base_time, local_tz="Etc/GMT-1") + assert milliseconds == to_milliseconds(base_time, local_tz=zone_func("Etc/GMT-1")) def test_thread_local_attribute(): class TestClass(object): - _no_defaults = utils.thread_local_attribute('test_thread_local') - _defaults = utils.thread_local_attribute('test_thread_local', lambda: 'TestValue') + _no_defaults = utils.thread_local_attribute("test_thread_local") + _defaults = utils.thread_local_attribute( + "test_thread_local", lambda: "TestValue" + ) inst = TestClass() pytest.raises(AttributeError, lambda: inst._no_defaults) - assert inst._defaults == 'TestValue' + assert inst._defaults == "TestValue" - inst._no_defaults = 'TestManualValue1' - assert inst._no_defaults == 'TestManualValue1' - inst._defaults = 'TestManualValue2' - assert inst._defaults == 'TestManualValue2' + inst._no_defaults = "TestManualValue1" + assert inst._no_defaults == "TestManualValue1" + inst._defaults = "TestManualValue2" + assert inst._defaults == "TestManualValue2" from ..compat import futures + executor = futures.ThreadPoolExecutor(1) def test_fn(): pytest.raises(AttributeError, lambda: inst._no_defaults) - assert inst._defaults == 'TestValue' + assert inst._defaults == "TestValue" - inst._no_defaults = 'TestManualValue1' - assert inst._no_defaults == 'TestManualValue1' - inst._defaults = 
'TestManualValue2' - assert inst._defaults == 'TestManualValue2' + inst._no_defaults = "TestManualValue1" + assert inst._no_defaults == "TestManualValue1" + inst._defaults = "TestManualValue2" + assert inst._defaults == "TestManualValue2" executor.submit(test_fn).result() @@ -294,3 +316,34 @@ def arg_holder_kw(arg, async_=False, kwa=False, **kw): assert arg_holder(0, **{"async": True}) assert arg_holder(0, async_=True) assert arg_holder(0, wait=False) + + +def test_call_with_retry(): + retry_idx_list = [0] + + def func(delay=0): + if delay: + time.sleep(delay) + if retry_idx_list[0] < 3: + retry_idx_list[0] += 1 + raise ValueError + + # test cases for retry times + with pytest.raises(ValueError): + retry_idx_list[0] = 0 + utils.call_with_retry(func, retry_times=1, exc_type=(TypeError, ValueError)) + assert retry_idx_list[0] == 2 + + retry_idx_list[0] = 0 + utils.call_with_retry(func, retry_times=3, exc_type=(TypeError, ValueError)) + assert retry_idx_list[0] == 3 + + delay_func = functools.partial(func, delay=0.5) + with pytest.raises(ValueError): + retry_idx_list[0] = 0 + utils.call_with_retry(delay_func, retry_times=None, retry_timeout=0.7) + assert retry_idx_list[0] == 2 + + retry_idx_list[0] = 0 + utils.call_with_retry(delay_func, retry_times=None, retry_timeout=2.2) + assert retry_idx_list[0] == 3 diff --git a/odps/tunnel/__init__.py b/odps/tunnel/__init__.py index b01f703e..22208b53 100644 --- a/odps/tunnel/__init__.py +++ b/odps/tunnel/__init__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,15 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +from .instancetunnel import InstanceDownloadSession, InstanceTunnel from .io import CompressOption -from .tabletunnel import TableTunnel, TableUploadSession, TableDownloadSession +from .tabletunnel import TableDownloadSession, TableTunnel, TableUploadSession from .volumetunnel import ( - VolumeTunnel, + VolumeDownloadSession, VolumeFSTunnel, + VolumeTunnel, VolumeUploadSession, - VolumeDownloadSession, ) -from .instancetunnel import InstanceTunnel, InstanceDownloadSession TableUploadStatus = TableUploadSession.Status TableDownloadStatus = TableDownloadSession.Status diff --git a/odps/tunnel/base.py b/odps/tunnel/base.py index a217e27a..cca8fe42 100644 --- a/odps/tunnel/base.py +++ b/odps/tunnel/base.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -26,11 +26,13 @@ class BaseTunnel(object): - def __init__(self, odps=None, client=None, project=None, endpoint=None, quota_name=None): + def __init__( + self, odps=None, client=None, project=None, endpoint=None, quota_name=None + ): self._client = odps.rest if odps is not None else client self._account = self._client.account if project is None and odps is None: - raise AttributeError('%s requires project parameter.' % type(self).__name__) + raise AttributeError("%s requires project parameter." 
% type(self).__name__) if isinstance(project, six.string_types): if odps is not None: self._project = odps.get_project(project or odps.project) @@ -45,7 +47,9 @@ def __init__(self, odps=None, client=None, project=None, endpoint=None, quota_na if quota_name is not None: self._endpoint = endpoint else: - self._endpoint = endpoint or self._project._tunnel_endpoint or options.tunnel.endpoint + self._endpoint = ( + endpoint or self._project._tunnel_endpoint or options.tunnel.endpoint + ) self._tunnel_rest = None @property @@ -58,23 +62,23 @@ def quota_name(self): def _get_tunnel_server(self, project): protocol = urlparse(self._client.endpoint).scheme - if protocol is None or protocol not in ('http', 'https'): + if protocol is None or protocol not in ("http", "https"): raise TunnelError("Invalid protocol: %s" % protocol) ep_cache_key = (self._client.endpoint, project.name, self._quota_name) if ep_cache_key in _endpoint_cache: return _endpoint_cache[ep_cache_key] - url = '/'.join([project.resource().rstrip('/'), 'tunnel']) + url = "/".join([project.resource().rstrip("/"), "tunnel"]) params = {} if self._quota_name: params["quotaName"] = self._quota_name - resp = self._client.get(url, action='service', params=params) + resp = self._client.get(url, action="service", params=params) if self._client.is_ok(resp): addr = resp.text server_ep = _endpoint_cache[ep_cache_key] = urlparse( - '%s://%s' % (protocol, addr) + "%s://%s" % (protocol, addr) ).geturl() return server_ep else: @@ -87,12 +91,14 @@ def tunnel_rest(self): kw = dict(tag="TUNNEL") if options.data_proxy is not None: - kw['proxy'] = options.data_proxy - if getattr(self._client, 'app_account', None) is not None: - kw['app_account'] = self._client.app_account + kw["proxy"] = options.data_proxy + if getattr(self._client, "app_account", None) is not None: + kw["app_account"] = self._client.app_account endpoint = self._endpoint if endpoint is None: endpoint = self._get_tunnel_server(self._project) - self._tunnel_rest = RestClient(self._account, endpoint, self._client.project, **kw) + self._tunnel_rest = RestClient( + self._account, endpoint, self._client.project, **kw + ) return self._tunnel_rest diff --git a/odps/tunnel/checksum.py b/odps/tunnel/checksum.py index 95e2ee04..a2ed6566 100644 --- a/odps/tunnel/checksum.py +++ b/odps/tunnel/checksum.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,25 +17,25 @@ import struct -from ..crc import Crc32c, Crc32 from .. 
import utils +from ..crc import Crc32, Crc32c class Checksum(object): TRUE = bytearray([1]) FALSE = bytearray([0]) - - def __init__(self, method='crc32c'): - self.crc = Crc32c() if method.lower() == 'crc32c' else Crc32() + + def __init__(self, method="crc32c"): + self.crc = Crc32c() if method.lower() == "crc32c" else Crc32() def _mode(self): # use for UT to check if use c extension try: from ..src.crc32c_c import Crc32c - return 'c' if isinstance(self.crc, Crc32c) else 'py' + return "c" if isinstance(self.crc, Crc32c) else "py" except ImportError: - return 'py' + return "py" def update_bool(self, val): assert isinstance(val, bool) @@ -44,31 +44,31 @@ def update_bool(self, val): self._update(val) def update_int(self, val): - val = struct.pack('&retval, 1) @@ -39,31 +39,31 @@ cdef class Checksum: cpdef update_bool(self, bint val): self.c_update_bool(val) - cdef void c_update_int(self, int32_t val) nogil: + cdef void c_update_int(self, int32_t val) noexcept nogil: self.c_update(&val, sizeof(int32_t)) cpdef update_int(self, int32_t val): self.c_update(&val, sizeof(int32_t)) - cdef void c_update_long(self, int64_t val) nogil: + cdef void c_update_long(self, int64_t val) noexcept nogil: self.c_update(&val, sizeof(int64_t)) cpdef update_long(self, int64_t val): self.c_update(&val, sizeof(int64_t)) - cdef void c_update_float(self, float val) nogil: + cdef void c_update_float(self, float val) noexcept nogil: self.c_update(&val, sizeof(float)) cpdef update_float(self, float val): self.c_update(&val, sizeof(float)) - cdef void c_update_double(self, double val) nogil: + cdef void c_update_double(self, double val) noexcept nogil: self.c_update(&val, sizeof(double)) cpdef update_double(self, double val): self.c_update(&val, sizeof(double)) - cdef void c_update(self, const char *ptr, size_t length) nogil: + cdef void c_update(self, const char *ptr, size_t length) noexcept nogil: if self.use_c: self._checksum = crc32c(self._checksum, ptr, length) else: diff --git a/odps/tunnel/errors.py b/odps/tunnel/errors.py index b373e6a9..931e9240 100644 --- a/odps/tunnel/errors.py +++ b/odps/tunnel/errors.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -18,29 +18,29 @@ import requests -from ..compat import ElementTree as ET, TimeoutError +from ..compat import ElementTree as ET +from ..compat import TimeoutError from ..errors import ODPSError class TunnelError(ODPSError): - @classmethod def parse(cls, resp): try: root = ET.fromstring(resp.content) - code = root.find('./Code').text - msg = root.find('./Message').text - request_id = root.find('./RequestId') + code = root.find("./Code").text + msg = root.find("./Message").text + request_id = root.find("./RequestId") if request_id: request_id = request_id.text else: - request_id = resp.headers.get('x-odps-request-id') + request_id = resp.headers.get("x-odps-request-id") exc_type = globals().get(code, TunnelError) except: - request_id = resp.headers['x-odps-request-id'] + request_id = resp.headers["x-odps-request-id"] obj = json.loads(resp.content) - msg = obj['Message'] - code = obj['InvalidArgument'] + msg = obj["Message"] + code = obj["InvalidArgument"] exc_type = globals().get(code, TunnelError) return exc_type(msg, code=code, request_id=request_id) diff --git a/odps/tunnel/hasher.py b/odps/tunnel/hasher.py index adea420a..82966954 100644 --- a/odps/tunnel/hasher.py +++ b/odps/tunnel/hasher.py @@ -10,8 +10,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import ctypes import calendar +import ctypes import struct from .. import types @@ -46,11 +46,11 @@ def hash_string(self, val): class DefaultHasher(AbstractHasher): def hash_bigint(self, val): val = (~val) + ctypes.c_int64(val << 18).value - val ^= (val >> 31) + val ^= val >> 31 val = ctypes.c_int64(val * 21).value - val ^= (val >> 11) + val ^= val >> 11 val += ctypes.c_int64(val << 6).value - val ^= (val >> 22) + val ^= val >> 22 return ctypes.c_int32(val).value def hash_float(self, val): @@ -62,9 +62,9 @@ def hash_double(self, val): def hash_bool(self, val): # it is a magic number if val: - return 0x172ba9c7 + return 0x172BA9C7 else: - return -0x3a59cb12 + return -0x3A59CB12 def hash_string(self, val): val = to_binary(val) @@ -92,9 +92,9 @@ def hash_double(self, val): def hash_bool(self, val): # it is a magic number if val: - return 0x172ba9c7 + return 0x172BA9C7 else: - return -0x3a59cb12 + return -0x3A59CB12 def hash_string(self, val): val = to_binary(val) @@ -160,13 +160,9 @@ def __init__(self, schema, hasher_type, hash_keys): for col_name in hash_keys: col = self._schema.get_column(col_name) if col.type in _type_to_hash_fun: - self._column_hash_appenders.append( - _type_to_hash_fun[col.type] - ) + self._column_hash_appenders.append(_type_to_hash_fun[col.type]) elif isinstance(col.type, (types.Char, types.Varchar)): - self._column_hash_appenders.append( - _type_to_hash_fun[types.string] - ) + self._column_hash_appenders.append(_type_to_hash_fun[types.string]) else: raise TypeError("Hash for type %s not supported" % col.type) diff --git a/odps/tunnel/hasher_c.pxd b/odps/tunnel/hasher_c.pxd index 36624a9a..feaf6dbf 100644 --- a/odps/tunnel/hasher_c.pxd +++ b/odps/tunnel/hasher_c.pxd @@ -6,7 +6,7 @@ from ..src.utils_c cimport CMillisecondsConverter cdef class AbstractHasher: - cdef int32_t c_hash_bigint(self, int64_t val) nogil + cdef int32_t c_hash_bigint(self, int64_t val) noexcept nogil cdef int32_t c_hash_float(self, float val) nogil cdef int32_t c_hash_double(self, double val) nogil cdef int32_t c_hash_bool(self, bint val) nogil @@ -14,7 +14,7 @@ cdef class AbstractHasher: cdef class DefaultHasher(AbstractHasher): - cdef int32_t c_hash_bigint(self, int64_t val) nogil + 
cdef int32_t c_hash_bigint(self, int64_t val) noexcept nogil cdef int32_t c_hash_float(self, float val) nogil cdef int32_t c_hash_double(self, double val) nogil cdef int32_t c_hash_bool(self, bint val) nogil @@ -22,7 +22,7 @@ cdef class DefaultHasher(AbstractHasher): cdef class LegacyHasher(AbstractHasher): - cdef int32_t c_hash_bigint(self, int64_t val) nogil + cdef int32_t c_hash_bigint(self, int64_t val) noexcept nogil cdef int32_t c_hash_float(self, float val) nogil cdef int32_t c_hash_double(self, double val) nogil cdef int32_t c_hash_bool(self, bint val) nogil diff --git a/odps/tunnel/hasher_c.pyx b/odps/tunnel/hasher_c.pyx index 134d2711..4f890ad1 100644 --- a/odps/tunnel/hasher_c.pyx +++ b/odps/tunnel/hasher_c.pyx @@ -11,10 +11,12 @@ # limitations under the License. import calendar -from libc.stdint cimport * + from cpython.datetime cimport import_datetime +from libc.stdint cimport * from .. import types + from ..src.types_c cimport BaseRecord from ..src.utils_c cimport CMillisecondsConverter @@ -38,7 +40,7 @@ import_datetime() cdef class AbstractHasher: - cdef int32_t c_hash_bigint(self, int64_t val) nogil: + cdef int32_t c_hash_bigint(self, int64_t val) noexcept nogil: return 0 def hash_bigint(self, int64_t val): @@ -77,7 +79,7 @@ cdef class AbstractHasher: cdef class DefaultHasher(AbstractHasher): - cdef int32_t c_hash_bigint(self, int64_t val) nogil: + cdef int32_t c_hash_bigint(self, int64_t val) noexcept nogil: val = (~val) + (val << 18) val ^= val >> 31 val *= 21 @@ -106,7 +108,7 @@ cdef class DefaultHasher(AbstractHasher): cdef class LegacyHasher(AbstractHasher): - cdef int32_t c_hash_bigint(self, int64_t val) nogil: + cdef int32_t c_hash_bigint(self, int64_t val) noexcept nogil: return (val >> 32) ^ val cdef int32_t c_hash_bool(self, bint val) nogil: @@ -238,8 +240,8 @@ cpdef int32_t hash_value(hasher_type, data_type, value): """Simple hash function for test purpose""" cdef RecordHasher rec_hasher - from ..types import Column, OdpsSchema from ..models import Record + from ..types import Column, OdpsSchema schema = OdpsSchema([Column("col", data_type)]) record = Record(schema=schema, values=[value]) diff --git a/odps/tunnel/instancetunnel.py b/odps/tunnel/instancetunnel.py index c6210334..e7279613 100644 --- a/odps/tunnel/instancetunnel.py +++ b/odps/tunnel/instancetunnel.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,14 +19,14 @@ import requests -from .base import BaseTunnel, TUNNEL_VERSION -from .io.reader import TunnelRecordReader, TunnelArrowReader -from .io.stream import CompressOption, get_decompress_stream -from .errors import TunnelError from .. 
import serializers, types from ..compat import Enum, six from ..config import options from ..models import Projects, TableSchema +from .base import TUNNEL_VERSION, BaseTunnel +from .errors import TunnelError +from .io.reader import TunnelArrowReader, TunnelRecordReader +from .io.stream import CompressOption, get_decompress_stream try: import numpy as np @@ -38,45 +38,67 @@ class InstanceDownloadSession(serializers.JSONSerializableModel): __slots__ = ( - '_client', '_instance', '_limit_enabled', '_compress_option', '_sessional', - '_session_task_name', '_session_subquery_id', '_quota_name', '_timeout' + "_client", + "_instance", + "_limit_enabled", + "_compress_option", + "_sessional", + "_session_task_name", + "_session_subquery_id", + "_quota_name", + "_timeout", ) class Status(Enum): - Unknown = 'UNKNOWN' - Normal = 'NORMAL' - Closes = 'CLOSES' - Expired = 'EXPIRED' - Failed = 'FAILED' - Initiating = 'INITIATING' - - id = serializers.JSONNodeField('DownloadID') + Unknown = "UNKNOWN" + Normal = "NORMAL" + Closes = "CLOSES" + Expired = "EXPIRED" + Failed = "FAILED" + Initiating = "INITIATING" + + id = serializers.JSONNodeField("DownloadID") status = serializers.JSONNodeField( - 'Status', parse_callback=lambda s: InstanceDownloadSession.Status(s.upper())) - count = serializers.JSONNodeField('RecordCount') - schema = serializers.JSONNodeReferenceField(TableSchema, 'Schema') - quota_name = serializers.JSONNodeField('QuotaName') + "Status", parse_callback=lambda s: InstanceDownloadSession.Status(s.upper()) + ) + count = serializers.JSONNodeField("RecordCount") + schema = serializers.JSONNodeReferenceField(TableSchema, "Schema") + quota_name = serializers.JSONNodeField("QuotaName") - def __init__(self, client, instance, download_id=None, limit=None, - compress_option=None, quota_name=None, timeout=None, **kw): + def __init__( + self, + client, + instance, + download_id=None, + limit=None, + compress_option=None, + quota_name=None, + timeout=None, + tags=None, + **kw + ): super(InstanceDownloadSession, self).__init__() self._client = client self._instance = instance - self._limit_enabled = limit if limit is not None else kw.get('limit_enabled', False) + self._limit_enabled = ( + limit if limit is not None else kw.get("limit_enabled", False) + ) self._quota_name = quota_name self._sessional = kw.pop("sessional", False) self._session_task_name = kw.pop("session_task_name", "") self._session_subquery_id = int(kw.pop("session_subquery_id", -1)) self._timeout = timeout - if self._sessional and ((not self._session_task_name) or (self._session_subquery_id == -1)): + if self._sessional and ( + (not self._session_task_name) or (self._session_subquery_id == -1) + ): raise TunnelError( "Taskname('session_task_name') and Subquery ID ('session_subquery_id') " "keyword argument must be provided for session instance tunnels." 
) if download_id is None: - self._init() + self._init(tags=tags or options.tunnel.tags) else: self.id = download_id self.reload() @@ -87,30 +109,40 @@ def __init__(self, client, instance, download_id=None, limit=None, options.tunnel_session_create_callback(self) def __repr__(self): - return "" % ( + return "" % ( self.id, self._instance.project.name, self._instance.id, + " limited" if self._limit_enabled else "", ) - def _init(self): + def _init(self, tags=None): params = {} headers = { - 'Content-Length': 0, - 'x-odps-tunnel-version': TUNNEL_VERSION, + "Content-Length": 0, + "x-odps-tunnel-version": TUNNEL_VERSION, } + if tags: + if isinstance(tags, six.string_types): + tags = tags.split(",") + headers["odps-tunnel-tags"] = ",".join(tags) if self._quota_name is not None: - params['quotaName'] = self._quota_name + params["quotaName"] = self._quota_name # Now we use DirectDownloadMode to fetch session results(any other method is removed) # This mode, only one request needed. So we don't have to send request here .. if not self._sessional: if self._limit_enabled: - params['instance_tunnel_limit_enabled'] = '' + params["instance_tunnel_limit_enabled"] = "" url = self._instance.resource() try: resp = self._client.post( - url, {}, action='downloads', params=params, headers=headers, timeout=self._timeout + url, + {}, + action="downloads", + params=params, + headers=headers, + timeout=self._timeout, ) except requests.exceptions.ReadTimeout: if callable(options.tunnel_session_create_timeout_callback): @@ -126,16 +158,16 @@ def _init(self): def reload(self): if not self._sessional: - params = {'downloadid': self.id} + params = {"downloadid": self.id} if self._quota_name is not None: - params['quotaName'] = self._quota_name + params["quotaName"] = self._quota_name headers = { - 'Content-Length': 0, - 'x-odps-tunnel-version': TUNNEL_VERSION, + "Content-Length": 0, + "x-odps-tunnel-version": TUNNEL_VERSION, } if self._sessional: - params['cached'] = '' - params['taskname'] = self._session_task_name + params["cached"] = "" + params["taskname"] = self._session_task_name url = self._instance.resource() resp = self._client.get(url, params=params, headers=headers) @@ -149,32 +181,34 @@ def reload(self): else: self.status = InstanceDownloadSession.Status.Normal - def _open_reader(self, start, count, compress=False, columns=None, arrow=False, reader_cls=None): + def _build_input_stream( + self, start, count, compress=False, columns=None, arrow=False + ): compress_option = self._compress_option or CompressOption() params = {} - headers = {'x-odps-tunnel-version': TUNNEL_VERSION} + headers = {"x-odps-tunnel-version": TUNNEL_VERSION} if self._quota_name is not None: - params['quotaName'] = self._quota_name + params["quotaName"] = self._quota_name if self._sessional: - params['cached'] = '' - params['taskname'] = self._session_task_name - params['queryid'] = str(self._session_subquery_id) + params["cached"] = "" + params["taskname"] = self._session_task_name + params["queryid"] = str(self._session_subquery_id) else: - params['downloadid'] = self.id - params['rowrange'] = '(%s,%s)' % (start, count) - headers['Content-Length'] = 0 + params["downloadid"] = self.id + params["rowrange"] = "(%s,%s)" % (start, count) + headers["Content-Length"] = 0 if compress: encoding = compress_option.algorithm.get_encoding() if encoding is not None: - headers['Accept-Encoding'] = encoding - params['data'] = '' + headers["Accept-Encoding"] = encoding + params["data"] = "" if columns is not None and len(columns) > 0: col_name = 
lambda col: col.name if isinstance(col, types.Column) else col - params['columns'] = ','.join(col_name(col) for col in columns) + params["columns"] = ",".join(col_name(col) for col in columns) if arrow: - params['arrow'] = '' + params["arrow"] = "" url = self._instance.resource() resp = self._client.get(url, stream=True, params=params, headers=headers) @@ -185,55 +219,98 @@ def _open_reader(self, start, count, compress=False, columns=None, arrow=False, if self._sessional: # in DirectDownloadMode, the schema is brought back in HEADER. # handle this. - schema_json = resp.headers.get('odps-tunnel-schema') + schema_json = resp.headers.get("odps-tunnel-schema") self.schema = TableSchema() self.schema = self.schema.deserial(schema_json) - content_encoding = resp.headers.get('Content-Encoding') + content_encoding = resp.headers.get("Content-Encoding") if content_encoding is not None: - compress_algo = CompressOption.CompressAlgorithm.from_encoding(content_encoding) + compress_algo = CompressOption.CompressAlgorithm.from_encoding( + content_encoding + ) if compress_algo != compress_option.algorithm: - compress_option = self._compress_option = CompressOption(compress_algo, -1, 0) + compress_option = self._compress_option = CompressOption( + compress_algo, -1, 0 + ) compress = True else: compress = False option = compress_option if compress else None - input_stream = get_decompress_stream(resp, option) - return reader_cls(self.schema, input_stream, columns=columns) + return get_decompress_stream(resp, option) + + def _open_reader( + self, start, count, compress=False, columns=None, arrow=False, reader_cls=None + ): + stream_kw = dict(compress=compress, columns=columns, arrow=arrow) + initial_stream_cache = [None] + + def stream_creator(cursor, cache=False): + if cursor == 0 and initial_stream_cache[0] is not None: + initial_stream_cache[0], stream = None, initial_stream_cache[0] + return stream + attempt_count = count - cursor if count is not None else None + stream = self._build_input_stream( + start + cursor, attempt_count, **stream_kw + ) + if cache: + initial_stream_cache[0] = stream + return stream - def open_record_reader(self, start, count, compress=False, columns=None): + # for MCQA we must obtain schema from the first stream, hence the first reader + # must be created beforehand and then cached for the reader class + stream_creator(0, True) + return reader_cls(self.schema, stream_creator, columns=columns) + + def open_record_reader(self, start, count, compress=False, columns=None, **_): return self._open_reader( - start, count, compress=compress, columns=columns, reader_cls=TunnelRecordReader + start, + count, + compress=compress, + columns=columns, + reader_cls=TunnelRecordReader, ) - def open_arrow_reader(self, start, count, compress=False, columns=None): + def open_arrow_reader(self, start, count, compress=False, columns=None, **_): return self._open_reader( - start, count, compress=compress, columns=columns, arrow=True, reader_cls=TunnelArrowReader + start, + count, + compress=compress, + columns=columns, + arrow=True, + reader_cls=TunnelArrowReader, ) - if np is not None: - def open_pandas_reader(self, start, count, compress=False, columns=None): - from .pdio.pdreader_c import TunnelPandasReader - return self._open_reader( - start, count, compress=compress, columns=columns, reader_cls=TunnelPandasReader - ) - class InstanceTunnel(BaseTunnel): - def create_download_session(self, instance, download_id=None, limit=None, compress_option=None, - compress_algo=None, compress_level=None, 
compress_strategy=None, - timeout=None, **kw): + def create_download_session( + self, + instance, + download_id=None, + limit=None, + compress_option=None, + compress_algo=None, + compress_level=None, + compress_strategy=None, + timeout=None, + tags=None, + **kw + ): if not isinstance(instance, six.string_types): instance = instance.id - instance = Projects(client=self.tunnel_rest)[self._project.name].instances[instance] + instance = Projects(client=self.tunnel_rest)[self._project.name].instances[ + instance + ] compress_option = compress_option if compress_option is None and compress_algo is not None: compress_option = CompressOption( - compress_algo=compress_algo, level=compress_level, strategy=compress_strategy) + compress_algo=compress_algo, + level=compress_level, + strategy=compress_strategy, + ) if limit is None: - limit = kw.get('limit_enabled', False) + limit = kw.get("limit_enabled", False) return InstanceDownloadSession( self.tunnel_rest, instance, @@ -242,5 +319,6 @@ def create_download_session(self, instance, download_id=None, limit=None, compre compress_option=compress_option, quota_name=self._quota_name, timeout=timeout, + tags=tags, **kw ) diff --git a/odps/tunnel/io/reader.py b/odps/tunnel/io/reader.py index c489b031..386cc788 100644 --- a/odps/tunnel/io/reader.py +++ b/odps/tunnel/io/reader.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,8 +21,9 @@ import warnings from collections import OrderedDict from decimal import Decimal -from io import IOBase, BytesIO, StringIO +from io import BytesIO, IOBase, StringIO +from ...errors import ChecksumError from ...types import PartitionSpec try: @@ -37,16 +38,19 @@ import pyarrow as pa except (AttributeError, ImportError): pa = None +try: + import pyarrow.compute as pac +except (AttributeError, ImportError): + pac = None - -from ... import utils, types, compat +from ... 
import compat, types, utils +from ...config import options from ...errors import DatetimeOverflowError from ...models import Record from ...readers import AbstractRecordReader -from ...config import options +from ..checksum import Checksum from ..pb.decoder import Decoder from ..pb.errors import DecodeError -from ..checksum import Checksum from ..wireconstants import ProtoWireConstants from .types import odps_schema_to_arrow_schema @@ -61,28 +65,45 @@ TunnelRecordReader = convert_legacy_decimal_bytes = None if TunnelRecordReader is None: - class TunnelRecordReader(AbstractRecordReader): - def __init__(self, schema, input_stream, columns=None, partition_spec=None): + class TunnelRecordReader(AbstractRecordReader): + def __init__( + self, + schema, + stream_creator, + columns=None, + partition_spec=None, + append_partitions=False, + ): self._schema = schema if columns is None: - self._columns = self._schema.columns + self._columns = ( + self._schema.columns + if append_partitions + else self._schema.simple_columns + ) else: self._columns = [self._schema[c] for c in columns] - self._reader = Decoder(input_stream) - self._crc = Checksum() - self._crccrc = Checksum() + self._curr_cursor = 0 + self._stream_creator = stream_creator + self._reopen_reader() + self._read_limit = options.table_read_limit self._to_datetime = utils.MillisecondsConverter().from_milliseconds self._to_datetime_utc = utils.MillisecondsConverter( local_tz=False ).from_milliseconds self._to_date = utils.to_date - self._partition_spec = PartitionSpec(partition_spec) if partition_spec else None + self._partition_spec = ( + PartitionSpec(partition_spec) if partition_spec else None + ) + self._append_partitions = append_partitions + + self._injected_error = None def _mode(self): - return 'py' + return "py" @property def count(self): @@ -92,6 +113,16 @@ def count(self): def schema(self): return self._schema + def _inject_error(self, cursor, exc): + self._injected_error = (cursor, exc) + + def _reopen_reader(self): + self._last_n_bytes = len(self._reader) if self._curr_cursor != 0 else 0 + self._reader = Decoder(self._stream_creator(self._curr_cursor)) + self._crc = Checksum() + self._crccrc = Checksum() + self._attempt_row_count = 0 + def _read_field(self, data_type): if data_type == types.float_: val = self._reader.read_float() @@ -125,15 +156,23 @@ def _read_field(self, data_type): self._crc.update_long(val) val = self._to_date(val) elif data_type == types.timestamp or data_type == types.timestamp_ntz: - to_datetime = self._to_datetime_utc if data_type == types.timestamp_ntz else self._to_datetime + to_datetime = ( + self._to_datetime_utc + if data_type == types.timestamp_ntz + else self._to_datetime + ) l_val = self._reader.read_sint64() self._crc.update_long(l_val) nano_secs = self._reader.read_sint32() self._crc.update_int(nano_secs) if pd is None: - raise ImportError('To use TIMESTAMP in pyodps, you need to install pandas.') + raise ImportError( + "To use TIMESTAMP in pyodps, you need to install pandas." 
+ ) try: - val = pd.Timestamp(to_datetime(l_val * 1000)) + pd.Timedelta(nanoseconds=nano_secs) + val = pd.Timestamp(to_datetime(l_val * 1000)) + pd.Timedelta( + nanoseconds=nano_secs + ) except DatetimeOverflowError: if not options.tunnel.overflow_date_as_none: raise @@ -144,7 +183,9 @@ def _read_field(self, data_type): nano_secs = self._reader.read_sint32() self._crc.update_int(nano_secs) if pd is None: - raise ImportError('To use INTERVAL_DAY_TIME in pyodps, you need to install pandas.') + raise ImportError( + "To use INTERVAL_DAY_TIME in pyodps, you need to install pandas." + ) val = pd.Timedelta(seconds=l_val, nanoseconds=nano_secs) elif data_type == types.interval_year_month: l_val = self._reader.read_sint64() @@ -169,7 +210,7 @@ def _read_field(self, data_type): elif isinstance(data_type, types.Struct): val = self._read_struct(data_type) else: - raise IOError('Unsupported type %s' % data_type) + raise IOError("Unsupported type %s" % data_type) return val def _read_array(self, value_type): @@ -194,10 +235,17 @@ def _read_struct(self, value_type): else: return value_type.namedtuple_type(*res_list) - def read(self): + def _read_single_record(self): + if ( + self._injected_error is not None + and self._injected_error[0] == self._curr_cursor + ): + self._injected_error = None + raise self._injected_error[1] + if self._read_limit is not None and self.count >= self._read_limit: warnings.warn( - 'Number of lines read via tunnel already reaches the limitation.', + "Number of lines read via tunnel already reaches the limitation.", RuntimeWarning, ) return None @@ -212,32 +260,34 @@ def read(self): if index == ProtoWireConstants.TUNNEL_END_RECORD: checksum = utils.long_to_int(self._crc.getvalue()) if int(self._reader.read_uint32()) != utils.int_to_uint(checksum): - raise IOError('Checksum invalid') + raise ChecksumError("Checksum invalid") self._crc.reset() self._crccrc.update_int(checksum) break if index == ProtoWireConstants.TUNNEL_META_COUNT: - if self.count != self._reader.read_sint64(): - raise IOError('count does not match') + if self._attempt_row_count != self._reader.read_sint64(): + raise IOError("count does not match") idx_of_checksum, _ = self._reader.read_field_number_and_wire_type() if ProtoWireConstants.TUNNEL_META_CHECKSUM != idx_of_checksum: - raise IOError('Invalid stream data.') + raise IOError("Invalid stream data.") if int(self._crccrc.getvalue()) != self._reader.read_uint32(): - raise IOError('Checksum invalid.') + raise ChecksumError("Checksum invalid.") return if index > len(self._columns): - raise IOError('Invalid protobuf tag. Perhaps the datastream ' - 'from server is crushed.') + raise IOError( + "Invalid protobuf tag. Perhaps the datastream " + "from server is crushed." 
+ ) self._crc.update_int(index) i = index - 1 record[i] = self._read_field(self._columns[i].type) - if self._partition_spec is not None: + if self._append_partitions and self._partition_spec is not None: for k, v in self._partition_spec.items(): try: record[k] = v @@ -246,8 +296,14 @@ def read(self): pass self._curr_cursor += 1 + self._attempt_row_count += 1 return record + def read(self): + return utils.call_with_retry( + self._read_single_record, reset_func=self._reopen_reader + ) + def __next__(self): record = self.read() if record is None: @@ -261,13 +317,13 @@ def reads(self): @property def n_bytes(self): - return len(self._reader) + return self._last_n_bytes + len(self._reader) def get_total_bytes(self): return self.n_bytes def close(self): - if hasattr(self._schema, 'close'): + if hasattr(self._schema, "close"): self._schema.close() def __enter__(self): @@ -287,16 +343,17 @@ def __init__(self, raw_reader, arrow_schema): self._chunk_size = None self._buffers = collections.deque() - self._buffers.append( - BytesIO(arrow_schema.serialize().to_pybytes()) - ) + self._buffers.append(BytesIO(arrow_schema.serialize().to_pybytes())) + + def __len__(self): + return len(self._reader) def readable(self): return True @staticmethod def _read_unint32(b): - return struct.unpack('!I', b) + return struct.unpack("!I", b) def _read_chunk_size(self): try: @@ -310,7 +367,7 @@ def _read_chunk(self): read_size = self._chunk_size + 4 b = self._reader.read(read_size) if 0 < len(b) < 4: - raise IOError('Checksum invalid') + raise ChecksumError("Checksum invalid") self._pos += len(b) self._crc.update(b[:-4]) self._crccrc.update(b[:-4]) @@ -331,7 +388,7 @@ def _fill_next_buffer(self): read_checksum = self._read_unint32(crc_data)[0] checksum = int(self._crccrc.getvalue()) if checksum != read_checksum: - raise IOError('Checksum invalid') + raise ChecksumError("Checksum invalid") self._pos += len(data) + 4 self._buffers.append(BytesIO(data)) self._crccrc.reset() @@ -339,7 +396,7 @@ def _fill_next_buffer(self): checksum = int(self._crc.getvalue()) read_checksum = self._read_unint32(crc_data)[0] if checksum != read_checksum: - raise IOError('Checksum invalid') + raise ChecksumError("Checksum invalid") self._crc.reset() self._buffers.append(BytesIO(data)) @@ -361,36 +418,48 @@ def read(self, nbytes=None): tot_size += len(buf) if len(bufs) == 1: return bufs[0] - return b''.join(bufs) + return b"".join(bufs) def close(self): - if hasattr(self._reader, 'close'): + if hasattr(self._reader, "close"): self._reader.close() class TunnelArrowReader(object): def __init__( - self, schema, input_stream, columns=None, use_ipc_stream=False + self, + schema, + stream_creator, + columns=None, + partition_spec=None, + append_partitions=False, + use_ipc_stream=False, ): if pa is None: raise ValueError("To use arrow reader you need to install pyarrow") - self._schema = schema - self._columns = columns + self._raw_schema = schema - arrow_schema = odps_schema_to_arrow_schema(schema) + raw_arrow_schema = odps_schema_to_arrow_schema(schema) if columns is None: - self._arrow_schema = arrow_schema + self._schema = schema + self._arrow_schema = self._raw_arrow_schema = raw_arrow_schema else: - self._arrow_schema = pa.schema([s for s in arrow_schema if s.name in columns]) + self._schema = types.OdpsSchema([schema[c] for c in columns]) + self._raw_arrow_schema = pa.schema( + [s for s in raw_arrow_schema if s.name in columns] + ) + self._arrow_schema = odps_schema_to_arrow_schema(self._schema) + self._columns = columns - if use_ipc_stream: - 
self._reader = input_stream - else: - self._reader = ArrowStreamReader(input_stream, self._arrow_schema) + self._append_partitions = append_partitions + self._partition_spec = partition_spec self._pos = 0 - self._arrow_stream = None + self._stream_creator = stream_creator + self._use_ipc_stream = use_ipc_stream + self._reopen_reader() + self._to_datetime = utils.MillisecondsConverter().from_milliseconds self._read_limit = options.table_read_limit @@ -403,20 +472,35 @@ def __init__( col.type, convert_ts=False ) + self._injected_error = None + + def _reopen_reader(self): + self._last_n_bytes = len(self._reader) if self._pos != 0 else 0 + input_stream = self._stream_creator(self._pos) + self._arrow_stream = None + if self._use_ipc_stream: + self._reader = input_stream + else: + self._reader = ArrowStreamReader(input_stream, self._raw_arrow_schema) + + def _inject_error(self, cursor, exc): + self._injected_error = (cursor, exc) + @property def schema(self): return self._schema - def read_next_batch(self): - if self._reader is None: - return None + def _read_next_raw_batch(self): + if self._injected_error is not None and self._injected_error[0] <= self._pos: + self._injected_error = None + raise self._injected_error[1] if self._arrow_stream is None: self._arrow_stream = pa.ipc.open_stream(self._reader) if self._read_limit is not None and self._pos >= self._read_limit: warnings.warn( - 'Number of lines read via tunnel already reaches the limitation.', + "Number of lines read via tunnel already reaches the limitation.", RuntimeWarning, ) return None @@ -424,30 +508,96 @@ def read_next_batch(self): try: batch = self._arrow_stream.read_next_batch() self._pos += batch.num_rows + except pa.ArrowTypeError as ex: + if str(ex) != "Did not pass numpy.dtype object": + raise + else: + raise pa.ArrowTypeError( + "Error caused by version mismatch. Try install numpy<1.20 or " + "upgrade your pyarrow version. 
Original message: " + str(ex) + ) except StopIteration: return None + return batch - col_names = self._columns or batch.schema.names - col_to_array = dict() - col_name_set = set(col_names) + def _convert_timezone(self, batch): + from ...lib import tzlocal - for name, arr in zip(batch.schema.names, batch.columns): - if name not in col_name_set: + if not any(isinstance(tp, pa.TimestampType) for tp in batch.schema.types): + return batch + + timezone = raw_timezone = options.local_timezone + if timezone is True or timezone is None: + timezone = str(tzlocal.get_localzone()) + + cols = [] + for idx in range(batch.num_columns): + col = batch.column(idx) + name = batch.schema.names[idx] + if not isinstance(col.type, pa.TimestampType): + cols.append(col) continue - if arr.type == pa.timestamp('ms'): - col_to_array[name] = np.vectorize(self._to_datetime)( - arr.cast('int64').to_numpy() - ) - elif arr.type == pa.timestamp('ns'): - col_to_array[name] = arr.to_pandas().map( - lambda x: pd.Timestamp( - self._to_datetime(x.timestamp() * 1000) - ).replace(microsecond=x.microsecond, nanosecond=x.nanosecond) - ) + + if timezone is False or self._schema[name].type == types.timestamp_ntz: + col = col.cast(pa.timestamp(col.type.unit)) else: + col = col.cast(pa.timestamp(col.type.unit, timezone)) + if raw_timezone is None or raw_timezone is True: + if hasattr(pac, "local_timestamp"): + col = pac.local_timestamp(col) + else: + col = pa.Array.from_pandas( + col.to_pandas().dt.tz_localize(None) + ).cast(pa.timestamp(col.type.unit)) + cols.append(col) + + return pa.RecordBatch.from_arrays(cols, names=batch.schema.names) + + def _append_partition_cols(self, batch): + col_set = set(self._columns or [c.name for c in self._schema.columns]) + pt_obj = ( + types.PartitionSpec(self._partition_spec) if self._partition_spec else None + ) + + sel_col_set = set(self._columns or []) + if pt_obj and any(c in sel_col_set for c in pt_obj.keys()): + # append partitions selected in columns argument + self._append_partitions = True + if not pt_obj or not self._append_partitions: + return batch + + batch_cols = list(batch.columns) + batch_col_names = list(batch.schema.names) + for key, val in pt_obj.items(): + if key not in col_set: + continue + val = types.validate_value(val, self._schema[key].type) + batch_cols.append(pa.array(np.repeat([val], batch.num_rows))) + batch_col_names.append(key) + return pa.RecordBatch.from_arrays(batch_cols, names=batch_col_names) + + def read_next_batch(self): + if self._reader is None: + return None + + batch = utils.call_with_retry( + self._read_next_raw_batch, reset_func=self._reopen_reader + ) + if batch is None: + return None + + batch = self._append_partition_cols(batch) + if self._columns and self._columns != batch.schema.names: + col_to_array = dict() + col_name_set = set(self._columns) + for name, arr in zip(batch.schema.names, batch.columns): + if name not in col_name_set: + continue col_to_array[name] = arr - arrays = [col_to_array[name] for name in col_names] - return pa.RecordBatch.from_arrays(arrays, names=col_names) + arrays = [col_to_array[name] for name in self._columns] + batch = pa.RecordBatch.from_arrays(arrays, names=self._columns) + batch = self._convert_timezone(batch) + return batch def read(self): batches = [] @@ -471,14 +621,18 @@ def __next__(self): return batch @property - def n_bytes(self): + def count(self): return self._pos + @property + def n_bytes(self): + return self._last_n_bytes + len(self._reader) + def get_total_bytes(self): return self.n_bytes def close(self): - 
if hasattr(self._reader, 'close'): + if hasattr(self._reader, "close"): self._reader.close() def _convert_batch_to_pandas(self, batch): @@ -521,6 +675,7 @@ def __exit__(self, *_): if convert_legacy_decimal_bytes is None: + def convert_legacy_decimal_bytes(value): """ Legacy decimal memory layout: @@ -543,12 +698,12 @@ def convert_legacy_decimal_bytes(value): sio = BytesIO() if compat.PY27 else StringIO() if sign > 0: sio.write("-") - intg_nums = struct.unpack("<%dI" % intg, value[12: 12 + intg * 4]) + intg_nums = struct.unpack("<%dI" % intg, value[12 : 12 + intg * 4]) intg_val = "".join("%09d" % d for d in reversed(intg_nums)).lstrip("0") sio.write(intg_val or "0") if frac > 0: sio.write(".") - frac_nums = struct.unpack("<%dI" % frac, value[12 - frac * 4: 12]) + frac_nums = struct.unpack("<%dI" % frac, value[12 - frac * 4 : 12]) sio.write("".join("%09d" % d for d in reversed(frac_nums))) return Decimal(sio.getvalue()) @@ -619,9 +774,7 @@ def _convert_struct(value, field_converters, tuple_type): if value is None: return None - results = { - k: field_converters[k](v) for k, v in value.items() - } + results = {k: field_converters[k](v) for k, v in value.items()} if tuple_type is not None: return tuple_type(**results) else: @@ -656,14 +809,15 @@ def _build_converter(self, odps_type, arrow_type=None): and not isinstance(arrow_type, arrow_decimal_types) ): return convert_legacy_decimal_bytes - elif ( - isinstance(odps_type, types.IntervalDayTime) - and isinstance(arrow_type, pa.StructType) + elif isinstance(odps_type, types.IntervalDayTime) and isinstance( + arrow_type, pa.StructType ): return self._convert_struct_timedelta elif isinstance(odps_type, types.Array): arrow_value_type = getattr(arrow_type, "value_type", None) - sub_converter = self._build_converter(odps_type.value_type, arrow_value_type) + sub_converter = self._build_converter( + odps_type.value_type, arrow_value_type + ) if sub_converter is _reflective: return _reflective return lambda value: [sub_converter(x) for x in value] @@ -672,7 +826,9 @@ def _build_converter(self, odps_type, arrow_type=None): arrow_value_type = getattr(arrow_type, "item_type", None) key_converter = self._build_converter(odps_type.key_type, arrow_key_type) - value_converter = self._build_converter(odps_type.value_type, arrow_value_type) + value_converter = self._build_converter( + odps_type.value_type, arrow_value_type + ) if key_converter is _reflective and value_converter is _reflective: return OrderedDict @@ -694,7 +850,9 @@ def _build_converter(self, odps_type, arrow_type=None): else: tuple_type = odps_type.namedtuple_type return functools.partial( - self._convert_struct, field_converters=field_converters, tuple_type=tuple_type + self._convert_struct, + field_converters=field_converters, + tuple_type=tuple_type, ) else: return _reflective @@ -752,7 +910,7 @@ def read(self): def to_pandas(self, start=None, count=None, **kw): step = kw.get("step") or 1 - return self._arrow_reader.to_pandas().iloc[start: start + count: step] + return self._arrow_reader.to_pandas().iloc[start : start + count : step] def close(self): self._arrow_reader.close() diff --git a/odps/tunnel/io/reader_c.pxd b/odps/tunnel/io/reader_c.pxd index 7328c60c..b049edb3 100644 --- a/odps/tunnel/io/reader_c.pxd +++ b/odps/tunnel/io/reader_c.pxd @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,9 +19,8 @@ from libcpp.vector cimport vector from ...src.types_c cimport SchemaSnapshot from ...src.utils_c cimport CMillisecondsConverter -from ..pb.decoder_c cimport CDecoder from ..checksum_c cimport Checksum - +from ..pb.decoder_c cimport CDecoder ctypedef int (*_SET_FUNCTION)(BaseTunnelRecordReader self, list record, int i) except? -1 @@ -29,6 +28,7 @@ ctypedef int (*_SET_FUNCTION)(BaseTunnelRecordReader self, list record, int i) e cdef class BaseTunnelRecordReader: cdef public object _schema cdef object _columns + cdef object _stream_creator cdef CMillisecondsConverter _mills_converter cdef CMillisecondsConverter _mills_converter_utc cdef object _to_date @@ -36,6 +36,8 @@ cdef class BaseTunnelRecordReader: cdef Checksum _crc cdef Checksum _crccrc cdef int _curr_cursor + cdef int _attempt_row_count + cdef int _last_n_bytes cdef int _n_columns cdef int _read_limit cdef bint _overflow_date_as_none @@ -43,6 +45,10 @@ cdef class BaseTunnelRecordReader: cdef vector[_SET_FUNCTION] _column_setters cdef SchemaSnapshot _schema_snapshot cdef list _partition_vals + cdef bint _append_partitions + + cdef int _n_injected_error_cursor + cdef object _injected_error_exc cdef object _read_struct(self, object value_type) cdef object _read_element(self, int data_type_id, object data_type) @@ -72,4 +78,5 @@ cdef class BaseTunnelRecordReader: cdef int _set_interval_day_time(self, list record, int i) except? -1 cdef int _set_interval_year_month(self, list record, int i) except? -1 cdef int _set_json(self, list record, int i) except? -1 + cdef _read(self) cpdef read(self) diff --git a/odps/tunnel/io/reader_c.pyx b/odps/tunnel/io/reader_c.pyx index cf79fc15..710a1e35 100644 --- a/odps/tunnel/io/reader_c.pyx +++ b/odps/tunnel/io/reader_c.pyx @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,26 +14,29 @@ # limitations under the License. cimport cython + import decimal import json import warnings from collections import OrderedDict + from cpython.datetime cimport import_datetime from libc.stdint cimport * from libc.string cimport * from ...src.types_c cimport BaseRecord from ...src.utils_c cimport CMillisecondsConverter -from ..pb.decoder_c cimport CDecoder from ..checksum_c cimport Checksum +from ..pb.decoder_c cimport CDecoder -from ... import utils, types, compat, options -from ...errors import DatetimeOverflowError +from ... 
import compat, options, types, utils +from ...errors import ChecksumError, DatetimeOverflowError from ...models import Record -from ...readers import AbstractRecordReader +from ...readers import AbstractRecordReader # noqa from ...types import PartitionSpec from ..wireconstants import ProtoWireConstants + cdef int64_t MAX_READ_SIZE_LIMIT = (1 << 63) - 1 @@ -65,10 +68,21 @@ cdef: import_datetime() cdef class BaseTunnelRecordReader: - def __init__(self, object schema, object input_stream, columns=None, partition_spec=None): + def __init__( + self, + object schema, + object stream_creator, + object columns=None, + object partition_spec=None, + bint append_partitions=True, + ): self._schema = schema if columns is None: - self._columns = self._schema.columns + self._columns = ( + self._schema.columns + if append_partitions + else self._schema.simple_columns + ) else: self._columns = [self._schema[c] for c in columns] self._reader_schema = types.OdpsSchema(columns=self._columns) @@ -77,6 +91,7 @@ cdef class BaseTunnelRecordReader: self._overflow_date_as_none = options.tunnel.overflow_date_as_none self._struct_as_dict = options.struct_as_dict self._partition_vals = [] + self._append_partitions = append_partitions partition_spec = PartitionSpec(partition_spec) if partition_spec is not None else None @@ -118,17 +133,31 @@ cdef class BaseTunnelRecordReader: if partition_spec is not None and self._columns[i].name in partition_spec: self._partition_vals.append((i, partition_spec[self._columns[i].name])) - self._reader = CDecoder(input_stream) - self._crc = Checksum() - self._crccrc = Checksum() self._curr_cursor = 0 + self._stream_creator = stream_creator + self._reopen_reader() + self._read_limit = -1 if options.table_read_limit is None else options.table_read_limit self._mills_converter = CMillisecondsConverter() self._mills_converter_utc = CMillisecondsConverter(local_tz=False) self._to_date = utils.to_date + self._n_injected_error_cursor = -1 + self._injected_error_exc = None + + def _reopen_reader(self): + self._last_n_bytes = self._reader.position() if self._curr_cursor != 0 else 0 + self._reader = CDecoder(self._stream_creator(self._curr_cursor)) + self._crc = Checksum() + self._crccrc = Checksum() + self._attempt_row_count = 0 + + def _inject_error(self, cursor, exc): + self._n_injected_error_cursor = cursor + self._injected_error_exc = exc + def _mode(self): - return 'c' + return "c" @property def count(self): @@ -184,7 +213,7 @@ cdef class BaseTunnelRecordReader: elif isinstance(data_type, types.Struct): val = self._read_struct(data_type) else: - raise IOError('Unsupported type %s' % data_type) + raise IOError("Unsupported type %s" % data_type) return val cdef list _read_array(self, object value_type): @@ -357,7 +386,7 @@ cdef class BaseTunnelRecordReader: self._set_record_list_value(record, i, json.loads(val)) return 0 - cpdef read(self): + cdef _read(self): cdef: int index int checksum @@ -368,9 +397,13 @@ cdef class BaseTunnelRecordReader: BaseRecord record list rec_list + if self._n_injected_error_cursor == self._curr_cursor: + self._n_injected_error_cursor = -1 + raise self._injected_error_exc + if self._curr_cursor >= self._read_limit > 0: warnings.warn( - 'Number of lines read via tunnel already reaches the limitation.', + "Number of lines read via tunnel already reaches the limitation.", RuntimeWarning, ) return None @@ -386,27 +419,29 @@ cdef class BaseTunnelRecordReader: if index == WIRE_TUNNEL_END_RECORD: checksum = self._crc.c_getvalue() if self._reader.read_uint32() != checksum: 
- raise IOError('Checksum invalid') + raise ChecksumError("Checksum invalid") self._crc.c_reset() self._crccrc.c_update_int(checksum) break if index == WIRE_TUNNEL_META_COUNT: - if self._curr_cursor != self._reader.read_sint64(): - raise IOError('count does not match') + if self._attempt_row_count != self._reader.read_sint64(): + raise IOError("count does not match") idx_of_checksum = self._reader.read_field_number() if WIRE_TUNNEL_META_CHECKSUM != idx_of_checksum: - raise IOError('Invalid stream data.') + raise IOError("Invalid stream data.") if self._crccrc.c_getvalue() != self._reader.read_uint32(): - raise IOError('Checksum invalid.') + raise ChecksumError("Checksum invalid.") # if not self._reader.at_end(): # raise IOError('Expect at the end of stream, but not.') return if index > self._n_columns: - raise IOError('Invalid protobuf tag. Perhaps the datastream ' - 'from server is crushed.') + raise IOError( + "Invalid protobuf tag. Perhaps the datastream " + "from server is crushed." + ) self._crc.c_update_int(index) @@ -427,26 +462,39 @@ cdef class BaseTunnelRecordReader: val = self._read_struct(data_type) rec_list[i] = self._schema_snapshot.validate_value(i, val, MAX_READ_SIZE_LIMIT) else: - raise IOError('Unsupported type %s' % data_type) + raise IOError("Unsupported type %s" % data_type) - for idx, val in self._partition_vals: - rec_list[idx] = val + if self._append_partitions: + for idx, val in self._partition_vals: + rec_list[idx] = val + self._attempt_row_count += 1 self._curr_cursor += 1 return record + cpdef read(self): + cdef int retry_num = 0 + while True: + try: + return self._read() + except: + retry_num += 1 + if retry_num > options.retry_times: + raise + self._reopen_reader() + def reads(self): return self.__iter__() @property def n_bytes(self): - return self._reader.position() + return self._last_n_bytes + self._reader.position() def get_total_bytes(self): return self.n_bytes def close(self): - if hasattr(self._schema, 'close'): + if hasattr(self._schema, "close"): self._schema.close() def __enter__(self): @@ -492,8 +540,8 @@ cdef inline int32_t decimal_print_dig( data = val[i] while data != 0: r = data // 10 - ptr[0] = data - r * 10 + ord('0') - if ptr[0] != ord('0') and (not tail or ret[0] == ord('0')): + ptr[0] = data - r * 10 + ord("0") + if ptr[0] != ord("0") and (not tail or ret[0] == ord("0")): ret = ptr data = r ptr -= 1 @@ -524,7 +572,7 @@ cpdef convert_legacy_decimal_bytes(bytes value, int32_t frac = 0): cdef char buf[9 * (2 + 4) + 4] cdef char *buf_ptr = buf - memset(buf_ptr, ord('0'), sizeof(buf)) + memset(buf_ptr, ord("0"), sizeof(buf)) if is_null: # pragma: no cover return None @@ -544,7 +592,7 @@ cpdef convert_legacy_decimal_bytes(bytes value, int32_t frac = 0): if sign: start -= 1 - start[0] = ord('-') + start[0] = ord("-") cdef int32_t fcnt = decimal_print_dig( buf_ptr + DECIMAL_PREC_DIGS + 1, data, DECIMAL_FRAC_CNT, True @@ -554,7 +602,7 @@ cpdef convert_legacy_decimal_bytes(bytes value, int32_t frac = 0): else: frac = DECIMAL_FRAC_DIGS fcnt = max(fcnt, frac) - buf[DECIMAL_INTG_DIGS + 1] = ord('.') + buf[DECIMAL_INTG_DIGS + 1] = ord(".") dec_cnt = buf_ptr + DECIMAL_INTG_DIGS + 1 - start + (fcnt + 1 if fcnt > 0 else 0) return decimal.Decimal(start[0:dec_cnt].decode()) diff --git a/odps/tunnel/io/stream.py b/odps/tunnel/io/stream.py index 895ab260..488438d4 100644 --- a/odps/tunnel/io/stream.py +++ b/odps/tunnel/io/stream.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. 
+# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,10 +15,10 @@ # limitations under the License. import sys -import zlib import threading +import zlib -from ... import errors, compat, options +from ... import compat, errors, options from ...compat import BytesIO, Enum, Semaphore, six from ..errors import TunnelError @@ -37,11 +37,14 @@ if compat.six.PY3: + def cast_memoryview(v): if not isinstance(v, memoryview): v = memoryview(v) - return v.cast('B') + return v.cast("B") + else: + def cast_memoryview(v): if not isinstance(v, memoryview): v = memoryview(v) @@ -53,7 +56,10 @@ class RequestsIO(object): def __new__(cls, *args, **kwargs): if cls is RequestsIO: - if not isinstance(threading.current_thread(), threading._MainThread) or _FORCE_THREAD: + if ( + not isinstance(threading.current_thread(), threading._MainThread) + or _FORCE_THREAD + ): return object.__new__(ThreadRequestsIO) elif GreenletRequestsIO is not None: return object.__new__(GreenletRequestsIO) @@ -197,14 +203,13 @@ def put(self, data): class CompressOption(object): - class CompressAlgorithm(Enum): - ODPS_RAW = 'RAW' - ODPS_ZLIB = 'ZLIB' - ODPS_SNAPPY = 'SNAPPY' - ODPS_ZSTD = 'ZSTD' - ODPS_LZ4 = 'LZ4' - ODPS_ARROW_LZ4 = 'ARROW_LZ4' + ODPS_RAW = "RAW" + ODPS_ZLIB = "ZLIB" + ODPS_SNAPPY = "SNAPPY" + ODPS_ZSTD = "ZSTD" + ODPS_LZ4 = "LZ4" + ODPS_ARROW_LZ4 = "ARROW_LZ4" def get_encoding(self, legacy=True): cls = type(self) @@ -212,58 +217,59 @@ def get_encoding(self, legacy=True): if self == cls.ODPS_RAW: return None elif self == cls.ODPS_ZLIB: - return 'deflate' + return "deflate" elif self == cls.ODPS_ZSTD: - return 'zstd' + return "zstd" elif self == cls.ODPS_LZ4: - return 'x-lz4-frame' + return "x-lz4-frame" elif self == cls.ODPS_SNAPPY: - return 'x-snappy-framed' + return "x-snappy-framed" elif self == cls.ODPS_ARROW_LZ4: - return 'x-odps-lz4-frame' + return "x-odps-lz4-frame" else: - raise TunnelError('invalid compression option') + raise TunnelError("invalid compression option") else: if self == cls.ODPS_RAW: return None elif self == cls.ODPS_ZSTD: - return 'ZSTD' + return "ZSTD" elif self == cls.ODPS_LZ4 or self == cls.ODPS_ARROW_LZ4: - return 'LZ4_FRAME' + return "LZ4_FRAME" else: - raise TunnelError('invalid compression option') + raise TunnelError("invalid compression option") @classmethod def from_encoding(cls, encoding): encoding = encoding.lower() if encoding else None - if encoding is None or encoding == 'identity': + if encoding is None or encoding == "identity": return cls.ODPS_RAW - elif encoding == 'deflate': + elif encoding == "deflate": return cls.ODPS_ZLIB - elif encoding == 'zstd': + elif encoding == "zstd": return cls.ODPS_ZSTD - elif encoding == 'x-lz4-frame': + elif encoding == "x-lz4-frame": return cls.ODPS_LZ4 - elif encoding == 'x-snappy-framed': + elif encoding == "x-snappy-framed": return cls.ODPS_SNAPPY - elif encoding == 'x-odps-lz4-frame' or encoding == "lz4_frame": + elif encoding == "x-odps-lz4-frame" or encoding == "lz4_frame": return cls.ODPS_ARROW_LZ4 else: - raise TunnelError('invalid encoding name %s' % encoding) + raise TunnelError("invalid encoding name %s" % encoding) - def __init__(self, compress_algo=CompressAlgorithm.ODPS_ZLIB, - level=None, strategy=None): + def __init__( + self, compress_algo=CompressAlgorithm.ODPS_ZLIB, level=None, strategy=None + ): if isinstance(compress_algo, CompressOption.CompressAlgorithm): self.algorithm = compress_algo 
else: - self.algorithm = \ - CompressOption.CompressAlgorithm(compress_algo.upper()) + self.algorithm = CompressOption.CompressAlgorithm(compress_algo.upper()) self.level = level or 1 self.strategy = strategy or 0 _lz4_algorithms = ( - CompressOption.CompressAlgorithm.ODPS_LZ4, CompressOption.CompressAlgorithm.ODPS_ARROW_LZ4 + CompressOption.CompressAlgorithm.ODPS_LZ4, + CompressOption.CompressAlgorithm.ODPS_ARROW_LZ4, ) @@ -281,7 +287,7 @@ def get_compress_stream(buffer, compress_option=None): elif algo in _lz4_algorithms: return LZ4OutputStream(buffer, level=compress_option.level) else: - raise errors.InvalidArgument('Invalid compression algorithm %s.' % algo) + raise errors.InvalidArgument("Invalid compression algorithm %s." % algo) def get_decompress_stream(resp, compress_option=None, requests=True): @@ -297,7 +303,7 @@ def get_decompress_stream(resp, compress_option=None, requests=True): elif algo in _lz4_algorithms: stream_cls = LZ4RequestsInputStream else: - raise errors.InvalidArgument('Invalid compression algorithm %s.' % algo) + raise errors.InvalidArgument("Invalid compression algorithm %s." % algo) if not requests: stream_cls = stream_cls.get_raw_input_stream_class() @@ -340,7 +346,8 @@ def _get_compressor(self, level=1): import snappy except ImportError: raise errors.DependencyNotInstalledError( - "python-snappy library is required for snappy support") + "python-snappy library is required for snappy support" + ) return snappy.StreamCompressor() @@ -350,7 +357,8 @@ def _get_compressor(self, level=1): import zstandard except ImportError: raise errors.DependencyNotInstalledError( - "zstandard library is required for zstd support") + "zstandard library is required for zstd support" + ) return zstandard.ZstdCompressor().compressobj() @@ -360,7 +368,8 @@ def _get_compressor(self, level=1): import lz4.frame except ImportError: raise errors.DependencyNotInstalledError( - "lz4 library is required for lz4 support") + "lz4 library is required for lz4 support" + ) self._begun = False return lz4.frame.LZ4FrameCompressor(compression_level=level) @@ -372,27 +381,31 @@ def write(self, data): class SimpleInputStream(object): - READ_BLOCK_SIZE = 1024 * 64 def __init__(self, input): self._input = input - self._internal_buffer = memoryview(b'') + self._internal_buffer = memoryview(b"") self._buffered_len = 0 self._buffered_pos = 0 + self._pos = 0 self._closed = False @staticmethod def readable(): return True + def __len__(self): + return self._pos + def read(self, limit): if self._closed: raise IOError("closed") if limit <= self._buffered_len - self._buffered_pos: - mv = self._internal_buffer[self._buffered_pos:self._buffered_pos + limit] + mv = self._internal_buffer[self._buffered_pos : self._buffered_pos + limit] self._buffered_pos += len(mv) + self._pos += len(mv) return mv_to_bytes(mv) bufs = list() @@ -403,7 +416,9 @@ def read(self, limit): break bufs.append(content) size_left -= len(content) - return bytes().join(bufs) + ret = bytes().join(bufs) + self._pos += len(ret) + return ret def peek(self): if self._buffered_pos == self._buffered_len: @@ -421,9 +436,10 @@ def readinto(self, b): b = cast_memoryview(b) limit = len(b) if limit <= self._buffered_len - self._buffered_pos: - mv = self._internal_buffer[self._buffered_pos:self._buffered_pos + limit] + mv = self._internal_buffer[self._buffered_pos : self._buffered_pos + limit] self._buffered_pos += len(mv) b[:limit] = mv + self._pos += len(mv) return len(mv) pos = 0 @@ -432,12 +448,13 @@ def readinto(self, b): if not rsize: break pos += rsize 
+ self._pos += pos return pos def _internal_read(self, limit): if self._buffered_pos == self._buffered_len: self._refill_buffer() - mv = self._internal_buffer[self._buffered_pos:self._buffered_pos + limit] + mv = self._internal_buffer[self._buffered_pos : self._buffered_pos + limit] self._buffered_pos += len(mv) return mv_to_bytes(mv) @@ -445,10 +462,10 @@ def _internal_readinto(self, b, start): if self._buffered_pos == self._buffered_len: self._refill_buffer() size = len(b) - start - mv = self._internal_buffer[self._buffered_pos:self._buffered_pos + size] + mv = self._internal_buffer[self._buffered_pos : self._buffered_pos + size] size = len(mv) self._buffered_pos += size - b[start:start + size] = mv + b[start : start + size] = mv return size def _refill_buffer(self): @@ -515,7 +532,8 @@ def _get_decompressor(self): import snappy except ImportError: raise errors.DependencyNotInstalledError( - "python-snappy library is required for snappy support") + "python-snappy library is required for snappy support" + ) return snappy.StreamDecompressor() @@ -525,7 +543,8 @@ def _get_decompressor(self): import zstandard except ImportError: raise errors.DependencyNotInstalledError( - "zstandard library is required for zstd support") + "zstandard library is required for zstd support" + ) return zstandard.ZstdDecompressor().decompressobj() @@ -535,7 +554,8 @@ def _get_decompressor(self): import lz4.frame except ImportError: raise errors.DependencyNotInstalledError( - "lz4 library is required for lz4 support") + "lz4 library is required for lz4 support" + ) return lz4.frame.LZ4FrameDecompressor() diff --git a/odps/tunnel/io/types.py b/odps/tunnel/io/types.py index e1b44c5b..f6edb7f5 100644 --- a/odps/tunnel/io/types.py +++ b/odps/tunnel/io/types.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -23,7 +23,21 @@ if pa is not None: - _ODPS_ARROW_TYPE_MAPPING = { + _ARROW_TO_ODPS_TYPE = { + pa.string(): odps_types.string, + pa.binary(): odps_types.binary, + pa.int8(): odps_types.tinyint, + pa.int16(): odps_types.smallint, + pa.int32(): odps_types.int_, + pa.int64(): odps_types.bigint, + pa.bool_(): odps_types.boolean, + pa.float32(): odps_types.float_, + pa.float64(): odps_types.double, + pa.date32(): odps_types.date, + pa.timestamp("ms"): odps_types.datetime, + pa.timestamp("ns"): odps_types.timestamp, + } + _ODPS_TO_ARROW_TYPE = { odps_types.string: pa.string(), odps_types.binary: pa.binary(), odps_types.tinyint: pa.int8(), @@ -34,18 +48,19 @@ odps_types.float_: pa.float32(), odps_types.double: pa.float64(), odps_types.date: pa.date32(), - odps_types.datetime: pa.timestamp('ms'), - odps_types.timestamp: pa.timestamp('ns'), + odps_types.datetime: pa.timestamp("ms"), + odps_types.timestamp: pa.timestamp("ns"), } else: - _ODPS_ARROW_TYPE_MAPPING = {} + _ARROW_TO_ODPS_TYPE = {} + _ODPS_TO_ARROW_TYPE = {} def odps_type_to_arrow_type(odps_type): from ... 
import types - if odps_type in _ODPS_ARROW_TYPE_MAPPING: - col_type = _ODPS_ARROW_TYPE_MAPPING[odps_type] + if odps_type in _ODPS_TO_ARROW_TYPE: + col_type = _ODPS_TO_ARROW_TYPE[odps_type] else: if isinstance(odps_type, types.Array): col_type = pa.list_(odps_type_to_arrow_type(odps_type.value_type)) @@ -71,16 +86,57 @@ def odps_type_to_arrow_type(odps_type): elif isinstance(odps_type, odps_types.IntervalDayTime): col_type = pa.struct([("sec", pa.int64()), ("nano", pa.int32())]) else: - raise TypeError('Unsupported type: {}'.format(odps_type)) + raise TypeError("Unsupported type: {}".format(odps_type)) return col_type def odps_schema_to_arrow_schema(odps_schema): - arrow_schema = [] - for schema in odps_schema.simple_columns: - col_name = schema.name - col_type = odps_type_to_arrow_type(schema.type) + for col in odps_schema.simple_columns: + col_name = col.name + col_type = odps_type_to_arrow_type(col.type) arrow_schema.append(pa.field(col_name, col_type)) return pa.schema(arrow_schema) + + +def arrow_type_to_odps_type(arrow_type): + from ... import types + + if arrow_type in _ARROW_TO_ODPS_TYPE: + col_type = _ARROW_TO_ODPS_TYPE[arrow_type] + else: + if isinstance(arrow_type, pa.ListType): + col_type = types.Array(arrow_type_to_odps_type(arrow_type.value_type)) + elif isinstance(arrow_type, pa.MapType): + col_type = types.Map( + arrow_type_to_odps_type(arrow_type.key_type), + arrow_type_to_odps_type(arrow_type.item_type), + ) + elif isinstance(arrow_type, (pa.Decimal128Type, pa.Decimal256Type)): + precision = arrow_type.precision or types.Decimal._max_precision + scale = arrow_type.scale or types.Decimal._max_scale + col_type = types.Decimal(precision, scale) + elif isinstance(arrow_type, pa.StructType): + fields = [ + ( + arrow_type.field(idx).name, + arrow_type_to_odps_type(arrow_type.field(idx).type), + ) + for idx in arrow_type.num_fields + ] + col_type = types.Struct(fields) + else: + raise TypeError("Unsupported type: {}".format(arrow_type)) + return col_type + + +def arrow_schema_to_odps_schema(arrow_schema): + from ... import types + + odps_cols = [] + for col_name, pa_type in zip(arrow_schema.names, arrow_schema.types): + col_type = arrow_type_to_odps_type(pa_type) + odps_cols.append(types.Column(col_name, col_type)) + + return types.OdpsSchema(odps_cols) diff --git a/odps/tunnel/io/writer.py b/odps/tunnel/io/writer.py index d852366b..fa283e4e 100644 --- a/odps/tunnel/io/writer.py +++ b/odps/tunnel/io/writer.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,7 +16,6 @@ import json import struct -import time try: import pyarrow as pa @@ -35,31 +34,36 @@ except (ImportError, ValueError): pd = None +from ... import compat, options, types, utils +from ...compat import Enum, futures, six +from ..checksum import Checksum +from ..errors import TunnelError from ..pb.encoder import Encoder from ..pb.wire_format import ( - WIRETYPE_VARINT, WIRETYPE_FIXED32, WIRETYPE_FIXED64, WIRETYPE_LENGTH_DELIMITED, + WIRETYPE_VARINT, ) -from ... 
import compat, options, types, utils -from ...compat import Enum, futures, six -from ..checksum import Checksum -from ..errors import TunnelError from ..wireconstants import ProtoWireConstants from .stream import RequestsIO, get_compress_stream from .types import odps_schema_to_arrow_schema + try: if not options.force_py: from ..hasher_c import RecordHasher from .writer_c import BaseRecordWriter else: from ..hasher import RecordHasher + BaseRecordWriter = None except ImportError as e: if options.force_c: raise e - BaseRecordWriter = RecordHasher = None + + from ..hasher import RecordHasher + + BaseRecordWriter = None varint_tag_types = types.integer_types + ( @@ -100,7 +104,7 @@ def _re_init(self, output): self._n_total = 0 def _mode(self): - return 'py' + return "py" def flush(self): if len(self._encoder) > 0: @@ -360,7 +364,9 @@ class RecordWriter(BaseRecordWriter): This writer uploads the output of serializer asynchronously within a long-lived http connection. """ - def __init__(self, schema, request_callback, compress_option=None, encoding="utf-8"): + def __init__( + self, schema, request_callback, compress_option=None, encoding="utf-8" + ): self._req_io = RequestsIO(request_callback, chunk_size=options.chunk_size) out = get_compress_stream(self._req_io, compress_option) @@ -386,6 +392,7 @@ class BufferedRecordWriter(BaseRecordWriter): This writer buffers the output of serializer. When the buffer exceeds a fixed-size of limit (default 20 MiB), it uploads the buffered output within one http connection. """ + def __init__( self, schema, @@ -501,8 +508,8 @@ def write(self, record): def _reset_writer(self, write_response): self._record_count = 0 - slot_server = write_response.headers['odps-tunnel-routed-server'] - slot_num = int(write_response.headers['odps-tunnel-slot-num']) + slot_server = write_response.headers["odps-tunnel-routed-server"] + slot_num = int(write_response.headers["odps-tunnel-slot-num"]) self.session.reload_slots(self.slot, slot_server, slot_num) super(StreamRecordWriter, self)._reset_writer(write_response) @@ -571,15 +578,16 @@ def _localize_timezone(cls, col, tz=None): else: tz = str(options.local_timezone) - try: - if col.type.tz is not None: - return col - col = pac.assume_timezone(col, tz) + if col.type.tz is not None: + return col + if hasattr(pac, "assume_timezone") and isinstance(tz, str): + # pyarrow.compute.assume_timezone only accepts + # string-represented zones + col = pac.assume_timezone(col, timezone=tz) return col - except: - col = col.to_pandas() - col = pa.Array.from_pandas(col.dt.tz_localize(tz)) - return col + else: + pd_col = col.to_pandas().dt.tz_localize(tz) + return pa.Array.from_pandas(pd_col) def write(self, data): if isinstance(data, pd.DataFrame): @@ -591,19 +599,24 @@ def write(self, data): assert isinstance(arrow_data, (pa.RecordBatch, pa.Table)) - if not arrow_data.schema.equals(self._arrow_schema): + if arrow_data.schema != self._arrow_schema or any( + isinstance(tp, pa.TimestampType) for tp in arrow_data.schema.types + ): type_dict = dict(zip(arrow_data.schema.names, arrow_data.schema.types)) column_dict = dict(zip(arrow_data.schema.names, arrow_data.columns)) arrays = [] for name, tp in zip(self._arrow_schema.names, self._arrow_schema.types): if name not in column_dict: - raise ValueError("Input record batch does not contain column %s" % name) + raise ValueError( + "Input record batch does not contain column %s" % name + ) - if tp == pa.timestamp("ms") or tp == pa.timestamp("ns"): + if isinstance(tp, pa.TimestampType): if 
self._schema[name].type == types.timestamp_ntz: - column_dict[name] = self._localize_timezone(column_dict[name], "UTC") + col = self._localize_timezone(column_dict[name], "UTC") else: - column_dict[name] = self._localize_timezone(column_dict[name]) + col = self._localize_timezone(column_dict[name]) + column_dict[name] = col.cast(pa.timestamp(tp.unit, col.type.tz)) if tp == type_dict[name]: arrays.append(column_dict[name]) @@ -611,7 +624,9 @@ def write(self, data): try: arrays.append(column_dict[name].cast(tp, safe=False)) except (pa.ArrowInvalid, pa.ArrowNotImplementedError): - raise ValueError("Failed to cast column %s to type %s" % (name, tp)) + raise ValueError( + "Failed to cast column %s to type %s" % (name, tp) + ) pa_type = type(arrow_data) arrow_data = pa_type.from_arrays(arrays, names=self._arrow_schema.names) @@ -624,9 +639,10 @@ def write(self, data): data = batch.serialize().to_pybytes() written_bytes = 0 while written_bytes < len(data): - length = min(self._chunk_size - self._cur_chunk_size, - len(data) - written_bytes) - chunk_data = data[written_bytes: written_bytes + length] + length = min( + self._chunk_size - self._cur_chunk_size, len(data) - written_bytes + ) + chunk_data = data[written_bytes : written_bytes + length] self._write_chunk(chunk_data) written_bytes += length @@ -656,9 +672,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): class ArrowWriter(BaseArrowWriter): - def __init__( - self, schema, request_callback, compress_option=None, chunk_size=None - ): + def __init__(self, schema, request_callback, compress_option=None, chunk_size=None): self._req_io = RequestsIO(request_callback, chunk_size=chunk_size) out = get_compress_stream(self._req_io, compress_option) @@ -750,8 +764,8 @@ def get_blocks_written(self): class Upsert(object): - DEFAULT_MAX_BUFFER_SIZE = 64 * 1024 ** 2 - DEFAULT_SLOT_BUFFER_SIZE = 1024 ** 2 + DEFAULT_MAX_BUFFER_SIZE = 64 * 1024**2 + DEFAULT_SLOT_BUFFER_SIZE = 1024**2 class Operation(Enum): UPSERT = "UPSERT" @@ -887,7 +901,9 @@ def _write(self, record, op, valid_columns=None): record[self._session.UPSERT_VALUE_COLS_KEY] = [] else: valid_cols_set = set(valid_columns) - col_idxes = [idx for idx, col in self._schema.columns if col in valid_cols_set] + col_idxes = [ + idx for idx, col in self._schema.columns if col in valid_cols_set + ] record[self._session.UPSERT_VALUE_COLS_KEY] = col_idxes writer = self._bucket_writers[bucket] diff --git a/odps/tunnel/io/writer_c.pxd b/odps/tunnel/io/writer_c.pxd index 9dbad940..63cf1a59 100644 --- a/odps/tunnel/io/writer_c.pxd +++ b/odps/tunnel/io/writer_c.pxd @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
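The writer.py hunks above replace a bare try/except around pyarrow.compute.assume_timezone with an explicit capability check before localizing naive timestamp columns. Below is a minimal standalone sketch of that fallback, assuming pyarrow and pandas are installed; the helper name localize_naive_timestamps is illustrative and not part of the PyODPS API.

    import pandas as pd
    import pyarrow as pa
    import pyarrow.compute as pac

    def localize_naive_timestamps(col, tz="UTC"):
        # Timezone-aware columns pass through unchanged.
        if col.type.tz is not None:
            return col
        # Prefer the vectorized kernel when this pyarrow build provides it;
        # assume_timezone only accepts string-represented zones.
        if hasattr(pac, "assume_timezone") and isinstance(tz, str):
            return pac.assume_timezone(col, timezone=tz)
        # Fallback for older pyarrow builds: round-trip through pandas.
        pd_col = pd.Series(col.to_pandas()).dt.tz_localize(tz)
        return pa.Array.from_pandas(pd_col)

    arr = pa.array([1000, 2000], type=pa.timestamp("ms"))
    print(localize_naive_timestamps(arr, "Asia/Shanghai").type)  # timestamp[..., tz=Asia/Shanghai]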
@@ -21,6 +21,7 @@ from ...src.utils_c cimport CMillisecondsConverter from ..checksum_c cimport Checksum from ..pb.encoder_c cimport CEncoder + cdef class ProtobufRecordWriter: cdef int DEFAULT_BUFFER_SIZE @@ -36,14 +37,14 @@ cdef class ProtobufRecordWriter: cpdef close(self) cpdef flush_all(self) cpdef int _refresh_buffer(self) except -1 - cdef int _write_tag(self, int field_num, int wire_type) except + nogil - cdef int _write_raw_long(self, int64_t val) except + nogil - cdef int _write_raw_int(self, int32_t val) except + nogil - cdef int _write_raw_uint(self, uint32_t val) except + nogil - cdef int _write_raw_bool(self, bint val) except + nogil - cdef int _write_raw_float(self, float val) except + nogil - cdef int _write_raw_double(self, double val) except + nogil - cdef int _write_raw_string(self, const char *ptr, uint32_t size) except + nogil + cdef int _write_tag(self, int field_num, int wire_type) except -1 nogil + cdef int _write_raw_long(self, int64_t val) except -1 nogil + cdef int _write_raw_int(self, int32_t val) except -1 nogil + cdef int _write_raw_uint(self, uint32_t val) except -1 nogil + cdef int _write_raw_bool(self, bint val) except -1 nogil + cdef int _write_raw_float(self, float val) except -1 nogil + cdef int _write_raw_double(self, double val) except -1 nogil + cdef int _write_raw_string(self, const char *ptr, uint32_t size) except -1 nogil cdef class BaseRecordWriter(ProtobufRecordWriter): @@ -61,10 +62,10 @@ cdef class BaseRecordWriter(ProtobufRecordWriter): cdef SchemaSnapshot _schema_snapshot cpdef write(self, BaseRecord record) - cdef void _write_bool(self, bint data) except + nogil - cdef void _write_long(self, int64_t data) except + nogil - cdef void _write_float(self, float data) except + nogil - cdef void _write_double(self, double data) except + nogil + cdef int _write_bool(self, bint data) except -1 nogil + cdef int _write_long(self, int64_t data) except -1 nogil + cdef int _write_float(self, float data) except -1 nogil + cdef int _write_double(self, double data) except -1 nogil cdef _write_string(self, object data) cdef _write_timestamp_base(self, object data, bint ntz) cdef _write_timestamp(self, object data) @@ -74,4 +75,4 @@ cdef class BaseRecordWriter(ProtobufRecordWriter): cdef _write_array(self, object data, object data_type) cdef _write_struct(self, object data, object data_type) cpdef _write_finish_tags(self) - cpdef close(self) \ No newline at end of file + cpdef close(self) diff --git a/odps/tunnel/io/writer_c.pyx b/odps/tunnel/io/writer_c.pyx index ef298178..8cd6c42b 100644 --- a/odps/tunnel/io/writer_c.pyx +++ b/odps/tunnel/io/writer_c.pyx @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ import json import time import cython + from cpython.datetime cimport import_datetime from libc.stdint cimport * from libc.string cimport * @@ -26,13 +27,15 @@ from ...src.types_c cimport BaseRecord, SchemaSnapshot from ..checksum_c cimport Checksum from ..pb.encoder_c cimport CEncoder -from ... import types, compat, utils +from ... 
import compat, types, utils from ...compat import six + from ...src.utils_c cimport CMillisecondsConverter -from ..pb.wire_format import WIRETYPE_VARINT as PY_WIRETYPE_VARINT, \ - WIRETYPE_FIXED32 as PY_WIRETYPE_FIXED32,\ - WIRETYPE_FIXED64 as PY_WIRETYPE_FIXED64,\ - WIRETYPE_LENGTH_DELIMITED as PY_WIRETYPE_LENGTH_DELIMITED + +from ..pb.wire_format import WIRETYPE_FIXED32 as PY_WIRETYPE_FIXED32 +from ..pb.wire_format import WIRETYPE_FIXED64 as PY_WIRETYPE_FIXED64 +from ..pb.wire_format import WIRETYPE_LENGTH_DELIMITED as PY_WIRETYPE_LENGTH_DELIMITED +from ..pb.wire_format import WIRETYPE_VARINT as PY_WIRETYPE_VARINT from ..wireconstants import ProtoWireConstants cdef: @@ -98,7 +101,7 @@ cdef class ProtobufRecordWriter: self._n_total = 0 def _mode(self): - return 'c' + return "c" cpdef flush(self): if self._encoder.position() > 0: @@ -126,34 +129,34 @@ cdef class ProtobufRecordWriter: def __len__(self): return self.n_bytes - cdef int _write_tag(self, int field_num, int wire_type) except + nogil: + cdef int _write_tag(self, int field_num, int wire_type) except -1 nogil: return self._encoder.append_tag(field_num, wire_type) - cdef int _write_raw_long(self, int64_t val) except + nogil: + cdef int _write_raw_long(self, int64_t val) except -1 nogil: return self._encoder.append_sint64(val) - cdef int _write_raw_int(self, int32_t val) except + nogil: + cdef int _write_raw_int(self, int32_t val) except -1 nogil: return self._encoder.append_sint32(val) - cdef int _write_raw_uint(self, uint32_t val) except + nogil: + cdef int _write_raw_uint(self, uint32_t val) except -1 nogil: return self._encoder.append_uint32(val) - cdef int _write_raw_bool(self, bint val) except + nogil: + cdef int _write_raw_bool(self, bint val) except -1 nogil: return self._encoder.append_bool(val) - cdef int _write_raw_float(self, float val) except + nogil: + cdef int _write_raw_float(self, float val) except -1 nogil: return self._encoder.append_float(val) - cdef int _write_raw_double(self, double val) except + nogil: + cdef int _write_raw_double(self, double val) except -1 nogil: return self._encoder.append_double(val) - cdef int _write_raw_string(self, const char *ptr, uint32_t size) except + nogil: + cdef int _write_raw_string(self, const char *ptr, uint32_t size) except -1 nogil: return self._encoder.append_string(ptr, size) cdef class BaseRecordWriter(ProtobufRecordWriter): - def __init__(self, object schema, object out, encoding='utf-8'): + def __init__(self, object schema, object out, encoding="utf-8"): self._encoding = encoding self._is_utf8 = encoding == "utf-8" self._schema = schema @@ -191,7 +194,7 @@ cdef class BaseRecordWriter(ProtobufRecordWriter): n_record_fields = len(record) if n_record_fields > self._n_columns: - raise IOError('record fields count is more than schema.') + raise IOError("record fields count is more than schema.") for i in range(min(n_record_fields, self._n_columns)): if self._schema_snapshot._col_is_partition[i]: @@ -213,7 +216,7 @@ cdef class BaseRecordWriter(ProtobufRecordWriter): if isinstance(data_type, (types.Array, types.Map, types.Struct)): self._write_tag(pb_index, WIRETYPE_LENGTH_DELIMITED) else: - raise IOError('Invalid data type: %s' % data_type) + raise IOError("Invalid data type: %s" % data_type) self._write_field(val, data_type_id, data_type) @@ -226,21 +229,25 @@ cdef class BaseRecordWriter(ProtobufRecordWriter): self._crccrc_c.c_update_int(checksum) self._curr_cursor_c += 1 - cdef void _write_bool(self, bint data) except + nogil: + cdef int _write_bool(self, bint data) except 
-1 nogil: self._crc_c.c_update_bool(data) self._write_raw_bool(data) + return 0 - cdef void _write_long(self, int64_t data) except + nogil: + cdef int _write_long(self, int64_t data) except -1 nogil: self._crc_c.c_update_long(data) self._write_raw_long(data) + return 0 - cdef void _write_float(self, float data) except + nogil: + cdef int _write_float(self, float data) except -1 nogil: self._crc_c.c_update_float(data) self._write_raw_float(data) + return 0 - cdef void _write_double(self, double data) except + nogil: + cdef int _write_double(self, double data) except -1 nogil: self._crc_c.c_update_double(data) self._write_raw_double(data) + return 0 @cython.nonecheck(False) cdef _write_string(self, object data): @@ -337,7 +344,7 @@ cdef class BaseRecordWriter(ProtobufRecordWriter): elif isinstance(data_type, types.Struct): self._write_struct(val, data_type) else: - raise IOError('Invalid data type: %s' % data_type) + raise IOError("Invalid data type: %s" % data_type) cdef _write_array(self, object data, object data_type): cdef int data_type_id = data_type._type_id diff --git a/odps/tunnel/pb/decoder.py b/odps/tunnel/pb/decoder.py index f6430096..4fecddaa 100644 --- a/odps/tunnel/pb/decoder.py +++ b/odps/tunnel/pb/decoder.py @@ -21,16 +21,14 @@ import struct -from . import input_stream -from . import wire_format +from . import input_stream, wire_format class Decoder(object): """Decodes logical protocol buffer fields from the wire.""" def __init__(self, input): - """Initializes the decoder to read from input stream. - """ + """Initializes the decoder to read from input stream.""" self._stream = input_stream.InputStream(input) def __len__(self): @@ -72,25 +70,25 @@ def read_sfixed32(self): """Reads and returns a signed, fixed-width, 32-bit integer.""" value = self._stream.read_little_endian32() if value >= (1 << 31): - value -= (1 << 32) + value -= 1 << 32 return value def read_sfixed64(self): """Reads and returns a signed, fixed-width, 64-bit integer.""" value = self._stream.read_little_endian64() if value >= (1 << 63): - value -= (1 << 64) + value -= 1 << 64 return value def read_float(self): """Reads and returns a 4-byte floating-point number.""" serialized = self._stream.read_string(4) - return struct.unpack('f', serialized)[0] + return struct.unpack("f", serialized)[0] def read_double(self): """Reads and returns an 8-byte floating-point number.""" serialized = self._stream.read_string(8) - return struct.unpack('d', serialized)[0] + return struct.unpack("d", serialized)[0] def read_bool(self): """Reads and returns a bool.""" diff --git a/odps/tunnel/pb/decoder_c.pxd b/odps/tunnel/pb/decoder_c.pxd index c1ea00c2..73d55529 100644 --- a/odps/tunnel/pb/decoder_c.pxd +++ b/odps/tunnel/pb/decoder_c.pxd @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/odps/tunnel/pb/decoder_c.pyx b/odps/tunnel/pb/decoder_c.pyx index 2e081261..b883e9db 100644 --- a/odps/tunnel/pb/decoder_c.pyx +++ b/odps/tunnel/pb/decoder_c.pyx @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
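The decoder hunks in this area deal with how signed tunnel fields come off the wire: an unsigned base-128 varint is read first and then ZigZag-decoded into a signed integer. The sketch below shows that standard protobuf behavior on its own; it is illustrative and not copied from the PyODPS decoder classes.

    def read_varint(buf, pos=0):
        # Decode one unsigned base-128 varint starting at buf[pos].
        result = shift = 0
        while True:
            byte = buf[pos]
            result |= (byte & 0x7F) << shift
            pos += 1
            if not byte & 0x80:
                return result, pos
            shift += 7

    def zigzag_decode(value):
        # Map the unsigned ZigZag representation back to a signed integer.
        return (value >> 1) ^ -(value & 1)

    unsigned, _ = read_varint(bytes([0x95, 0x06]))  # varint bytes for 789
    print(unsigned, zigzag_decode(unsigned))        # 789 -395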
@@ -18,7 +18,8 @@ include "util_c.pxi" from libc.stdint cimport * from libc.string cimport * -from .wire_format import TAG_TYPE_BITS as PY_TAG_TYPE_BITS, _TAG_TYPE_MASK as _PY_TAG_TYPE_MASK +from .wire_format import _TAG_TYPE_MASK as _PY_TAG_TYPE_MASK +from .wire_format import TAG_TYPE_BITS as PY_TAG_TYPE_BITS cdef: int TAG_TYPE_BITS = PY_TAG_TYPE_BITS @@ -132,11 +133,11 @@ cdef class CDecoder: need = 0 if len(result) == 0: - return b'' + return b"" elif len(result) == 1: return result[0] else: - return b''.join(result) + return b"".join(result) cdef int _load_next_buffer(self) except -1 with gil: if self._is_source_eof and (self._begin >= self._end): diff --git a/odps/tunnel/pb/encoder.py b/odps/tunnel/pb/encoder.py index 88cf2f65..079104c9 100644 --- a/odps/tunnel/pb/encoder.py +++ b/odps/tunnel/pb/encoder.py @@ -21,9 +21,7 @@ import struct -from . import errors -from . import wire_format -from . import output_stream +from . import errors, output_stream, wire_format class Encoder(object): @@ -83,8 +81,8 @@ def append_sfixed32(self, value): """ sign = (value & 0x80000000) and -1 or 0 if value >> 32 != sign: - raise errors.EncodeError('SFixed32 out of range: %d' % value) - self._stream.append_little_endian32(value & 0xffffffff) + raise errors.EncodeError("SFixed32 out of range: %d" % value) + self._stream.append_little_endian32(value & 0xFFFFFFFF) def append_sfixed64(self, value): """Appends a signed 64-bit integer to our buffer, in little-endian @@ -92,16 +90,16 @@ def append_sfixed64(self, value): """ sign = (value & 0x8000000000000000) and -1 or 0 if value >> 64 != sign: - raise errors.EncodeError('SFixed64 out of range: %d' % value) - self._stream.append_little_endian64(value & 0xffffffffffffffff) + raise errors.EncodeError("SFixed64 out of range: %d" % value) + self._stream.append_little_endian64(value & 0xFFFFFFFFFFFFFFFF) def append_float(self, value): """Appends a floating-point number to our buffer.""" - self._stream.append_raw_bytes(struct.pack('f', value)) + self._stream.append_raw_bytes(struct.pack("f", value)) def append_double(self, value): """Appends a double-precision floating-point number to our buffer.""" - self._stream.append_raw_bytes(struct.pack('d', value)) + self._stream.append_raw_bytes(struct.pack("d", value)) def append_bool(self, value): """Appends a boolean to our buffer.""" diff --git a/odps/tunnel/pb/encoder_c.pxd b/odps/tunnel/pb/encoder_c.pxd index 2e1e764a..8ac615d3 100644 --- a/odps/tunnel/pb/encoder_c.pxd +++ b/odps/tunnel/pb/encoder_c.pxd @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
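
Aside on the varint helpers that this encoder and `util_c.pxi` implement: integers are stored base-128, seven payload bits per byte, with 0x80 marking a continuation byte, which is why the loops in this patch mask with 0x7F and test 0x80. A short pure-Python sketch under those assumptions; the function names are illustrative.

    def write_varint(value):
        # Base-128 encoding: 7 payload bits per byte, 0x80 flags continuation.
        # Assumes value >= 0, as the unsigned append path does.
        out = bytearray()
        while True:
            bits = value & 0x7F
            value >>= 7
            if value:
                out.append(bits | 0x80)
            else:
                out.append(bits)
                return bytes(out)

    def read_varint(data, pos=0):
        result = shift = 0
        while True:
            b = data[pos]
            pos += 1
            result |= (b & 0x7F) << shift
            shift += 7
            if not (b & 0x80):
                return result, pos

    # 300 encodes to b"\xac\x02" and decodes back, consuming 2 bytes.
    assert read_varint(write_varint(300)) == (300, 2)
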
@@ -15,6 +15,7 @@ from libc.stdint cimport * from libc.string cimport * + from ...src.stringstream cimport stringstream @@ -23,15 +24,15 @@ cdef class CEncoder: cdef size_t position(self) nogil cpdef bytes tostring(self) - cdef int append_tag(self, int field_num, int wire_type) except + nogil - cdef int append_sint32(self, int32_t value) except + nogil - cdef int append_uint32(self, uint32_t value) except + nogil - cdef int append_sint64(self, int64_t value) except + nogil - cdef int append_uint64(self, uint64_t value) except + nogil - cdef int append_bool(self, bint value) except + nogil - cdef int append_double(self, double value) except + nogil - cdef int append_float(self, float value) except + nogil - cdef int append_string(self, const char *ptr, size_t value_len) except + nogil + cdef int append_tag(self, int field_num, int wire_type) except -1 nogil + cdef int append_sint32(self, int32_t value) except -1 nogil + cdef int append_uint32(self, uint32_t value) except -1 nogil + cdef int append_sint64(self, int64_t value) except -1 nogil + cdef int append_uint64(self, uint64_t value) except -1 nogil + cdef int append_bool(self, bint value) except -1 nogil + cdef int append_double(self, double value) except -1 nogil + cdef int append_float(self, float value) except -1 nogil + cdef int append_string(self, const char *ptr, size_t value_len) except -1 nogil cdef class Encoder: diff --git a/odps/tunnel/pb/encoder_c.pyx b/odps/tunnel/pb/encoder_c.pyx index d7bf66ab..c96c79c1 100644 --- a/odps/tunnel/pb/encoder_c.pyx +++ b/odps/tunnel/pb/encoder_c.pyx @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
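
Aside on the encoder internals that follow: `append_tag` packs a key as `(field_num << 3) | wire_type`, and the signed appenders store zigzag-encoded varints, the `(varint << 1) ^ (varint >> 31)` and `(value >> 1) ^ (-(value & 1))` pair seen in `util_c.pxi`. A compact pure-Python sketch of both ideas, with illustrative names only.

    TAG_TYPE_BITS = 3                      # low 3 bits of a key hold the wire type
    TAG_TYPE_MASK = (1 << TAG_TYPE_BITS) - 1

    def make_tag(field_num, wire_type):
        # Pack field number and wire type into a single varint key.
        return (field_num << TAG_TYPE_BITS) | wire_type

    def split_tag(key):
        return key >> TAG_TYPE_BITS, key & TAG_TYPE_MASK

    def zigzag_encode(n, bits=32):
        # Map signed ints onto unsigned ones: 0, -1, 1, -2, ... -> 0, 1, 2, 3, ...
        return (n << 1) ^ (n >> (bits - 1))

    def zigzag_decode(value):
        return (value >> 1) ^ (-(value & 1))

    assert split_tag(make_tag(5, 2)) == (5, 2)
    assert zigzag_decode(zigzag_encode(-3) & 0xFFFFFFFF) == -3
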
@@ -18,6 +18,7 @@ include "util_c.pxi" from libc.stdint cimport * from libc.string cimport * from libcpp.string cimport string + from ...src.stringstream cimport stringstream @@ -37,36 +38,36 @@ cdef class CEncoder: cpdef bytes tostring(self): return bytes(self._buffer.to_string()) - cdef int append_tag(self, int field_num, int wire_type) except + nogil: + cdef int append_tag(self, int field_num, int wire_type) except -1 nogil: cdef int key key = (field_num << 3) | wire_type cdef int size = set_varint64(key, self._buffer[0]) return size - cdef int append_sint32(self, int32_t value) except + nogil: + cdef int append_sint32(self, int32_t value) except -1 nogil: return set_signed_varint32(value, self._buffer[0]) - cdef int append_uint32(self, uint32_t value) except + nogil: + cdef int append_uint32(self, uint32_t value) except -1 nogil: return set_varint32(value, self._buffer[0]) - cdef int append_sint64(self, int64_t value) except + nogil: + cdef int append_sint64(self, int64_t value) except -1 nogil: return set_signed_varint64(value, self._buffer[0]) - cdef int append_uint64(self, uint64_t value) except + nogil: + cdef int append_uint64(self, uint64_t value) except -1 nogil: return set_varint64(value, self._buffer[0]) - cdef int append_bool(self, bint value) except + nogil: + cdef int append_bool(self, bint value) except -1 nogil: return set_varint32(value, self._buffer[0]) - cdef int append_float(self, float value) except + nogil: + cdef int append_float(self, float value) except -1 nogil: self._buffer.write(&value, sizeof(float)) return sizeof(float) - cdef int append_double(self, double value) except + nogil: + cdef int append_double(self, double value) except -1 nogil: self._buffer.write(&value, sizeof(double)) return sizeof(double) - cdef int append_string(self, const char *ptr, size_t value_len) except + nogil: + cdef int append_string(self, const char *ptr, size_t value_len) except -1 nogil: cdef int size = set_varint32(value_len, self._buffer[0]) self._buffer.write(ptr, value_len) return size + value_len diff --git a/odps/tunnel/pb/errors.py b/odps/tunnel/pb/errors.py index 06825b41..5cee4c3f 100644 --- a/odps/tunnel/pb/errors.py +++ b/odps/tunnel/pb/errors.py @@ -31,4 +31,3 @@ class DecodeError(Error): class EncodeError(Error): pass - diff --git a/odps/tunnel/pb/input_stream.py b/odps/tunnel/pb/input_stream.py index 6600580d..5755e568 100644 --- a/odps/tunnel/pb/input_stream.py +++ b/odps/tunnel/pb/input_stream.py @@ -21,8 +21,7 @@ import struct -from . import errors -from . import wire_format +from . import errors, wire_format class InputStream(object): @@ -48,10 +47,12 @@ def read_string(self, size): as a string. """ if size < 0: - raise errors.DecodeError('Negative size %d' % size) + raise errors.DecodeError("Negative size %d" % size) s = self._input.read(size) if len(s) != size: - raise errors.DecodeError('String claims to have %d bytes, but read %d' % (size, len(s))) + raise errors.DecodeError( + "String claims to have %d bytes, but read %d" % (size, len(s)) + ) self._pos += len(s) # Only advance by the number of bytes actually read. return s @@ -60,8 +61,9 @@ def read_little_endian32(self): encoded, unsiged 32-bit integer, and returns that integer. """ try: - i = struct.unpack(wire_format.FORMAT_UINT32_LITTLE_ENDIAN, - self._input.read(4)) + i = struct.unpack( + wire_format.FORMAT_UINT32_LITTLE_ENDIAN, self._input.read(4) + ) self._pos += 4 return i[0] # unpack() result is a 1-element tuple. 
except struct.error as e: @@ -72,8 +74,9 @@ def read_little_endian64(self): encoded, unsiged 64-bit integer, and returns that integer. """ try: - i = struct.unpack(wire_format.FORMAT_UINT64_LITTLE_ENDIAN, - self._input.read(8)) + i = struct.unpack( + wire_format.FORMAT_UINT64_LITTLE_ENDIAN, self._input.read(8) + ) self._pos += 8 return i[0] # unpack() result is a 1-element tuple. except struct.error as e: @@ -85,7 +88,7 @@ def read_varint32(self): """ i = self.read_varint64() if not wire_format.INT32_MIN <= i <= wire_format.INT32_MAX: - raise errors.DecodeError('Value out of range for int32: %d' % i) + raise errors.DecodeError("Value out of range for int32: %d" % i) return int(i) def read_var_uint32(self): @@ -94,7 +97,7 @@ def read_var_uint32(self): """ i = self.read_var_uint64() if i > wire_format.UINT32_MAX: - raise errors.DecodeError('Value out of range for uint32: %d' % i) + raise errors.DecodeError("Value out of range for uint32: %d" % i) return i def read_varint64(self): @@ -103,7 +106,7 @@ def read_varint64(self): """ i = self.read_var_uint64() if i > wire_format.INT64_MAX: - i -= (1 << 64) + i -= 1 << 64 return i def read_var_uint64(self): @@ -112,7 +115,7 @@ def read_var_uint64(self): """ i = self._read_varint_helper() if not 0 <= i <= wire_format.UINT64_MAX: - raise errors.DecodeError('Value out of range for uint64: %d' % i) + raise errors.DecodeError("Value out of range for uint64: %d" % i) return i def _read_varint_helper(self): @@ -127,13 +130,13 @@ def _read_varint_helper(self): shift = 0 while 1: if shift >= 64: - raise errors.DecodeError('Too many bytes when decoding varint.') + raise errors.DecodeError("Too many bytes when decoding varint.") try: b = ord(self._input.read(1)) except IndexError: - raise errors.DecodeError('Truncated varint.') + raise errors.DecodeError("Truncated varint.") self._pos += 1 - result |= ((b & 0x7f) << shift) + result |= (b & 0x7F) << shift shift += 7 if not (b & 0x80): return result diff --git a/odps/tunnel/pb/output_stream.py b/odps/tunnel/pb/output_stream.py index e34c0f34..1d1aacec 100644 --- a/odps/tunnel/pb/output_stream.py +++ b/odps/tunnel/pb/output_stream.py @@ -19,25 +19,27 @@ Modified by onesuperclark@gmail.com(onesuper). """ -import sys import array import struct +import sys -from . import errors -from . import wire_format +from . 
import errors, wire_format class OutputStream(object): """Contains all logic for writing bits, and ToString() to get the result.""" def __init__(self): - self._buffer = array.array('B') + self._buffer = array.array("B") if sys.version_info < (3, 3): + def append_raw_bytes(self, raw_bytes): """Appends raw_bytes to our internal buffer.""" self._buffer.fromstring(raw_bytes) + else: + def append_raw_bytes(self, raw_bytes): """Appends raw_bytes to our internal buffer.""" self._buffer.frombytes(raw_bytes) @@ -48,9 +50,11 @@ def append_little_endian32(self, unsigned_value): """ if not 0 <= unsigned_value <= wire_format.UINT32_MAX: raise errors.EncodeError( - 'Unsigned 32-bit out of range: %d' % unsigned_value) - self.append_raw_bytes(struct.pack( - wire_format.FORMAT_UINT32_LITTLE_ENDIAN, unsigned_value)) + "Unsigned 32-bit out of range: %d" % unsigned_value + ) + self.append_raw_bytes( + struct.pack(wire_format.FORMAT_UINT32_LITTLE_ENDIAN, unsigned_value) + ) def append_little_endian64(self, unsigned_value): """Appends an unsigned 64-bit integer to the internal buffer, @@ -58,9 +62,11 @@ def append_little_endian64(self, unsigned_value): """ if not 0 <= unsigned_value <= wire_format.UINT64_MAX: raise errors.EncodeError( - 'Unsigned 64-bit out of range: %d' % unsigned_value) - self.append_raw_bytes(struct.pack( - wire_format.FORMAT_UINT64_LITTLE_ENDIAN, unsigned_value)) + "Unsigned 64-bit out of range: %d" % unsigned_value + ) + self.append_raw_bytes( + struct.pack(wire_format.FORMAT_UINT64_LITTLE_ENDIAN, unsigned_value) + ) def append_varint32(self, value): """Appends a signed 32-bit integer to the internal buffer, @@ -68,7 +74,7 @@ def append_varint32(self, value): always require 10 bytes of space.) """ if not wire_format.INT32_MIN <= value <= wire_format.INT32_MAX: - raise errors.EncodeError('Value out of range: %d' % value) + raise errors.EncodeError("Value out of range: %d" % value) self.append_varint64(value) def append_var_uint32(self, value): @@ -76,7 +82,7 @@ def append_var_uint32(self, value): encoded as a varint. """ if not 0 <= value <= wire_format.UINT32_MAX: - raise errors.EncodeError('Value out of range: %d' % value) + raise errors.EncodeError("Value out of range: %d" % value) self.append_var_uint64(value) def append_varint64(self, value): @@ -84,9 +90,9 @@ def append_varint64(self, value): encoded as a varint. """ if not wire_format.INT64_MIN <= value <= wire_format.INT64_MAX: - raise errors.EncodeError('Value out of range: %d' % value) + raise errors.EncodeError("Value out of range: %d" % value) if value < 0: - value += (1 << 64) + value += 1 << 64 self.append_var_uint64(value) def append_var_uint64(self, unsigned_value): @@ -94,9 +100,9 @@ def append_var_uint64(self, unsigned_value): encoded as a varint. 
""" if not 0 <= unsigned_value <= wire_format.UINT64_MAX: - raise errors.EncodeError('Value out of range: %d' % unsigned_value) + raise errors.EncodeError("Value out of range: %d" % unsigned_value) while True: - bits = unsigned_value & 0x7f + bits = unsigned_value & 0x7F unsigned_value >>= 7 if unsigned_value: bits |= 0x80 @@ -105,10 +111,13 @@ def append_var_uint64(self, unsigned_value): break if sys.version_info < (3, 3): + def tostring(self): """Returns a string containing the bytes in our internal buffer.""" return self._buffer.tostring() + else: + def tostring(self): """Returns a string containing the bytes in our internal buffer.""" return self._buffer.tobytes() diff --git a/odps/tunnel/pb/util_c.pxi b/odps/tunnel/pb/util_c.pxi index 28cc3753..494b2b92 100644 --- a/odps/tunnel/pb/util_c.pxi +++ b/odps/tunnel/pb/util_c.pxi @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -110,7 +110,7 @@ cdef int64_t get_signed_varint64(char** input, char* end, size_t* pos) nogil: return ((value >> 1) ^ (-(value & 1))) # zigzag decoding -cdef int set_varint32(int32_t varint, stringstream &buf) except + nogil: +cdef int set_varint32(int32_t varint, stringstream &buf) except -1 nogil: """ Serialize an integer into a protobuf varint; return the number of bytes serialized. @@ -131,7 +131,7 @@ cdef int set_varint32(int32_t varint, stringstream &buf) except + nogil: return idx + 1 -cdef int set_varint64(int64_t varint, stringstream &buf) except + nogil: +cdef int set_varint64(int64_t varint, stringstream &buf) except -1 nogil: """ Serialize an integer into a protobuf varint; return the number of bytes serialized. @@ -152,7 +152,7 @@ cdef int set_varint64(int64_t varint, stringstream &buf) except + nogil: return idx + 1 -cdef int set_signed_varint32(int32_t varint, stringstream &buf) except + nogil: +cdef int set_signed_varint32(int32_t varint, stringstream &buf) except -1 nogil: """ Serialize an integer into a signed protobuf varint; return the number of bytes serialized. @@ -173,7 +173,7 @@ cdef int set_signed_varint32(int32_t varint, stringstream &buf) except + nogil: return idx + 1 -cdef int set_signed_varint64(int64_t varint, stringstream &buf) except + nogil: +cdef int set_signed_varint64(int64_t varint, stringstream &buf) except -1 nogil: """ Serialize an integer into a signed protobuf varint; return the number of bytes serialized. diff --git a/odps/tunnel/pb/wire_format.py b/odps/tunnel/pb/wire_format.py index 85dcde9d..7786b11d 100644 --- a/odps/tunnel/pb/wire_format.py +++ b/odps/tunnel/pb/wire_format.py @@ -48,8 +48,8 @@ UINT64_MAX = (1 << 64) - 1 # "struct" format strings that will encode/decode the specified formats. -FORMAT_UINT32_LITTLE_ENDIAN = ' UINT64_MAX: - raise errors.EncodeError('Value out of range: %d' % uint64) + raise errors.EncodeError("Value out of range: %d" % uint64) bytes = 1 - while uint64 > 0x7f: + while uint64 > 0x7F: bytes += 1 uint64 >>= 7 return bytes diff --git a/odps/tunnel/pdio/block_decoder_c.pxd b/odps/tunnel/pdio/block_decoder_c.pxd deleted file mode 100644 index 0c28b925..00000000 --- a/odps/tunnel/pdio/block_decoder_c.pxd +++ /dev/null @@ -1,54 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from libc.stdint cimport * -from libc.string cimport * -from libcpp.string cimport string - - -cdef struct FieldParam: - int field_number - int wire_type - - -cdef class Decoder: - cdef int _pos - cdef int _buf_len - cdef int _last_error - cdef char *_buf_ptr - - cdef init(self, char *buf_ptr, int buf_len) - - cdef int get_last_error(self) nogil - cdef void set_last_error(self, int errno) nogil - - cdef int position(self) nogil - cdef void add_offset(self, int n) nogil - cdef int32_t read_field_number(self) nogil - cdef FieldParam read_field_number_and_wire_type(self) nogil - cdef int32_t read_sint32(self) nogil - cdef uint32_t read_uint32(self) nogil - cdef int64_t read_sint64(self) nogil - cdef uint64_t read_uint64(self) nogil - cdef bint read_bool(self) nogil - cdef double read_double(self) nogil - cdef float read_float(self) nogil - cdef string read_string(self) nogil - - cdef int _read_input_byte(self) nogil - cdef int32_t _get_varint32(self) nogil - cdef int64_t _get_varint64(self) nogil - cdef int32_t _get_signed_varint32(self) nogil - cdef int64_t _get_signed_varint64(self) nogil diff --git a/odps/tunnel/pdio/block_decoder_c.pyx b/odps/tunnel/pdio/block_decoder_c.pyx deleted file mode 100644 index a12f88ea..00000000 --- a/odps/tunnel/pdio/block_decoder_c.pyx +++ /dev/null @@ -1,207 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from libc.stdint cimport * -from libc.string cimport * -from libcpp.string cimport string - -from . 
import errno -from ..pb.wire_format import TAG_TYPE_BITS as PY_TAG_TYPE_BITS, _TAG_TYPE_MASK as _PY_TAG_TYPE_MASK - -cdef: - int TAG_TYPE_BITS = PY_TAG_TYPE_BITS - int _TAG_TYPE_MASK = _PY_TAG_TYPE_MASK - - int BD_SUCCESS = errno.BD_SUCCESS - int BD_BUFFER_EXHAUSTED = errno.BD_BUFFER_EXHAUSTED - int BD_CHECKSUM_INVALID = errno.BD_CHECKSUM_INVALID - int BD_COUNT_NOT_MATCH = errno.BD_COUNT_NOT_MATCH - int BD_INVALID_STREAM_DATA = errno.BD_INVALID_STREAM_DATA - int BD_INVALID_PB_TAG = errno.BD_INVALID_PB_TAG - - -cdef class Decoder: - cdef init(self, char *buf_ptr, int buf_len): - self._buf_ptr = buf_ptr - self._buf_len = buf_len - self._last_error = BD_SUCCESS - self._pos = 0 - - cdef int get_last_error(self) nogil: - return self._last_error - - cdef void set_last_error(self, int errno) nogil: - self._last_error = errno - - cdef int position(self) nogil: - return self._pos - - cdef void add_offset(self, int n) nogil: - self._pos += n - - cdef int32_t read_field_number(self) nogil: - cdef int32_t tag_and_type - tag_and_type = self.read_uint32() - return tag_and_type >> TAG_TYPE_BITS - - cdef FieldParam read_field_number_and_wire_type(self) nogil: - cdef int32_t tag_and_type - cdef FieldParam fp - tag_and_type = self.read_uint32() - fp.field_number = tag_and_type >> TAG_TYPE_BITS - fp.wire_type = tag_and_type & _TAG_TYPE_MASK - return fp - - cdef int32_t read_sint32(self) nogil: - return self._get_signed_varint32() - - cdef uint32_t read_uint32(self) nogil: - return self._get_varint32() - - cdef int64_t read_sint64(self) nogil: - return self._get_signed_varint64() - - cdef uint64_t read_uint64(self) nogil: - return self._get_varint64() - - cdef bint read_bool(self) nogil: - return self._get_varint32() - - cdef double read_double(self) nogil: - cdef double retval - - if sizeof(double) + self._pos > self._buf_len: - self._last_error = BD_BUFFER_EXHAUSTED - return 0.0 - - memcpy(&retval, self._buf_ptr + self._pos, sizeof(double)) - self._pos += sizeof(double) - return retval - - cdef float read_float(self) nogil: - cdef float retval - - if sizeof(float) + self._pos > self._buf_len: - self._last_error = BD_BUFFER_EXHAUSTED - return 0.0 - - memcpy(&retval, self._buf_ptr + self._pos, sizeof(float)) - self._pos += sizeof(float) - return retval - - cdef string read_string(self) nogil: - cdef int size - cdef int old_pos = self._pos - - size = self.read_uint32() - if self._last_error: - return string() - if size + self._pos > self._buf_len: - self._pos = old_pos - self._last_error = BD_BUFFER_EXHAUSTED - return string() - - return string(self._buf_ptr + self._pos, size) - - cdef int _read_input_byte(self) nogil: - cdef int ret - if self._pos < self._buf_len: - self._last_error = BD_SUCCESS - ret = self._buf_ptr[self._pos] - self._pos += 1 - return ret - else: - self._last_error = BD_BUFFER_EXHAUSTED - return 0 - - cdef int32_t _get_varint32(self) nogil: - """ - Deserialize a protobuf varint read from input stream; update - offset based on number of bytes consumed. - """ - cdef int32_t value = 0 - cdef int32_t base = 1 - cdef int index = 0 - cdef int val_byte - - while True: - val_byte = self._read_input_byte() - if self._last_error != BD_SUCCESS: - return 0 - value += (val_byte & 0x7F) * base - if val_byte & 0x80: - base *= 128 - else: - return value - - cdef int64_t _get_varint64(self) nogil: - """ - Deserialize a protobuf varint read from input stream; update - offset based on number of bytes consumed. 
- """ - cdef int64_t value = 0 - cdef int64_t base = 1 - cdef int index = 0 - cdef int val_byte - - while True: - val_byte = self._read_input_byte() - if self._last_error != BD_SUCCESS: - return 0 - value += (val_byte & 0x7F) * base - if val_byte & 0x80: - base *= 128 - else: - return value - - cdef int32_t _get_signed_varint32(self) nogil: - """ - Deserialize a signed protobuf varint read from input stream; - update offset based on number of bytes consumed. - """ - cdef uint32_t value = 0 - cdef int32_t base = 1 - cdef int index = 0 - cdef int val_byte - - while True: - val_byte = self._read_input_byte() - if self._last_error != BD_SUCCESS: - return 0 - value += (val_byte & 0x7F) * base - if val_byte & 0x80: - base *= 128 - else: - return ((value >> 1) ^ (-(value & 1))) # zigzag decoding - - cdef int64_t _get_signed_varint64(self) nogil: - """ - Deserialize a signed protobuf varint read from input stream; - update offset based on number of bytes consumed. - """ - cdef uint64_t value = 0 - cdef int64_t base = 1 - cdef int index = 0 - cdef int val_byte - - while True: - val_byte = self._read_input_byte() - if self._last_error != BD_SUCCESS: - return 0 - value += (val_byte & 0x7F) * base - if val_byte & 0x80: - base *= 128 - else: - return ((value >> 1) ^ (-(value & 1))) # zigzag decoding diff --git a/odps/tunnel/pdio/block_encoder_c.pxd b/odps/tunnel/pdio/block_encoder_c.pxd deleted file mode 100644 index 446cff7e..00000000 --- a/odps/tunnel/pdio/block_encoder_c.pxd +++ /dev/null @@ -1,45 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from libc.stdint cimport * -from libc.string cimport * - - -cdef class Encoder: - cdef int _pos - cdef int _buf_len - cdef int _last_error - cdef char *_buf_ptr - - cdef init(self, char *buf_ptr, int buf_len) - - cdef int position(self) nogil - cdef int get_last_error(self) nogil - cdef void set_last_error(self, int errno) nogil - - cdef int append_tag(self, int field_num, int wire_type) nogil - cdef int append_sint32(self, int32_t value) nogil - cdef int append_uint32(self, uint32_t value) nogil - cdef int append_sint64(self, int64_t value) nogil - cdef int append_uint64(self, uint64_t value) nogil - cdef int append_bool(self, bint value) nogil - cdef int append_float(self, float value) nogil - cdef int append_double(self, double value) nogil - cdef int append_string(self, const char *ptr, int value_len) nogil - - cdef int _set_varint32(self, int32_t varint) nogil - cdef int _set_varint64(self, int64_t varint) nogil - cdef int _set_signed_varint32(self, int32_t varint) nogil - cdef int _set_signed_varint64(self, int64_t varint) nogil diff --git a/odps/tunnel/pdio/block_encoder_c.pyx b/odps/tunnel/pdio/block_encoder_c.pyx deleted file mode 100644 index c919203c..00000000 --- a/odps/tunnel/pdio/block_encoder_c.pyx +++ /dev/null @@ -1,206 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from libc.stdint cimport * -from libc.string cimport * - -from . import errno - -cdef: - int BD_SUCCESS = errno.BD_SUCCESS - int BD_BUFFER_EXHAUSTED = errno.BD_BUFFER_EXHAUSTED - int BD_CHECKSUM_INVALID = errno.BD_CHECKSUM_INVALID - int BD_COUNT_NOT_MATCH = errno.BD_COUNT_NOT_MATCH - int BD_INVALID_STREAM_DATA = errno.BD_INVALID_STREAM_DATA - int BD_INVALID_PB_TAG = errno.BD_INVALID_PB_TAG - - -cdef class Encoder: - cdef init(self, char *buf_ptr, int buf_len): - self._buf_ptr = buf_ptr - self._buf_len = buf_len - self._last_error = BD_SUCCESS - self._pos = 0 - - cdef int position(self) nogil: - return self._pos - - cdef int get_last_error(self) nogil: - return self._last_error - - cdef void set_last_error(self, int errno) nogil: - self._last_error = errno - - cdef int append_tag(self, int field_num, int wire_type) nogil: - cdef int key - key = (field_num << 3) | wire_type - cdef int size = self._set_varint64(key) - return size - - cdef int append_sint32(self, int32_t value) nogil: - return self._set_signed_varint32(value) - - cdef int append_uint32(self, uint32_t value) nogil: - return self._set_varint32(value) - - cdef int append_sint64(self, int64_t value) nogil: - return self._set_signed_varint64(value) - - cdef int append_uint64(self, uint64_t value) nogil: - return self._set_varint64(value) - - cdef int append_bool(self, bint value) nogil: - return self._set_varint32(value) - - cdef int append_float(self, float value) nogil: - if self._pos + sizeof(float) >= self._buf_len: - self._last_error = BD_BUFFER_EXHAUSTED - return -1 - memcpy(self._buf_ptr + self._pos, &value, sizeof(float)) - self._pos += sizeof(float) - return sizeof(float) - - cdef int append_double(self, double value) nogil: - if self._pos + sizeof(double) >= self._buf_len: - self._last_error = BD_BUFFER_EXHAUSTED - return -1 - memcpy(self._buf_ptr + self._pos, &value, sizeof(double)) - self._pos += sizeof(double) - return sizeof(double) - - cdef int append_string(self, const char *ptr, int value_len) nogil: - cdef int size = self._set_varint32(value_len) - if self._last_error: - return -1 - if self._pos + value_len >= self._buf_len: - self._last_error = BD_BUFFER_EXHAUSTED - return -1 - memcpy(self._buf_ptr + self._pos, ptr, value_len) - return size + value_len - - cdef int _set_varint32(self, int32_t varint) nogil: - """ - Serialize an integer into a protobuf varint; return the number of bytes - serialized. 
- """ - - # Negative numbers are always 10 bytes, so we need a uint64_t to - # facilitate encoding - cdef uint64_t enc = varint - bits = enc & 0x7f - enc >>= 7 - cdef int idx = 0 - while enc: - if self._pos + idx >= self._buf_len: - self._last_error = BD_BUFFER_EXHAUSTED - return -1 - else: - self._buf_ptr[self._pos + idx] = bits | 0x80 - bits = enc & 0x7f - enc >>= 7 - idx += 1 - - if self._pos + idx >= self._buf_len: - self._last_error = BD_BUFFER_EXHAUSTED - return -1 - self._buf_ptr[self._pos + idx] = bits - self._pos += idx + 1 - return idx + 1 - - cdef int _set_varint64(self, int64_t varint) nogil: - """ - Serialize an integer into a protobuf varint; return the number of bytes - serialized. - """ - - # Negative numbers are always 10 bytes, so we need a uint64_t to - # facilitate encoding - cdef uint64_t enc = varint - bits = enc & 0x7f - enc >>= 7 - cdef int idx = 0 - while enc: - if self._pos + idx >= self._buf_len: - self._last_error = BD_BUFFER_EXHAUSTED - return -1 - else: - self._buf_ptr[self._pos + idx] = bits | 0x80 - bits = enc & 0x7f - enc >>= 7 - idx += 1 - - if self._pos + idx >= self._buf_len: - self._last_error = BD_BUFFER_EXHAUSTED - return -1 - self._buf_ptr[self._pos + idx] = bits - self._pos += idx + 1 - return idx + 1 - - cdef int _set_signed_varint32(self, int32_t varint) nogil: - """ - Serialize an integer into a signed protobuf varint; return the number of - bytes serialized. - """ - cdef uint32_t enc - cdef int idx = 0 - - enc = (varint << 1) ^ (varint >> 31) # zigzag encoding - bits = enc & 0x7f - enc >>= 7 - while enc: - if self._pos + idx >= self._buf_len: - self._last_error = BD_BUFFER_EXHAUSTED - return -1 - else: - self._buf_ptr[self._pos + idx] = bits | 0x80 - bits = enc & 0x7f - enc >>= 7 - idx += 1 - - if self._pos + idx >= self._buf_len: - self._last_error = BD_BUFFER_EXHAUSTED - return -1 - self._buf_ptr[self._pos + idx] = bits - self._pos += idx + 1 - return idx + 1 - - - cdef int _set_signed_varint64(self, int64_t varint) nogil: - """ - Serialize an integer into a signed protobuf varint; return the number of - bytes serialized. - """ - cdef uint64_t enc - cdef int idx = 0 - - enc = (varint << 1) ^ (varint >> 63) # zigzag encoding - bits = enc & 0x7f - enc >>= 7 - while enc: - if self._pos + idx >= self._buf_len: - self._last_error = BD_BUFFER_EXHAUSTED - return -1 - else: - self._buf_ptr[self._pos + idx] = bits | 0x80 - bits = enc & 0x7f - enc >>= 7 - idx += 1 - - if self._pos + idx >= self._buf_len: - self._last_error = BD_BUFFER_EXHAUSTED - return -1 - self._buf_ptr[self._pos + idx] = bits - self._pos += idx + 1 - return idx + 1 diff --git a/odps/tunnel/pdio/pdreader_c.pxd b/odps/tunnel/pdio/pdreader_c.pxd deleted file mode 100644 index e68fc1d4..00000000 --- a/odps/tunnel/pdio/pdreader_c.pxd +++ /dev/null @@ -1,63 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from libc.stdint cimport * -from libc.string cimport * -from libcpp.vector cimport vector - -from ...src.types_c cimport SchemaSnapshot -from ..checksum_c cimport Checksum -from .util_c cimport * -from .block_decoder_c cimport Decoder - - -ctypedef void (*_NOGIL_READER)(TunnelPandasReader self, ArrayVariantPtrs &record, int row) nogil - - -cdef class TunnelPandasReader: - cdef object _schema - cdef object _columns - cdef object _stream - cdef Checksum _crc - cdef Checksum _crccrc - cdef int _n_columns - cdef int _read_limit - cdef object _reader_schema - cdef SchemaSnapshot _schema_snapshot - cdef Decoder _decoder - - cdef int _cur_cursor - cdef int _mem_cache_size - cdef int _mem_cache_bound - cdef int _row_cache_size - - cdef vector[_NOGIL_READER] _nogil_readers - cdef object _mem_cache - cdef int _row_mem_ptr - cdef uint32_t _row_checksum - cdef int _row_ptr - cdef int _use_no_gil - - cdef void _read_bool(self, ArrayVariantPtrs &aptr, int row) nogil - cdef void _read_int64(self, ArrayVariantPtrs &aptr, int ptr) nogil - cdef void _read_float(self, ArrayVariantPtrs &aptr, int idx) nogil - cdef void _read_double(self, ArrayVariantPtrs &aptr, int row) nogil - cdef int _fill_ndarrays_nogil(self, vector[ArrayVariantPtrs] &col_ptrs, int start_row, int limit) nogil - - cdef void _scan_schema(self) - cpdef refill_cache(self) - cpdef reset_positions(self, object cache, int cache_size) - - cpdef int readinto(self, object buffers, object columns=*, int limit=*) except? -1 diff --git a/odps/tunnel/pdio/pdreader_c.pyx b/odps/tunnel/pdio/pdreader_c.pyx deleted file mode 100644 index 2fee41ca..00000000 --- a/odps/tunnel/pdio/pdreader_c.pyx +++ /dev/null @@ -1,390 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from libc.stdint cimport * -from libc.string cimport * -from libcpp.vector cimport vector - -import ctypes -import numpy as np -cimport numpy as np - -from ..checksum_c cimport Checksum -from .block_decoder_c cimport Decoder -from .util_c cimport * -from . import errno -from ..wireconstants import ProtoWireConstants -from ... 
import types, options - -try: - import pandas as pd -except (ImportError, ValueError): - pd = None - -cdef: - uint32_t WIRE_TUNNEL_META_COUNT = ProtoWireConstants.TUNNEL_META_COUNT - uint32_t WIRE_TUNNEL_META_CHECKSUM = ProtoWireConstants.TUNNEL_META_CHECKSUM - uint32_t WIRE_TUNNEL_END_RECORD = ProtoWireConstants.TUNNEL_END_RECORD - - int BD_SUCCESS = errno.BD_SUCCESS - int BD_BUFFER_EXHAUSTED = errno.BD_BUFFER_EXHAUSTED - int BD_CHECKSUM_INVALID = errno.BD_CHECKSUM_INVALID - int BD_COUNT_NOT_MATCH = errno.BD_COUNT_NOT_MATCH - int BD_INVALID_STREAM_DATA = errno.BD_INVALID_STREAM_DATA - int BD_INVALID_PB_TAG = errno.BD_INVALID_PB_TAG - - -cdef class TunnelPandasReader: - def __init__(self, object schema, object input_stream, columns=None): - self._schema = schema - if columns is None: - self._columns = self._schema.columns - else: - self._columns = [self._schema[c] for c in columns] - self._reader_schema = types.OdpsSchema(columns=self._columns) - self._schema_snapshot = self._reader_schema.build_snapshot() - self._n_columns = len(self._columns) - - self._mem_cache_size = options.tunnel.pd_mem_cache_size - - self._stream = input_stream - - self._row_mem_ptr = 0 - self._row_checksum = 0 - self._cur_cursor = 0 - self._mem_cache_bound = 0 - self._mem_cache = None - self._crc = Checksum() - self._crccrc = Checksum() - - self._scan_schema() - - cdef void _scan_schema(self): - self._nogil_readers.resize(self._n_columns) - self._use_no_gil = 1 - for i in range(self._n_columns): - self._nogil_readers[i] = NULL - data_type = self._schema_snapshot._col_types[i] - if data_type == types.boolean: - self._nogil_readers[i] = self._read_bool - elif data_type == types.datetime: - self._use_no_gil = 0 - elif data_type == types.string: - self._use_no_gil = 0 - elif data_type == types.float_: - self._nogil_readers[i] = self._read_float - elif data_type == types.double: - self._nogil_readers[i] = self._read_double - elif data_type in types.integer_types: - self._nogil_readers[i] = self._read_int64 - elif data_type == types.binary: - self._use_no_gil = 0 - elif data_type == types.timestamp: - self._use_no_gil = 0 - elif data_type == types.interval_day_time: - self._use_no_gil = 0 - elif data_type == types.interval_year_month: - self._use_no_gil = 0 - elif isinstance(data_type, types.Decimal): - self._use_no_gil = 0 - elif isinstance(data_type, (types.Char, types.Varchar)): - self._use_no_gil = 0 - else: - self._use_no_gil = 0 - - @property - def row_mem_ptr(self): - return self._row_mem_ptr - - @property - def mem_cache_bound(self): - return self._mem_cache_bound - - cpdef reset_positions(self, object cache, int cache_size): - cdef char *cache_ptr - cdef uint64_t cache_ptr_int - - self._mem_cache_bound = cache_size - self._row_mem_ptr = 0 - self._row_ptr = 0 - self._crc.c_setvalue(self._row_checksum) - self._mem_cache = cache - - if isinstance(cache, ctypes.Array): - cache_ptr_int = ctypes.addressof(cache) - cache_ptr = cache_ptr_int - else: - cache_ptr = cache - self._decoder = Decoder() - self._decoder.init(cache_ptr, self._mem_cache_bound) - - cpdef refill_cache(self): - cdef bytearray new_mem_cache = bytearray(self._mem_cache_size) - cdef object new_mem_view = memoryview(new_mem_cache) - cdef object old_mem_view - cdef int left_size = 0 - cdef int read_size = 0 - - if self._mem_cache is not None: - old_mem_view = memoryview(self._mem_cache) - left_size = self._mem_cache_bound - self._row_mem_ptr - if left_size: - new_mem_view[0:left_size] = old_mem_view[self._row_mem_ptr:] - - read_size = 
self._stream.readinto(new_mem_view[left_size:]) - self.reset_positions(new_mem_cache, left_size + read_size) - return read_size - - cdef void _read_bool(self, ArrayVariantPtrs &aptr, int row) nogil: - cdef bint val - val = self._decoder.read_bool() - if self._decoder.get_last_error(): - return - aptr.v.pbool_array[row] = val - self._crc.c_update_bool(val) - - cdef void _read_int64(self, ArrayVariantPtrs &aptr, int row) nogil: - cdef int64_t val - val = self._decoder.read_sint64() - if self._decoder.get_last_error(): - return - aptr.v.pl_array[row] = val - self._crc.c_update_long(val) - - cdef void _read_float(self, ArrayVariantPtrs &aptr, int row) nogil: - cdef float val - val = self._decoder.read_float() - if self._decoder.get_last_error(): - return - aptr.v.pflt_array[row] = val - self._crc.c_update_float(val) - - cdef void _read_double(self, ArrayVariantPtrs &aptr, int row) nogil: - cdef double val - val = self._decoder.read_double() - if self._decoder.get_last_error(): - return - aptr.v.pdbl_array[row] = val - self._crc.c_update_double(val) - - cdef int _fill_ndarrays_nogil(self, vector[ArrayVariantPtrs] &col_ptrs, int start_row, int limit) nogil: - cdef int idx - cdef int field_num - cdef int idx_of_checksum - cdef int rowid = start_row - cdef int32_t checksum - - while rowid < limit: - while True: - field_num = self._decoder.read_field_number() - if self._decoder.get_last_error() == BD_BUFFER_EXHAUSTED: - return rowid - start_row - - if field_num == 0: - continue - if field_num == WIRE_TUNNEL_END_RECORD: - checksum = self._crc.c_getvalue() - if self._decoder.read_uint32() != checksum: - if self._decoder.get_last_error() == BD_BUFFER_EXHAUSTED: - return rowid - start_row - self._decoder.set_last_error(BD_CHECKSUM_INVALID) - return rowid - start_row - self._crc.c_reset() - self._crccrc.c_update_int(checksum) - break - if field_num == WIRE_TUNNEL_META_COUNT: - if self._cur_cursor != self._decoder.read_sint64(): - if self._decoder.get_last_error() == BD_BUFFER_EXHAUSTED: - return rowid - start_row - self._decoder.set_last_error(BD_COUNT_NOT_MATCH) - return rowid - start_row - idx_of_checksum = self._decoder.read_field_number() - if self._decoder.get_last_error() == BD_BUFFER_EXHAUSTED: - return rowid - start_row - if WIRE_TUNNEL_META_CHECKSUM != idx_of_checksum: - self._decoder.set_last_error(BD_INVALID_STREAM_DATA) - return rowid - start_row - if self._crccrc.c_getvalue() != self._decoder.read_uint32(): - if self._decoder.get_last_error() == BD_BUFFER_EXHAUSTED: - return rowid - start_row - self._decoder.set_last_error(BD_CHECKSUM_INVALID) - return rowid - start_row - self._row_mem_ptr = self._decoder.position() - return rowid - start_row - - if field_num > self._n_columns: - self._decoder.set_last_error(BD_INVALID_PB_TAG) - return rowid - start_row - - self._crc.c_update_int(field_num) - - idx = field_num - 1 - self._nogil_readers[idx](self, col_ptrs[idx], rowid) - if self._decoder.get_last_error() == BD_BUFFER_EXHAUSTED: - return rowid - start_row - self._row_mem_ptr = self._decoder.position() - self._row_checksum = self._crc.c_getvalue() - rowid += 1 - self._cur_cursor += 1 - - return rowid - start_row - - cpdef int readinto(self, object buffers, object columns=None, int limit=-1) except? -1: - """ - Read data into an existing buffer. The ``buffers`` variable can be a list or a dict - of numpy arrays, or a Pandas DataFrame. The argument ``columns`` determines the subset - and order of data. - - Currently only ``bigint``, ``float``, ``double`` and ``boolean`` are supported. 
- - :param buffers: data buffer to read, can be numpy array, dict or pandas DataFrame - :param columns: column names to read - :param limit: total number of records to read - :return: number of records read - """ - cdef: - int i - int filled - int filled_total = 0 - int fetch_count = 0 - - vector[ArrayVariantPtrs] col_ptrs - int64_t[:] int_mmap - float[:] flt_mmap - double[:] dbl_mmap - np.ndarray[np.npy_bool, ndim=1, cast=True] bool_array - - dict col_dict - - if not self._use_no_gil: - raise NotImplementedError('Currently complex types are not supported.') - - col_dict = dict() - if isinstance(buffers, dict): - col_dict = buffers - elif pd and isinstance(buffers, pd.DataFrame): - for col_name in buffers: - col_dict[col_name] = buffers[col_name].values - elif columns is not None: - for col_name, buf in zip(columns, buffers): - col_dict[col_name] = buf - else: - for col, buf in zip(self._columns, buffers): - col_dict[col.name] = buf - - if limit < 0: - limit = 0x7fffffff - for buf in col_dict.values(): - limit = min(limit, len(buf)) - - col_ptrs.resize(self._n_columns) - for i in range(self._n_columns): - col_name = self._columns[i].name - data_type = self._schema_snapshot._col_types[i] - if data_type == types.float_: - flt_mmap = col_dict[col_name] - col_ptrs[i].v.pflt_array = &flt_mmap[0] - elif data_type == types.double: - dbl_mmap = col_dict[col_name] - col_ptrs[i].v.pdbl_array = &dbl_mmap[0] - elif data_type in types.integer_types: - int_mmap = col_dict[col_name] - col_ptrs[i].v.pl_array = &int_mmap[0] - elif data_type == types.boolean: - bool_array = col_dict[col_name] - col_ptrs[i].v.pbool_array = bool_array.data - - while filled_total < limit: - fetch_count = self.refill_cache() - if fetch_count == 0: - break - self._decoder.set_last_error(BD_SUCCESS) - filled = self._fill_ndarrays_nogil(col_ptrs, filled_total, limit) - if self._decoder.get_last_error() != BD_SUCCESS: - if self._decoder.get_last_error() == BD_CHECKSUM_INVALID: - raise IOError('Checksum invalid') - elif self._decoder.get_last_error() == BD_INVALID_STREAM_DATA: - raise IOError('Invalid stream data') - elif self._decoder.get_last_error() == BD_COUNT_NOT_MATCH: - raise IOError('Count not match') - elif self._decoder.get_last_error() == BD_INVALID_PB_TAG: - raise IOError('Invalid protobuf tag. Perhaps the datastream ' - 'from server is crushed.') - filled_total += filled - - return filled_total - - def read(self, columns=None, limit=-1): - """ - Read data into a pandas DataFrame. If pandas is not installed, a dict will be returned instead. - The argument ``columns`` determines the subset and order of data. - - Currently only ``bigint``, ``float``, ``double`` and ``boolean`` are supported. 
- - :param columns: column names to read - :param limit: total number of records to read - :return: number of records read - """ - if not columns: - columns = self._schema.names - - buf = dict() - results = dict() - buf_len = options.tunnel.pd_row_cache_size - - for col in columns: - results[col] = [] - col_type = self._schema[col].type - if col_type == types.float_: - buf[col] = np.empty((buf_len,), dtype=np.float32) - elif col_type == types.double: - buf[col] = np.empty((buf_len,), dtype=np.float64) - elif col_type in types.integer_types: - buf[col] = np.empty((buf_len,), dtype=np.int64) - elif col_type == types.boolean: - buf[col] = np.empty((buf_len,), dtype=np.bool_) - - read_all = limit < 0 - while read_all or limit > 0: - read_cols = self.readinto(buf, columns, limit) - if read_all: - if read_cols == 0: - break - else: - limit -= read_cols - for col in buf: - results[col].append(np.copy(buf[col])) - - merged = dict() - for col in results: - if len(results[col]) > 1: - merged[col] = np.concatenate(results[col]) - else: - merged[col] = results[col][0] - - if pd: - return pd.DataFrame(merged, columns=columns) - else: - return merged - - def close(self): - if hasattr(self._schema, 'close'): - self._schema.close() - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self.close() diff --git a/odps/tunnel/pdio/pdwriter.py b/odps/tunnel/pdio/pdwriter.py deleted file mode 100644 index 1af45c71..00000000 --- a/odps/tunnel/pdio/pdwriter.py +++ /dev/null @@ -1,45 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from ..io.stream import RequestsIO, get_compress_stream -from ... import options -from ...compat import six - -try: - from .pdwriter_c import BasePandasWriter -except ImportError: - BasePandasWriter = None - - -if BasePandasWriter: - class TunnelPandasWriter(BasePandasWriter): - def __init__(self, schema, request_callback, compress_option=None): - self._req_io = RequestsIO(request_callback, chunk_size=options.chunk_size) - out = get_compress_stream(self._req_io, compress_option) - super(TunnelPandasWriter, self).__init__(schema, out) - self._req_io.start() - - def write(self, data, columns=None, limit=-1, dim_offsets=None): - if self._req_io._async_err: - ex_type, ex_value, tb = self._req_io._async_err - six.reraise(ex_type, ex_value, tb) - super(TunnelPandasWriter, self).write(data, columns=columns, limit=limit, - dim_offsets=dim_offsets) - - write.__doc__ = BasePandasWriter.write.__doc__ - - def close(self): - super(TunnelPandasWriter, self).close() - self._req_io.finish() diff --git a/odps/tunnel/pdio/pdwriter_c.pxd b/odps/tunnel/pdio/pdwriter_c.pxd deleted file mode 100644 index 555b2157..00000000 --- a/odps/tunnel/pdio/pdwriter_c.pxd +++ /dev/null @@ -1,74 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from libc.stdint cimport * -from libc.string cimport * -from libcpp.vector cimport vector - -from ...src.types_c cimport SchemaSnapshot -from ..checksum_c cimport Checksum -from .util_c cimport * -from .block_encoder_c cimport Encoder - - -ctypedef void (*_NOGIL_WRITER)(BasePandasWriter self, ArrayVariantPtrs &record, int row) nogil - - -cdef class BasePandasWriter: - cdef object _schema - cdef object _columns - cdef object _stream - cdef Checksum _crc - cdef Checksum _crccrc - cdef int _n_columns - cdef int _read_limit - cdef Encoder _encoder - - cdef object _mem_cache - cdef object _mem_cache_view - cdef int _mem_cache_size - - cdef int _count - cdef uint32_t _row_pos - - cdef vector[_NOGIL_WRITER] _nogil_writers - - cdef void _write_long_val(self, long val) nogil - - cdef void _write_long(self, ArrayVariantPtrs &aptr, int index) nogil - - cdef void _write_bool(self, ArrayVariantPtrs &aptr, int index) nogil - - cdef void _write_float(self, ArrayVariantPtrs &aptr, int index) nogil - - cdef void _write_double(self, ArrayVariantPtrs &aptr, int index) nogil - - cdef int _write_single_ndarray_nogil( - self, ArrayVariantPtrs &col_ptr, vector[int] &dims, vector[int] &col_to_dim, - long start_pos, long limit, vector[long] &dim_offsets) nogil - cdef int _write_dims_nogil( - self, vector[ArrayVariantPtrs] &col_ptrs, vector[int] &col_to_dim, - long start_row, long limit) nogil - - cpdef init_cache(self) - cpdef reset_positions(self) - cpdef write_stream(self, object data, int length) - - cpdef _write_single_array(self, object data, object columns, long limit, object dim_offsets) - cpdef _write_dims(self, object data, object columns, long limit) - cpdef write(self, object data, object columns=*, long limit=*, object dim_offsets=*) - - cpdef flush(self) - cpdef close(self) diff --git a/odps/tunnel/pdio/pdwriter_c.pyx b/odps/tunnel/pdio/pdwriter_c.pyx deleted file mode 100644 index affadce1..00000000 --- a/odps/tunnel/pdio/pdwriter_c.pyx +++ /dev/null @@ -1,446 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from libc.stdint cimport * -from libc.string cimport * -from libcpp.vector cimport vector - -import numpy as np -cimport numpy as np -try: - from scipy import sparse as sps -except ImportError: - sps = None - -from ..checksum_c cimport Checksum -from .block_encoder_c cimport Encoder -from . 
import errno -from ..pb import wire_format -from ..wireconstants import ProtoWireConstants -from ... import types, options - -try: - import pandas as pd -except (ImportError, ValueError): - pd = None - -cdef: - uint32_t WIRETYPE_VARINT = wire_format.WIRETYPE_VARINT - uint32_t WIRETYPE_FIXED32 = wire_format.WIRETYPE_FIXED32 - uint32_t WIRETYPE_FIXED64 = wire_format.WIRETYPE_FIXED64 - uint32_t WIRETYPE_LENGTH_DELIMITED = wire_format.WIRETYPE_LENGTH_DELIMITED - - uint32_t WIRE_TUNNEL_META_COUNT = ProtoWireConstants.TUNNEL_META_COUNT - uint32_t WIRE_TUNNEL_META_CHECKSUM = ProtoWireConstants.TUNNEL_META_CHECKSUM - uint32_t WIRE_TUNNEL_END_RECORD = ProtoWireConstants.TUNNEL_END_RECORD - - int BD_SUCCESS = errno.BD_SUCCESS - int BD_BUFFER_EXHAUSTED = errno.BD_BUFFER_EXHAUSTED - int BD_CHECKSUM_INVALID = errno.BD_CHECKSUM_INVALID - int BD_COUNT_NOT_MATCH = errno.BD_COUNT_NOT_MATCH - int BD_INVALID_STREAM_DATA = errno.BD_INVALID_STREAM_DATA - int BD_INVALID_PB_TAG = errno.BD_INVALID_PB_TAG - - -cdef class BasePandasWriter: - def __init__(self, object schema, object output_stream, columns=None): - self._schema = schema - if columns is None: - self._columns = self._schema.columns - else: - self._columns = [self._schema[c] for c in columns] - self._n_columns = len(self._columns) - - self._mem_cache_size = options.tunnel.pd_mem_cache_size - - self._stream = output_stream - - self._count = 0 - self._row_pos = 0 - self._crc = Checksum() - self._crccrc = Checksum() - - cdef void _write_long_val(self, long val) nogil: - self._crc.c_update_long(val) - self._encoder.append_sint32(val) - - cdef void _write_long(self, ArrayVariantPtrs &aptr, int index) nogil: - cdef long val = aptr.v.pl_array[index] - self._crc.c_update_long(val) - self._encoder.append_sint32(val) - - cdef void _write_bool(self, ArrayVariantPtrs &aptr, int index) nogil: - cdef np.npy_bool val = aptr.v.pbool_array[index] - self._crc.c_update_bool(val) - self._encoder.append_bool(val) - - cdef void _write_float(self, ArrayVariantPtrs &aptr, int index) nogil: - cdef float val = aptr.v.pflt_array[index] - self._crc.c_update_float(val) - self._encoder.append_float(val) - - cdef void _write_double(self, ArrayVariantPtrs &aptr, int index) nogil: - cdef double val = aptr.v.pdbl_array[index] - self._crc.c_update_double(val) - self._encoder.append_double(val) - - cpdef reset_positions(self): - self._row_pos = 0 - self._encoder.init(self._mem_cache, self._mem_cache_size) - - cpdef init_cache(self): - self._mem_cache = bytearray(self._mem_cache_size) - self._mem_cache_view = memoryview(self._mem_cache) - self._encoder = Encoder() - self.reset_positions() - - cpdef write_stream(self, object data, int length): - self._stream.write(self._mem_cache_view[:length]) - - cdef int _write_single_ndarray_nogil(self, ArrayVariantPtrs &col_ptr, vector[int] &dims, - vector[int] &col_to_dim, long start_pos, long limit, - vector[long] &dim_offsets) nogil: - cdef: - int i, j, dim_id - long flat_pos - long max_pos = 1 - long rest_pos - vector[int] array_pos - - array_pos.resize(dims.size()) - rest_pos = start_pos - for i in reversed(range(dims.size())): - max_pos *= dims[i] - array_pos[i] = rest_pos % dims[i] - rest_pos /= dims[i] - - if limit > 0: - max_pos = min(limit, max_pos) - - self._crc.c_setvalue(0) - for i in range(start_pos, max_pos): - self._crc.c_setvalue(0) - - for j in range(col_to_dim.size()): - if col_to_dim[j] < 0: - continue - self._crc.c_update_int(j + 1) - if col_to_dim[j] == 0: - self._encoder.append_tag(j + 1, col_ptr.wire_type) - if 
self._encoder.get_last_error() != BD_SUCCESS: - return i - start_pos - - self._nogil_writers[0](self, col_ptr, i) - if self._encoder.get_last_error() != BD_SUCCESS: - return i - start_pos - else: - dim_id = col_to_dim[j] - 1 - self._encoder.append_tag(j + 1, WIRETYPE_VARINT) - if self._encoder.get_last_error() != BD_SUCCESS: - return i - start_pos - - self._write_long_val(array_pos[dim_id] + dim_offsets[dim_id]) - if self._encoder.get_last_error() != BD_SUCCESS: - return i - start_pos - - checksum = self._crc.c_getvalue() - - self._encoder.append_tag(WIRE_TUNNEL_END_RECORD, WIRETYPE_VARINT) - if self._encoder.get_last_error() != BD_SUCCESS: - return i - start_pos - self._encoder.append_uint32(checksum) - if self._encoder.get_last_error() != BD_SUCCESS: - return i - start_pos - self._crccrc.c_update_int(checksum) - - self._row_pos = self._encoder.position() - - array_pos[array_pos.size() - 1] += 1 - for j in reversed(range(1, array_pos.size())): - if array_pos[j] >= dims[j]: - array_pos[j - 1] += 1 - array_pos[j] = 0 - else: - break - - return max_pos - start_pos - - cdef int _write_dims_nogil(self, vector[ArrayVariantPtrs] &col_ptrs, vector[int] &col_to_dim, - long start_row, long limit) nogil: - cdef: - int i - int dim_id - int row_id - - for row_id in range(start_row, limit): - self._crc.c_setvalue(0) - - for i in range(col_to_dim.size()): - if col_to_dim[i] < 0: - continue - self._crc.c_update_int(i + 1) - dim_id = col_to_dim[i] - 1 - - self._encoder.append_tag(i + 1, col_ptrs[dim_id].wire_type) - if self._encoder.get_last_error() != BD_SUCCESS: - return row_id - start_row - - self._nogil_writers[dim_id](self, col_ptrs[dim_id], row_id) - if self._encoder.get_last_error() != BD_SUCCESS: - return row_id - start_row - - checksum = self._crc.c_getvalue() - - self._encoder.append_tag(WIRE_TUNNEL_END_RECORD, WIRETYPE_VARINT) - if self._encoder.get_last_error() != BD_SUCCESS: - return row_id - start_row - self._encoder.append_uint32(checksum) - if self._encoder.get_last_error() != BD_SUCCESS: - return row_id - start_row - self._crccrc.c_update_int(checksum) - - self._row_pos = self._encoder.position() - - return limit - start_row - - cpdef _write_single_array(self, object data, object columns, long limit, object dim_offsets): - cdef: - int i - long start_pos - long count_delta - long total_size - vector[int] dims - vector[long] dim_offset_vct - vector[int] col_to_dim - - ArrayVariantPtrs col_ptr - Py_buffer buf - - dict col_ids - object array_type - object val_column - - total_size = 1 - dims.resize(len(data.shape)) - for i in range(dims.size()): - dims[i] = data.shape[i] - total_size *= dims[i] - - dim_offset_vct.resize(len(data.shape)) - if dim_offsets is None: - for i in range(dim_offset_vct.size()): - dim_offset_vct[i] = 0 - else: - for i in range(dim_offset_vct.size()): - dim_offset_vct[i] = dim_offsets[i] - - col_to_dim.resize(data.ndim + 1) - if columns is None: - if self._n_columns != col_to_dim.size(): - raise ValueError('Column number not consistent with array shape: num of ' - 'columns should be 1 + ndim') - for i in range(col_to_dim.size()): - col_to_dim[i] = i + 1 - col_to_dim[col_to_dim.size() - 1] = 0 - val_column = self._schema[-1] - else: - col_ids = dict() - for idx, col_name in enumerate(columns): - col_ids[col_name] = idx + 1 - col_ids[columns[-1]] = 0 - - for idx, col in enumerate(self._schema): - if col.name in col_ids: - i = idx - col_to_dim[i] = col_ids[col.name] - if col_to_dim[i] == 0: - val_column = col - else: - col_to_dim[i] = -1 - - self._nogil_writers.resize(1) - 
if val_column.type == types.float_: - col_ptr.wire_type = WIRETYPE_FIXED32 - data = data.astype(np.float_) if data.dtype != np.float_ else data - col_ptr.v.pflt_array = np.PyArray_DATA(data) - self._nogil_writers[0] = self._write_float - elif val_column.type == types.double: - col_ptr.wire_type = WIRETYPE_FIXED64 - data = data.astype(np.double) if data.dtype != np.double else data - col_ptr.v.pdbl_array = np.PyArray_DATA(data) - self._nogil_writers[0] = self._write_double - elif val_column.type in types.integer_types: - col_ptr.wire_type = WIRETYPE_VARINT - data = data.astype(np.int64) if data.dtype != np.int64 else data - col_ptr.v.pl_array = np.PyArray_DATA(data) - self._nogil_writers[0] = self._write_long - elif val_column.type == types.boolean: - col_ptr.wire_type = WIRETYPE_VARINT - data = data.astype(np.bool_) if data.dtype != np.bool_ else data - col_ptr.v.pbool_array = np.PyArray_DATA(data) - self._nogil_writers[0] = self._write_bool - - if limit <= 0: - limit = total_size - - start_pos = 0 - self.init_cache() - while start_pos < limit: - self.reset_positions() - count_delta = self._write_single_ndarray_nogil( - col_ptr, dims, col_to_dim, start_pos, limit, dim_offset_vct) - self._count += count_delta - start_pos += count_delta - self.write_stream(self._mem_cache, self._row_pos) - - cpdef _write_dims(self, object data, object columns, long limit): - cdef: - int i - long start_pos - long count_delta - long total_size - vector[int] dims - vector[int] col_to_dim - - vector[ArrayVariantPtrs] col_ptrs - dict col_dict - dict col_idx - - object col_data - int64_t[:] int_mmap - float[:] flt_mmap - double[:] dbl_mmap - np.ndarray[np.npy_bool, ndim=1, cast=True] bool_array - - col_dict = dict() - col_idx = dict() - - if isinstance(data, dict): - col_dict = data - elif pd and isinstance(data, pd.DataFrame): - for col_name in data: - col_dict[col_name] = data[col_name].values - elif columns is not None: - for col_name, buf in zip(columns, data): - col_dict[col_name] = buf - else: - for col, buf in zip(self._columns, data): - col_dict[col.name] = buf - - if limit < 0: - limit = 0x7fffffff - for buf in col_dict.values(): - limit = min(limit, len(buf)) - - col_ptrs.resize(self._n_columns) - col_to_dim.resize(self._n_columns) - - i = 0 - self._nogil_writers.resize(len(col_dict)) - for col_name, col_data in col_dict.items(): - data_type = self._schema[col_name].type - if data_type == types.float_: - col_ptrs[i].wire_type = WIRETYPE_FIXED32 - flt_mmap = col_data.astype(np.float_) if col_data.dtype != np.float_ else col_data - col_ptrs[i].v.pflt_array = &flt_mmap[0] - self._nogil_writers[i] = self._write_float - elif data_type == types.double: - col_ptrs[i].wire_type = WIRETYPE_FIXED64 - dbl_mmap = col_data.astype(np.double) if col_data.dtype != np.double else col_data - col_ptrs[i].v.pdbl_array = &dbl_mmap[0] - self._nogil_writers[i] = self._write_double - elif data_type in types.integer_types: - col_ptrs[i].wire_type = WIRETYPE_VARINT - int_mmap = col_data.astype(np.int64) if col_data.dtype != np.int64 else col_data - col_ptrs[i].v.pl_array = &int_mmap[0] - self._nogil_writers[i] = self._write_long - elif data_type == types.boolean: - col_ptrs[i].wire_type = WIRETYPE_VARINT - bool_array = col_data.astype(np.bool_) if col_data.dtype != np.bool_ else col_data - col_ptrs[i].v.pbool_array = bool_array.data - self._nogil_writers[i] = self._write_bool - col_idx[col_name] = i - i += 1 - - for i in range(self._n_columns): - col_name = self._columns[i].name - - if col_name not in col_idx: - col_to_dim[i] = 
-1 - continue - else: - col_to_dim[i] = col_idx[col_name] + 1 - - start_pos = 0 - self.init_cache() - while start_pos < limit: - self.reset_positions() - count_delta = self._write_dims_nogil(col_ptrs, col_to_dim, start_pos, limit) - self._count += count_delta - start_pos += count_delta - self._stream.write(self._mem_cache_view[:self._row_pos]) - - cpdef write(self, object data, object columns=None, long limit=-1, object dim_offsets=None): - """ - Write a numpy array, a pandas DataFrame or a dict of column names and columns into a table. - When writing a numpy array, the indices and value of every element is written. The indices - are written before values. The argument ``columns`` determines the subset and order of data. - - Currently only ``bigint``, ``float``, ``double`` and ``boolean`` are supported. - - :param data: data to write, can be numpy array, dict or pandas DataFrame - :param columns: column names to write - :param limit: total number of records to write - :param dim_offsets: offsets for every dimensions, only applicable for arrays - :return: number of records written - """ - if isinstance(data, np.ndarray): - return self._write_single_array(data, columns, limit, dim_offsets) - - if sps: - if isinstance(data, sps.csr_matrix): - data = data.tocoo() - if isinstance(data, sps.coo_matrix): - row = data.row - col = data.col - data_col = data.data - del data - if dim_offsets is not None: - row += dim_offsets[0] - col += dim_offsets[1] - return self._write_dims([row, col, data_col], columns, limit) - - return self._write_dims(data, columns, limit) - - cpdef close(self): - self.reset_positions() - self._encoder.append_tag(WIRE_TUNNEL_META_COUNT, WIRETYPE_VARINT) - self._encoder.append_sint64(self._count) - self._encoder.append_tag(WIRE_TUNNEL_META_CHECKSUM, WIRETYPE_VARINT) - self._encoder.append_uint32(self._crccrc.getvalue()) - self._stream.write(self._mem_cache_view[:self._encoder.position()]) - self.flush() - - cpdef flush(self): - self._stream.flush() - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - # if an error occurs inside the with block, we do not commit - if exc_val is not None: - return - self.close() diff --git a/odps/tunnel/pdio/util_c.pxd b/odps/tunnel/pdio/util_c.pxd deleted file mode 100644 index c80bfbce..00000000 --- a/odps/tunnel/pdio/util_c.pxd +++ /dev/null @@ -1,29 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
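The docstring in the deleted writer above is the only prose description of the pdio block API that goes away in this release: an ndarray is written element by element as (indices..., value), a dict or DataFrame is written column-wise, and scipy sparse input is converted to COO triples with dim_offsets added to the row and column indices. As a hedged sketch of the call pattern being removed (it assumes a configured ODPS entry point, a compiled pdio extension from an earlier release, and a table coo_table with bigint row, bigint col and double value columns; none of that is shown in the patch itself):

# Sketch of the pdio writer usage removed in 0.12.0; runs only against
# pre-0.12.0 PyODPS with the compiled pdio extension available.
import numpy as np
import scipy.sparse as sps

from odps import ODPS
from odps.tunnel import TableTunnel

o = ODPS("<access-id>", "<secret-key>", "<project>", endpoint="<endpoint>")
tunnel = TableTunnel(o)
session = tunnel.create_upload_session("coo_table")

with session.open_pandas_writer(block_id=0) as writer:
    dense = np.arange(12, dtype=np.double).reshape(3, 4)
    # sparse input is converted to COO; each element becomes a
    # (row + 100, col + 0, value) record because of dim_offsets
    writer.write(sps.coo_matrix(dense), dim_offsets=(100, 0))

session.commit([0])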
- -from libc.stdint cimport * -from libc.string cimport * - -cimport numpy as np - -ctypedef union ValueUnion: - int64_t *pl_array - double *pdbl_array - float *pflt_array - np.npy_bool *pbool_array - -ctypedef struct ArrayVariantPtrs: - uint32_t wire_type - ValueUnion v diff --git a/odps/tunnel/tabletunnel.py b/odps/tunnel/tabletunnel.py index b8122821..227203cf 100644 --- a/odps/tunnel/tabletunnel.py +++ b/odps/tunnel/tabletunnel.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -26,16 +26,16 @@ from ..lib.monotonic import monotonic from ..models import Projects, Record, TableSchema from ..types import Column -from .base import BaseTunnel, TUNNEL_VERSION +from .base import TUNNEL_VERSION, BaseTunnel from .errors import TunnelError, TunnelWriteTimeout -from .io.reader import TunnelRecordReader, TunnelArrowReader, ArrowRecordReader +from .io.reader import ArrowRecordReader, TunnelArrowReader, TunnelRecordReader from .io.stream import CompressOption, get_decompress_stream from .io.writer import ( - RecordWriter, + ArrowWriter, BufferedArrowWriter, BufferedRecordWriter, + RecordWriter, StreamRecordWriter, - ArrowWriter, Upsert, ) @@ -61,7 +61,7 @@ def wrapped(*args, **kwargs): return func(*args, **kwargs) except requests.ConnectionError as ex: ex_str = str(ex) - if 'timed out' in ex_str: + if "timed out" in ex_str: raise TunnelWriteTimeout(ex_str, request_id=request_id) else: raise @@ -73,7 +73,7 @@ def wrapped(*args, **kwargs): class BaseTableTunnelSession(serializers.JSONSerializableModel): @staticmethod - def get_common_headers(content_length=None, chunked=False): + def get_common_headers(content_length=None, chunked=False, tags=None): header = { "odps-tunnel-date-transform": TUNNEL_DATA_TRANSFORM_VERSION, "odps-tunnel-sdk-support-schema-evolution": "true", @@ -88,6 +88,11 @@ def get_common_headers(content_length=None, chunked=False): "Content-Type": "application/octet-stream", } ) + tags = tags or options.tunnel.tags + if tags: + if isinstance(tags, six.string_types): + tags = tags.split(",") + header["odps-tunnel-tags"] = ",".join(tags) return header @staticmethod @@ -95,15 +100,15 @@ def normalize_partition_spec(partition_spec): if isinstance(partition_spec, six.string_types): partition_spec = types.PartitionSpec(partition_spec) if isinstance(partition_spec, types.PartitionSpec): - partition_spec = str(partition_spec).replace("'", '') + partition_spec = str(partition_spec).replace("'", "") return partition_spec def get_common_params(self, **kwargs): params = {k: str(v) for k, v in kwargs.items()} if getattr(self, "_quota_name", None): - params['quotaName'] = self._quota_name + params["quotaName"] = self._quota_name if self._partition_spec is not None and len(self._partition_spec) > 0: - params['partition'] = self._partition_spec + params["partition"] = self._partition_spec return params def check_tunnel_response(self, resp): @@ -113,34 +118,49 @@ def check_tunnel_response(self, resp): def new_record(self, values=None): return Record( - schema=self.schema, values=values, + schema=self.schema, + values=values, max_field_size=getattr(self, "max_field_size", None), ) class TableDownloadSession(BaseTableTunnelSession): __slots__ = ( - '_client', '_table', '_partition_spec', '_compress_option', '_quota_name' + "_client", + "_table", + 
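The tags keyword introduced on get_common_headers above is threaded through every session type in the rest of this file and ends up as a single odps-tunnel-tags request header. A minimal standalone sketch of that normalization, with build_tag_header as an illustrative name and the options.tunnel.tags fallback modeled as a plain default argument:

# Accepts either a comma-separated string or an iterable of tag strings,
# mirroring the branch added to get_common_headers above.
def build_tag_header(tags=None, default_tags=None):
    headers = {}
    tags = tags or default_tags
    if tags:
        if isinstance(tags, str):  # six.string_types in the real code
            tags = tags.split(",")
        headers["odps-tunnel-tags"] = ",".join(tags)
    return headers

assert build_tag_header("etl,hourly") == {"odps-tunnel-tags": "etl,hourly"}
assert build_tag_header(["etl", "hourly"]) == {"odps-tunnel-tags": "etl,hourly"}
assert build_tag_header(None, default_tags="nightly") == {"odps-tunnel-tags": "nightly"}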
"_partition_spec", + "_compress_option", + "_quota_name", ) class Status(Enum): - Unknown = 'UNKNOWN' - Normal = 'NORMAL' - Closes = 'CLOSES' - Expired = 'EXPIRED' - Initiating = 'INITIATING' + Unknown = "UNKNOWN" + Normal = "NORMAL" + Closes = "CLOSES" + Expired = "EXPIRED" + Initiating = "INITIATING" - id = serializers.JSONNodeField('DownloadID') + id = serializers.JSONNodeField("DownloadID") status = serializers.JSONNodeField( - 'Status', parse_callback=lambda s: TableDownloadSession.Status(s.upper()) + "Status", parse_callback=lambda s: TableDownloadSession.Status(s.upper()) ) - count = serializers.JSONNodeField('RecordCount') - schema = serializers.JSONNodeReferenceField(TableSchema, 'Schema') - quota_name = serializers.JSONNodeField('QuotaName') + count = serializers.JSONNodeField("RecordCount") + schema = serializers.JSONNodeReferenceField(TableSchema, "Schema") + quota_name = serializers.JSONNodeField("QuotaName") - def __init__(self, client, table, partition_spec, download_id=None, - compress_option=None, async_mode=True, timeout=None, - quota_name=None, **kw): + def __init__( + self, + client, + table, + partition_spec, + download_id=None, + compress_option=None, + async_mode=True, + timeout=None, + quota_name=None, + tags=None, + **kw + ): super(TableDownloadSession, self).__init__() self._client = client @@ -154,10 +174,10 @@ def __init__(self, client, table, partition_spec, download_id=None, if kw: raise TypeError("Cannot accept arguments %s" % ", ".join(kw.keys())) if download_id is None: - self._init(async_mode=async_mode, timeout=timeout) + self._init(async_mode=async_mode, timeout=timeout, tags=tags) else: self.id = download_id - self.reload() + self.reload(tags=tags) self._compress_option = compress_option logger.info("Tunnel session created: %r", self) @@ -172,15 +192,17 @@ def __repr__(self): self._partition_spec, ) - def _init(self, async_mode, timeout): - params = self.get_common_params(downloads='') - headers = self.get_common_headers(content_length=0) + def _init(self, async_mode, timeout, tags=None): + params = self.get_common_params(downloads="") + headers = self.get_common_headers(content_length=0, tags=tags) if async_mode: - params['asyncmode'] = 'true' + params["asyncmode"] = "true" url = self._table.table_resource() try: - resp = self._client.post(url, {}, params=params, headers=headers, timeout=timeout) + resp = self._client.post( + url, {}, params=params, headers=headers, timeout=timeout + ) except requests.exceptions.ReadTimeout: if callable(options.tunnel_session_create_timeout_callback): options.tunnel_session_create_timeout_callback(*sys.exc_info()) @@ -196,9 +218,9 @@ def _init(self, async_mode, timeout): if self.schema is not None: self.schema.build_snapshot() - def reload(self): + def reload(self, tags=None): params = self.get_common_params(downloadid=self.id) - headers = self.get_common_headers(content_length=0) + headers = self.get_common_headers(content_length=0, tags=tags) url = self._table.table_resource() resp = self._client.get(url, params=params, headers=headers) @@ -208,8 +230,8 @@ def reload(self): if self.schema is not None: self.schema.build_snapshot() - def _open_reader( - self, start, count, compress=False, columns=None, arrow=False, reader_cls=None, **kw + def _build_input_stream( + self, start, count, compress=False, columns=None, arrow=False ): compress_option = self._compress_option or CompressOption() @@ -219,79 +241,128 @@ def _open_reader( if compress: encoding = compress_option.algorithm.get_encoding() if encoding: - 
headers['Accept-Encoding'] = encoding + headers["Accept-Encoding"] = encoding - params['rowrange'] = '(%s,%s)' % (start, count) + params["rowrange"] = "(%s,%s)" % (start, count) if columns is not None and len(columns) > 0: col_name = lambda col: col.name if isinstance(col, types.Column) else col - params['columns'] = ','.join(col_name(col) for col in columns) + params["columns"] = ",".join(col_name(col) for col in columns) if arrow: actions.append("arrow") url = self._table.table_resource() - resp = self._client.get(url, stream=True, actions=actions, params=params, headers=headers) + resp = self._client.get( + url, stream=True, actions=actions, params=params, headers=headers + ) self.check_tunnel_response(resp) - content_encoding = resp.headers.get('Content-Encoding') + content_encoding = resp.headers.get("Content-Encoding") if content_encoding is not None: - compress_algo = CompressOption.CompressAlgorithm.from_encoding(content_encoding) + compress_algo = CompressOption.CompressAlgorithm.from_encoding( + content_encoding + ) if compress_algo != compress_option.algorithm: - compress_option = self._compress_option = CompressOption(compress_algo, -1, 0) + compress_option = self._compress_option = CompressOption( + compress_algo, -1, 0 + ) compress = True else: compress = False option = compress_option if compress else None - input_stream = get_decompress_stream(resp, option) - return reader_cls(self.schema, input_stream, columns=columns, **kw) + return get_decompress_stream(resp, option) - def open_record_reader(self, start, count, compress=False, columns=None): - return self._open_reader( - start, count, compress=compress, columns=columns, - reader_cls=TunnelRecordReader, partition_spec=self._partition_spec, + def _open_reader( + self, + start, + count, + compress=False, + columns=None, + arrow=False, + reader_cls=None, + **kw + ): + pt_cols = ( + set(types.PartitionSpec(self._partition_spec).keys()) + if self._partition_spec + else set() ) + reader_cols = [c for c in columns if c not in pt_cols] if columns else columns + stream_kw = dict(compress=compress, columns=reader_cols, arrow=arrow) + + def stream_creator(cursor): + return self._build_input_stream(start + cursor, count - cursor, **stream_kw) + + return reader_cls(self.schema, stream_creator, columns=columns, **kw) - if np is not None: - @utils.survey - def open_pandas_reader(self, start, count, compress=False, columns=None): - from .pdio.pdreader_c import TunnelPandasReader - return self._open_reader(start, count, compress=compress, columns=columns, - reader_cls=TunnelPandasReader) + def open_record_reader( + self, start, count, compress=False, columns=None, append_partitions=True + ): + return self._open_reader( + start, + count, + compress=compress, + columns=columns, + append_partitions=append_partitions, + partition_spec=self._partition_spec, + reader_cls=TunnelRecordReader, + ) - def open_arrow_reader(self, start, count, compress=False, columns=None): + def open_arrow_reader( + self, start, count, compress=False, columns=None, append_partitions=False + ): return self._open_reader( - start, count, compress=compress, columns=columns, - arrow=True, reader_cls=TunnelArrowReader + start, + count, + compress=compress, + columns=columns, + arrow=True, + append_partitions=append_partitions, + partition_spec=self._partition_spec, + reader_cls=TunnelArrowReader, ) class TableUploadSession(BaseTableTunnelSession): __slots__ = ( - '_client', '_table', '_partition_spec', '_compress_option', - '_overwrite', '_quota_name', + "_client", + 
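The reworked _open_reader above no longer hands the reader a single HTTP stream; it hands it a stream_creator(cursor) callable so the reader can reopen the download at start + cursor after a connection failure, which is what the retry tests added later in this patch rely on. A self-contained sketch of that control flow, with fetch standing in for _build_input_stream and a simulated failure in place of a real network error:

requested = []

def fetch(start, count):
    # stands in for _build_input_stream(start, count): returns an iterator
    # over records, failing mid-stream on the first attempt only
    requested.append((start, count))

    def stream():
        for i in range(start, start + count):
            if start == 0 and i == 4:
                raise ConnectionError("simulated mid-stream failure")
            yield i

    return stream()

def read_all(total, stream_creator):
    result = []
    while len(result) < total:
        cursor = len(result)
        try:
            for rec in stream_creator(cursor):
                result.append(rec)
        except ConnectionError:
            continue  # reopen from the current cursor, as the real reader does
    return result

assert read_all(8, lambda cursor: fetch(cursor, 8 - cursor)) == list(range(8))
# same shape as the ranges asserted in test_download_with_retry below
assert requested == [(0, 8), (4, 4)]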
"_table", + "_partition_spec", + "_compress_option", + "_overwrite", + "_quota_name", ) class Status(Enum): - Unknown = 'UNKNOWN' - Normal = 'NORMAL' - Closing = 'CLOSING' - Closed = 'CLOSED' - Canceled = 'CANCELED' - Expired = 'EXPIRED' - Critical = 'CRITICAL' - - id = serializers.JSONNodeField('UploadID') + Unknown = "UNKNOWN" + Normal = "NORMAL" + Closing = "CLOSING" + Closed = "CLOSED" + Canceled = "CANCELED" + Expired = "EXPIRED" + Critical = "CRITICAL" + + id = serializers.JSONNodeField("UploadID") status = serializers.JSONNodeField( - 'Status', parse_callback=lambda s: TableUploadSession.Status(s.upper()) + "Status", parse_callback=lambda s: TableUploadSession.Status(s.upper()) ) - blocks = serializers.JSONNodesField('UploadedBlockList', 'BlockID') - schema = serializers.JSONNodeReferenceField(TableSchema, 'Schema') - max_field_size = serializers.JSONNodeField('MaxFieldSize') - quota_name = serializers.JSONNodeField('QuotaName') + blocks = serializers.JSONNodesField("UploadedBlockList", "BlockID") + schema = serializers.JSONNodeReferenceField(TableSchema, "Schema") + max_field_size = serializers.JSONNodeField("MaxFieldSize") + quota_name = serializers.JSONNodeField("QuotaName") - def __init__(self, client, table, partition_spec, upload_id=None, - compress_option=None, overwrite=False, quota_name=None): + def __init__( + self, + client, + table, + partition_spec, + upload_id=None, + compress_option=None, + overwrite=False, + quota_name=None, + tags=None, + ): super(TableUploadSession, self).__init__() self._client = client @@ -302,10 +373,10 @@ def __init__(self, client, table, partition_spec, upload_id=None, self._overwrite = overwrite if upload_id is None: - self._init() + self._init(tags=tags) else: self.id = upload_id - self.reload() + self.reload(tags=tags) self._compress_option = compress_option logger.info("Tunnel session created: %r", self) @@ -320,16 +391,16 @@ def __repr__(self): self._partition_spec, ) - def _create_or_reload_session(self, reload=False): - headers = self.get_common_headers(content_length=0) + def _create_or_reload_session(self, tags=None, reload=False): + headers = self.get_common_headers(content_length=0, tags=tags) params = self.get_common_params(reload=reload) if not reload and self._overwrite: params["overwrite"] = "true" if reload: - params['uploadid'] = self.id + params["uploadid"] = self.id else: - params['uploads'] = 1 + params["uploads"] = 1 def _call_tunnel(func, *args, **kw): resp = func(*args, **kw) @@ -350,18 +421,18 @@ def _call_tunnel(func, *args, **kw): if self.schema is not None: self.schema.build_snapshot() - def _init(self): - self._create_or_reload_session(reload=False) + def _init(self, tags=None): + self._create_or_reload_session(tags=tags, reload=False) - def reload(self): - self._create_or_reload_session(reload=True) + def reload(self, tags=None): + self._create_or_reload_session(tags=tags, reload=True) @classmethod def _iter_data_in_batches(cls, data): pos = 0 chunk_size = options.chunk_size while pos < len(data): - yield data[pos: pos + chunk_size] + yield data[pos : pos + chunk_size] pos += chunk_size def _open_writer( @@ -382,33 +453,39 @@ def _open_writer( if ( writer_cls is not None and issubclass(writer_cls, ArrowWriter) - and compress_option.algorithm == CompressOption.CompressAlgorithm.ODPS_LZ4 + and compress_option.algorithm + == CompressOption.CompressAlgorithm.ODPS_LZ4 ): - compress_option.algorithm = CompressOption.CompressAlgorithm.ODPS_ARROW_LZ4 + compress_option.algorithm = ( + 
CompressOption.CompressAlgorithm.ODPS_ARROW_LZ4 + ) encoding = compress_option.algorithm.get_encoding() if encoding: - headers['Content-Encoding'] = encoding + headers["Content-Encoding"] = encoding url = self._table.table_resource() option = compress_option if compress else None if block_id is None: + @_wrap_upload_call(self.id) def upload_block(blockid, data): - params['blockid'] = blockid + params["blockid"] = blockid def upload_func(): if isinstance(data, (bytes, bytearray)): to_upload = self._iter_data_in_batches(data) else: to_upload = data - return self._client.put(url, data=to_upload, params=params, headers=headers) + return self._client.put( + url, data=to_upload, params=params, headers=headers + ) return utils.call_with_retry(upload_func) if writer_cls is ArrowWriter: writer_cls = BufferedArrowWriter - params['arrow'] = '' + params["arrow"] = "" else: writer_cls = BufferedRecordWriter @@ -421,74 +498,78 @@ def upload_func(): block_id_gen=block_id_gen, ) else: - params['blockid'] = block_id + params["blockid"] = block_id @_wrap_upload_call(self.id) def upload(data): return self._client.put(url, data=data, params=params, headers=headers) if writer_cls is ArrowWriter: - params['arrow'] = '' + params["arrow"] = "" writer = writer_cls(self.schema, upload, compress_option=option) return writer def open_record_writer( - self, block_id=None, compress=False, buffer_size=None, initial_block_id=None, - block_id_gen=None + self, + block_id=None, + compress=False, + buffer_size=None, + initial_block_id=None, + block_id_gen=None, ): return self._open_writer( - block_id=block_id, compress=compress, buffer_size=buffer_size, - initial_block_id=initial_block_id, block_id_gen=block_id_gen, - writer_cls=RecordWriter + block_id=block_id, + compress=compress, + buffer_size=buffer_size, + initial_block_id=initial_block_id, + block_id_gen=block_id_gen, + writer_cls=RecordWriter, ) def open_arrow_writer( - self, block_id=None, compress=False, buffer_size=None, initial_block_id=None, - block_id_gen=None + self, + block_id=None, + compress=False, + buffer_size=None, + initial_block_id=None, + block_id_gen=None, ): return self._open_writer( - block_id=block_id, compress=compress, buffer_size=buffer_size, - initial_block_id=initial_block_id, block_id_gen=block_id_gen, - writer_cls=ArrowWriter + block_id=block_id, + compress=compress, + buffer_size=buffer_size, + initial_block_id=initial_block_id, + block_id_gen=block_id_gen, + writer_cls=ArrowWriter, ) - if np is not None: - @utils.survey - def open_pandas_writer(self, block_id=None, compress=False, buffer_size=None): - from .pdio import TunnelPandasWriter - return self._open_writer(block_id=block_id, compress=compress, buffer_size=buffer_size, - writer_cls=TunnelPandasWriter) - def get_block_list(self): self.reload() return self.blocks def commit(self, blocks): if blocks is None: - raise ValueError('Invalid parameter: blocks.') + raise ValueError("Invalid parameter: blocks.") if isinstance(blocks, six.integer_types): - blocks = [blocks, ] + blocks = [blocks] server_block_map = dict( - [ - (int(block_id), True) for block_id in self.get_block_list() - ] + [(int(block_id), True) for block_id in self.get_block_list()] ) client_block_map = dict([(int(block_id), True) for block_id in blocks]) if len(server_block_map) != len(client_block_map): raise TunnelError( - 'Blocks not match, server: %s, tunnelServerClient: %s. ' - 'Make sure all block writers closed or with-blocks exited.' 
% ( - len(server_block_map), len(client_block_map) - ) + "Blocks not match, server: %s, tunnelServerClient: %s. " + "Make sure all block writers closed or with-blocks exited." + % (len(server_block_map), len(client_block_map)) ) for block_id in blocks: if block_id not in server_block_map: raise TunnelError( - 'Block not exists on server, block id is %s' % (block_id,) + "Block not exists on server, block id is %s" % (block_id,) ) self._complete_upload() @@ -501,10 +582,14 @@ def _complete_upload(self): resp = utils.call_with_retry( self._client.post, url, - '', + "", params=params, headers=headers, - exc_type=(requests.Timeout, requests.ConnectionError, errors.InternalServerError), + exc_type=( + requests.Timeout, + requests.ConnectionError, + errors.InternalServerError, + ), ) self.parse(resp, obj=self) @@ -530,17 +615,17 @@ def port(self): @property def server(self): - return str(self._ip) + ':' + str(self._port) + return str(self._ip) + ":" + str(self._port) def set_server(self, server, check_empty=False): - if len(server.split(':')) != 2: - raise TunnelError('Invalid slot format: {}'.format(server)) + if len(server.split(":")) != 2: + raise TunnelError("Invalid slot format: {}".format(server)) - ip, port = server.split(':') + ip, port = server.split(":") if check_empty: if (not ip) or (not port): - raise TunnelError('Empty server ip or port') + raise TunnelError("Empty server ip or port") if ip: self._ip = ip if port: @@ -549,7 +634,13 @@ def set_server(self, server, check_empty=False): class TableStreamUploadSession(BaseTableTunnelSession): __slots__ = ( - '_client', '_table', '_partition_spec', '_compress_option', '_quota_name' + "_client", + "_table", + "_partition_spec", + "_compress_option", + "_quota_name", + "_create_partition", + "_zorder_columns", ) class Slots(object): @@ -558,7 +649,7 @@ def __init__(self, slot_elements): self._cur_index = -1 for value in slot_elements: if len(value) != 2: - raise TunnelError('Invalid slot routes') + raise TunnelError("Invalid slot routes") self._slots.append(Slot(value[0], value[1])) if len(self._slots) > 0: @@ -581,12 +672,13 @@ def __iter__(self): self._cur_index = 0 yield self._slots[self._cur_index] - schema = serializers.JSONNodeReferenceField(TableSchema, 'schema') - id = serializers.JSONNodeField('session_name') - status = serializers.JSONNodeField('status') + schema = serializers.JSONNodeReferenceField(TableSchema, "schema") + id = serializers.JSONNodeField("session_name") + status = serializers.JSONNodeField("status") slots = serializers.JSONNodeField( - 'slots', parse_callback=lambda val: TableStreamUploadSession.Slots(val)) - quota_name = serializers.JSONNodeField('QuotaName') + "slots", parse_callback=lambda val: TableStreamUploadSession.Slots(val) + ) + quota_name = serializers.JSONNodeField("QuotaName") schema_version = serializers.JSONNodeField("schema_version") def __init__( @@ -596,7 +688,10 @@ def __init__( partition_spec, compress_option=None, quota_name=None, + create_partition=False, + zorder_columns=None, schema_version=None, + tags=None, ): super(TableStreamUploadSession, self).__init__() @@ -605,9 +700,11 @@ def __init__( self._partition_spec = self.normalize_partition_spec(partition_spec) self._quota_name = quota_name + self._create_partition = create_partition + self._zorder_columns = zorder_columns self.schema_version = schema_version - self._init() + self._init(tags=tags) self._compress_option = compress_option logger.info("Tunnel session created: %r", self) @@ -625,12 +722,19 @@ def __repr__(self): ) ) - def 
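The writer and commit changes above keep the usual block-upload contract: one writer per block id, then commit() with the list of blocks written so it can be checked against the server-side block list, with _complete_upload retried on transient errors. A hedged usage sketch (the table sink_table with a string and a bigint column, plus the credentials, are assumptions, not part of this patch):

from odps import ODPS
from odps.tunnel import TableTunnel

o = ODPS("<access-id>", "<secret-key>", "<project>", endpoint="<endpoint>")
tunnel = TableTunnel(o)
session = tunnel.create_upload_session("sink_table")

# one block per writer; with-blocks guarantee the writers are closed,
# which is what the "Blocks not match" error above guards against
with session.open_record_writer(0) as writer:
    writer.write(session.new_record(["hello", 1]))
with session.open_record_writer(1) as writer:
    writer.write(session.new_record(["world", 2]))

session.commit([0, 1])  # raises TunnelError if server and client block lists differ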
_init(self): + def _init(self, tags=None): params = self.get_common_params() - headers = self.get_common_headers(content_length=0) + headers = self.get_common_headers(content_length=0, tags=tags) + if self._create_partition: + params["create_partition"] = "" if self.schema_version is not None: params["schema_version"] = str(self.schema_version) + if self._zorder_columns: + cols = self._zorder_columns + if not isinstance(self._zorder_columns, six.string_types): + cols = ",".join(self._zorder_columns) + params["zorder_columns"] = cols url = self._get_resource() resp = self._client.post(url, {}, params=params, headers=headers) @@ -642,11 +746,11 @@ def _init(self): self.schema.build_snapshot() def _get_resource(self): - return self._table.table_resource() + '/streams' + return self._table.table_resource() + "/streams" - def reload(self): + def reload(self, tags=None): params = self.get_common_params(uploadid=self.id) - headers = self.get_common_headers(content_length=0) + headers = self.get_common_headers(content_length=0, tags=tags) url = self._get_resource() resp = self._client.get(url, params=params, headers=headers) @@ -680,15 +784,17 @@ def _open_writer(self, compress=False): slot = next(iter(self.slots)) headers = self.get_common_headers(chunked=True) - headers.update({ - "odps-tunnel-slot-num": str(len(self.slots)), - "odps-tunnel-routed-server": slot.server, - }) + headers.update( + { + "odps-tunnel-slot-num": str(len(self.slots)), + "odps-tunnel-routed-server": slot.server, + } + ) if compress: encoding = compress_option.algorithm.get_encoding() if encoding: - headers['Content-Encoding'] = encoding + headers["Content-Encoding"] = encoding params = self.get_common_params(uploadid=self.id, slotid=slot.slot) url = self._get_resource() @@ -710,8 +816,13 @@ def open_record_writer(self, compress=False): class TableUpsertSession(BaseTableTunnelSession): __slots__ = ( - '_client', '_table', '_partition_spec', '_compress_option', - '_slot_num', '_commit_timeout', '_quota_name' + "_client", + "_table", + "_partition_spec", + "_compress_option", + "_slot_num", + "_commit_timeout", + "_quota_name", ) UPSERT_EXTRA_COL_NUM = 5 @@ -734,9 +845,9 @@ def __init__(self, slot_elements): self._slots = [] self._buckets = {} for value in slot_elements: - slot = Slot(value['slot_id'], value['worker_addr']) + slot = Slot(value["slot_id"], value["worker_addr"]) self._slots.append(slot) - self._buckets.update({idx: slot for idx in value['buckets']}) + self._buckets.update({idx: slot for idx in value["buckets"]}) for idx in self._buckets.keys(): if idx > len(self._buckets): @@ -749,20 +860,28 @@ def buckets(self): def __len__(self): return len(self._slots) - schema = serializers.JSONNodeReferenceField(TableSchema, 'schema') - id = serializers.JSONNodeField('id') + schema = serializers.JSONNodeReferenceField(TableSchema, "schema") + id = serializers.JSONNodeField("id") status = serializers.JSONNodeField( - 'status', parse_callback=lambda s: TableUpsertSession.Status(s.upper()) + "status", parse_callback=lambda s: TableUpsertSession.Status(s.upper()) ) slots = serializers.JSONNodeField( - 'slots', parse_callback=lambda val: TableUpsertSession.Slots(val)) - quota_name = serializers.JSONNodeField('quota_name') - hash_keys = serializers.JSONNodeField('hash_key') - hasher = serializers.JSONNodeField('hasher') + "slots", parse_callback=lambda val: TableUpsertSession.Slots(val) + ) + quota_name = serializers.JSONNodeField("quota_name") + hash_keys = serializers.JSONNodeField("hash_key") + hasher = 
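TableStreamUploadSession._init above turns the new create_partition and zorder_columns options into query parameters next to schema_version. A standalone sketch of that shaping, with build_stream_params as an illustrative name only:

def build_stream_params(create_partition=False, zorder_columns=None, schema_version=None):
    params = {}
    if create_partition:
        params["create_partition"] = ""  # presence-only flag
    if schema_version is not None:
        params["schema_version"] = str(schema_version)
    if zorder_columns:
        cols = zorder_columns
        if not isinstance(zorder_columns, str):  # six.string_types in the real code
            cols = ",".join(zorder_columns)
        params["zorder_columns"] = cols
    return params

assert build_stream_params(True, ["uid", "ts"], 2) == {
    "create_partition": "",
    "schema_version": "2",
    "zorder_columns": "uid,ts",
}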
serializers.JSONNodeField("hasher") def __init__( - self, client, table, partition_spec, compress_option=None, slot_num=1, - commit_timeout=DEFAULT_UPSERT_COMMIT_TIMEOUT, quota_name=None + self, + client, + table, + partition_spec, + compress_option=None, + slot_num=1, + commit_timeout=DEFAULT_UPSERT_COMMIT_TIMEOUT, + quota_name=None, + tags=None, ): super(TableUpsertSession, self).__init__() @@ -774,7 +893,7 @@ def __init__( self._slot_num = slot_num self._commit_timeout = commit_timeout - self._init() + self._init(tags=tags) self._compress_option = compress_option logger.info("Upsert session created: %r", self) @@ -798,7 +917,7 @@ def buckets(self): return self.slots.buckets def _get_resource(self): - return self._table.table_resource() + '/upserts' + return self._table.table_resource() + "/upserts" def _patch_schema(self): if self.schema is None: @@ -815,14 +934,14 @@ def _patch_schema(self): self.schema = self.schema.extend(patch_schema) self.schema.build_snapshot() - def _init_or_reload(self, reload=False): + def _init_or_reload(self, reload=False, tags=None): params = self.get_common_params() - headers = self.get_common_headers(content_length=0) + headers = self.get_common_headers(content_length=0, tags=tags) if not reload: - params['slotnum'] = str(self._slot_num) + params["slotnum"] = str(self._slot_num) else: - params['upsertid'] = self.id + params["upsertid"] = self.id url = self._get_resource() if not reload: @@ -836,16 +955,16 @@ def _init_or_reload(self, reload=False): e = TunnelError.parse(resp) raise e - def _init(self): - self._init_or_reload() + def _init(self, tags=None): + self._init_or_reload(tags=tags) def new_record(self, values=None): if values: values = list(values) + [None] * 5 return super(TableUpsertSession, self).new_record(values) - def reload(self, init=False): - self._init_or_reload(reload=True) + def reload(self, init=False, tags=None): + self._init_or_reload(reload=True, tags=tags) def abort(self): params = self.get_common_params(upsertid=self.id) @@ -864,7 +983,7 @@ def open_upsert_stream(self, compress=False): if compress: encoding = compress_option.algorithm.get_encoding() if encoding: - headers['Content-Encoding'] = encoding + headers["Content-Encoding"] = encoding url = self._get_resource() @@ -872,12 +991,18 @@ def open_upsert_stream(self, compress=False): def upload_block(bucket, slot, record_count, data): req_params = params.copy() req_params.update( - dict(bucketid=bucket, slotid=str(slot.slot), record_count=str(record_count)) + dict( + bucketid=bucket, + slotid=str(slot.slot), + record_count=str(record_count), + ) ) req_headers = headers.copy() req_headers["odps-tunnel-routed-server"] = slot.server req_headers["Content-Length"] = len(data) - return self._client.put(url, data=data, params=req_params, headers=req_headers) + return self._client.put( + url, data=data, params=req_params, headers=req_headers + ) return Upsert(self.schema, upload_block, self, compress_option) @@ -896,7 +1021,10 @@ def commit(self, async_=False): delay = 1 start = monotonic() - while self.status in (TableUpsertSession.Status.Committing, TableUpsertSession.Status.Normal): + while self.status in ( + TableUpsertSession.Status.Committing, + TableUpsertSession.Status.Normal, + ): try: if monotonic() - start > self._commit_timeout: raise TunnelError("Commit session timeout") @@ -915,10 +1043,19 @@ def commit(self, async_=False): class TableTunnel(BaseTunnel): def _get_tunnel_table(self, table, schema=None): + try: + if isinstance(table, six.string_types): + table = 
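The upsert session above handles slot routing, record patching (five tunnel-internal columns appended by new_record) and a commit that polls the session status until it leaves Normal/Committing or the commit timeout expires. A hedged usage sketch; the stream method names (upsert, flush) follow the existing PyODPS upsert examples rather than this diff, and the primary-key table pk_table with bigint id and string value columns is an assumption:

from odps import ODPS
from odps.tunnel import TableTunnel

o = ODPS("<access-id>", "<secret-key>", "<project>", endpoint="<endpoint>")
tunnel = TableTunnel(o)

session = tunnel.create_upsert_session("pk_table", slot_num=1)
stream = session.open_upsert_stream()

rec = session.new_record([1, "v1"])  # the 5 extra tunnel columns are appended automatically
stream.upsert(rec)
stream.flush()

session.commit()  # polls status, honoring the commit timeout shown above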
self._project.odps.get_table(table) + except: + pass + + project_name = self._project.name if not isinstance(table, six.string_types): + project_name = table.project.name or project_name schema = schema or getattr(table.get_schema(), "name", None) table = table.name - parent = Projects(client=self.tunnel_rest)[self._project.name] + + parent = Projects(client=self.tunnel_rest)[project_name] # tailor project for resource locating only parent._set_tunnel_defaults() if schema is not None: @@ -933,13 +1070,26 @@ def _build_compress_option(compress_algo=None, level=None, strategy=None): compress_algo=compress_algo, level=level, strategy=strategy ) - def create_download_session(self, table, async_mode=True, partition_spec=None, - download_id=None, compress_option=None, - compress_algo=None, compress_level=None, - compress_strategy=None, schema=None, timeout=None, **kw): + def create_download_session( + self, + table, + async_mode=True, + partition_spec=None, + download_id=None, + compress_option=None, + compress_algo=None, + compress_level=None, + compress_strategy=None, + schema=None, + timeout=None, + tags=None, + **kw + ): table = self._get_tunnel_table(table, schema) compress_option = compress_option or self._build_compress_option( - compress_algo=compress_algo, level=compress_level, strategy=compress_strategy + compress_algo=compress_algo, + level=compress_level, + strategy=compress_strategy, ) if "async_" in kw: async_mode = kw.pop("async_") @@ -954,6 +1104,7 @@ def create_download_session(self, table, async_mode=True, partition_spec=None, async_mode=async_mode, timeout=timeout, quota_name=self._quota_name, + tags=tags, ) def create_upload_session( @@ -967,11 +1118,14 @@ def create_upload_session( compress_strategy=None, schema=None, overwrite=False, + tags=None, ): table = self._get_tunnel_table(table, schema) compress_option = compress_option compress_option = compress_option or self._build_compress_option( - compress_algo=compress_algo, level=compress_level, strategy=compress_strategy + compress_algo=compress_algo, + level=compress_level, + strategy=compress_strategy, ) return TableUploadSession( self.tunnel_rest, @@ -981,6 +1135,7 @@ def create_upload_session( compress_option=compress_option, overwrite=overwrite, quota_name=self._quota_name, + tags=tags, ) def create_stream_upload_session( @@ -993,10 +1148,13 @@ def create_stream_upload_session( compress_strategy=None, schema=None, schema_version=None, + tags=None, ): table = self._get_tunnel_table(table, schema) compress_option = compress_option or self._build_compress_option( - compress_algo=compress_algo, level=compress_level, strategy=compress_strategy + compress_algo=compress_algo, + level=compress_level, + strategy=compress_strategy, ) return TableStreamUploadSession( self.tunnel_rest, @@ -1005,6 +1163,7 @@ def create_stream_upload_session( compress_option=compress_option, quota_name=self._quota_name, schema_version=schema_version, + tags=tags, ) def create_upsert_session( @@ -1018,10 +1177,13 @@ def create_upsert_session( compress_level=None, compress_strategy=None, schema=None, + tags=None, ): table = self._get_tunnel_table(table, schema) compress_option = compress_option or self._build_compress_option( - compress_algo=compress_algo, level=compress_level, strategy=compress_strategy + compress_algo=compress_algo, + level=compress_level, + strategy=compress_strategy, ) return TableUpsertSession( self.tunnel_rest, @@ -1031,6 +1193,7 @@ def create_upsert_session( commit_timeout=commit_timeout, compress_option=compress_option, 
quota_name=self._quota_name, + tags=tags, ) def open_preview_reader( @@ -1047,13 +1210,16 @@ def open_preview_reader( timeout=None, make_compat=True, read_all=False, + tags=None, ): if pa is None: raise ImportError("Need pyarrow to run open_preview_reader.") tunnel_table = self._get_tunnel_table(table) compress_option = compress_option or self._build_compress_option( - compress_algo=compress_algo, level=compress_level, strategy=compress_strategy + compress_algo=compress_algo, + level=compress_level, + strategy=compress_strategy, ) params = {"limit": str(limit) if limit else "-1"} @@ -1063,13 +1229,13 @@ def open_preview_reader( ordered_col = [c.name for c in table.table_schema if c.name in col_set] params["columns"] = ",".join(ordered_col) if partition_spec is not None and len(partition_spec) > 0: - params['partition'] = partition_spec + params["partition"] = partition_spec - headers = BaseTableTunnelSession.get_common_headers(content_length=0) + headers = BaseTableTunnelSession.get_common_headers(content_length=0, tags=tags) if compress_option: encoding = compress_option.algorithm.get_encoding(legacy=False) if encoding: - headers['Accept-Encoding'] = encoding + headers["Accept-Encoding"] = encoding url = tunnel_table.table_resource(force_schema=True) + "/preview" resp = self.tunnel_rest.get( @@ -1084,8 +1250,13 @@ def open_preview_reader( # stream is empty, replace with empty stream input_stream = None + def stream_creator(pos): + # part retry not supported currently + assert pos == 0 + return input_stream + reader = TunnelArrowReader( - table.table_schema, input_stream, columns=columns, use_ipc_stream=True + table.table_schema, stream_creator, columns=columns, use_ipc_stream=True ) if not arrow: reader = ArrowRecordReader( diff --git a/odps/tunnel/tests/test_arrow_tabletunnel.py b/odps/tunnel/tests/test_arrow_tabletunnel.py index 33b37b9b..0ce1bf54 100644 --- a/odps/tunnel/tests/test_arrow_tabletunnel.py +++ b/odps/tunnel/tests/test_arrow_tabletunnel.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
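All four session factories above, and open_preview_reader, now accept tags=, and the header helper earlier falls back to options.tunnel.tags, so tags can be set globally or per call (the per-call value wins when both are given). A hedged sketch; the table names are placeholders:

from odps import ODPS, options
from odps.tunnel import TableTunnel

options.tunnel.tags = "nightly-etl"  # global default picked up by get_common_headers

o = ODPS("<access-id>", "<secret-key>", "<project>", endpoint="<endpoint>")
tunnel = TableTunnel(o)

# per-call tags replace the global default for this session only
download = tunnel.create_download_session("src_table", tags=["adhoc", "debug"])

# no tags argument: the odps-tunnel-tags header carries "nightly-etl"
upload = tunnel.create_upload_session("sink_table")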
@@ -16,7 +16,8 @@ import datetime import math -from collections import namedtuple, OrderedDict +from collections import OrderedDict, namedtuple + try: from string import letters except ImportError: @@ -24,6 +25,7 @@ import mock import pytest + try: import zoneinfo except ImportError: @@ -33,8 +35,8 @@ except ImportError: pytz = None try: - import pandas as pd import numpy as np + import pandas as pd except ImportError: pd = None np = None @@ -46,7 +48,7 @@ from ...config import options from ...models import TableSchema -from ...tests.core import tn +from ...tests.core import get_test_unique_name, tn def _get_tz_str(): @@ -68,16 +70,16 @@ def upload_data(test_table, data, compress=False, **kw): writer.write(data) writer.close() - upload_ss.commit([0, ]) + upload_ss.commit([0]) def buffered_upload_data(test_table, data, buffer_size=None, compress=False, **kw): upload_ss = tunnel.create_upload_session(test_table, **kw) writer = upload_ss.open_arrow_writer(compress=compress, buffer_size=buffer_size) pd_data = data.to_pandas() - part1 = pd_data.iloc[:len(pd_data) // 2] + part1 = pd_data.iloc[: len(pd_data) // 2] writer.write(part1) - part2 = pd_data.iloc[len(pd_data) // 2:] + part2 = pd_data.iloc[len(pd_data) // 2 :] writer.write(part2) writer.close() @@ -87,11 +89,20 @@ def buffered_upload_data(test_table, data, buffer_size=None, compress=False, **k assert len(writer.get_blocks_written()) > 1 upload_ss.commit(writer.get_blocks_written()) - def download_data(test_table, columns=None, compress=False, **kw): - count = kw.pop('count', None) + def download_data( + test_table, columns=None, compress=False, append_partitions=None, **kw + ): + count = kw.pop("count", None) download_ss = tunnel.create_download_session(test_table, **kw) count = count or download_ss.count or 1 - with download_ss.open_arrow_reader(0, count, compress=compress, columns=columns) as reader: + down_kw = ( + {"append_partitions": append_partitions} + if append_partitions is not None + else {} + ) + with download_ss.open_arrow_reader( + 0, count, compress=compress, columns=columns, **down_kw + ) as reader: pd_data = reader.to_pandas() for col_name, dtype in pd_data.dtypes.items(): if isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.datetime64): @@ -100,27 +111,26 @@ def download_data(test_table, columns=None, compress=False, **kw): def gen_data(repeat=1): data = OrderedDict() - data['id'] = ['hello \x00\x00 world', 'goodbye', 'c' * 2, 'c' * 20] * repeat - data['int_num'] = [2 ** 63 - 1, 222222, -2 ** 63 + 1, -2 ** 11 + 1] * repeat - data['float_num'] = [math.pi, math.e, -2.222, 2.222] * repeat - data['bool'] = [True, False, True, True] * repeat - data['date'] = [ + data["id"] = ["hello \x00\x00 world", "goodbye", "c" * 2, "c" * 20] * repeat + data["int_num"] = [2**63 - 1, 222222, -(2**63) + 1, -(2**11) + 1] * repeat + data["float_num"] = [math.pi, math.e, -2.222, 2.222] * repeat + data["bool"] = [True, False, True, True] * repeat + data["date"] = [ datetime.date.today() + datetime.timedelta(days=idx) for idx in range(4) ] * repeat - data['dt'] = [ - datetime.datetime.now() + datetime.timedelta(days=idx) - for idx in range(4) + data["dt"] = [ + datetime.datetime.now() + datetime.timedelta(days=idx) for idx in range(4) ] * repeat - data['dt'] = [ - dt.replace(microsecond=dt.microsecond // 1000 * 1000) for dt in data['dt'] + data["dt"] = [ + dt.replace(microsecond=dt.microsecond // 1000 * 1000) for dt in data["dt"] ] pd_data = pd.DataFrame(data) return pa.RecordBatch.from_pandas(pd_data) def create_table(table_name): - 
fields = ['id', 'int_num', 'float_num', 'bool', 'date', 'dt'] - types = ['string', 'bigint', 'double', 'boolean', 'date', 'datetime'] + fields = ["id", "int_num", "float_num", "bool", "date", "dt"] + types = ["string", "bigint", "double", "boolean", "date", "datetime"] odps.delete_table(table_name, if_exists=True) return odps.create_table( @@ -128,12 +138,12 @@ def create_table(table_name): ) def create_partitioned_table(table_name): - fields = ['id', 'int_num', 'float_num', 'bool', 'date', 'dt'] - types = ['string', 'bigint', 'double', 'boolean', 'date', 'datetime'] + fields = ["id", "int_num", "float_num", "bool", "date", "dt"] + types = ["string", "bigint", "double", "boolean", "date", "datetime"] odps.delete_table(table_name, if_exists=True) return odps.create_table( - table_name, TableSchema.from_lists(fields, types, ['ds'], ['string']) + table_name, TableSchema.from_lists(fields, types, ["ds"], ["string"]) ) def delete_table(table_name): @@ -142,15 +152,20 @@ def delete_table(table_name): nt = namedtuple( "NT", "upload_data, buffered_upload_data download_data, gen_data, " - "create_table, create_partitioned_table, delete_table" + "create_table, create_partitioned_table, delete_table", ) raw_chunk_size = options.chunk_size raw_buffer_size = options.tunnel.block_buffer_size try: options.sql.use_odps2_extension = True yield nt( - upload_data, buffered_upload_data, download_data, gen_data, - create_table, create_partitioned_table, delete_table + upload_data, + buffered_upload_data, + download_data, + gen_data, + create_table, + create_partitioned_table, + delete_table, ) finally: options.sql.use_odps2_extension = None @@ -161,15 +176,17 @@ def delete_table(table_name): def _assert_frame_equal(left, right): if isinstance(left, pa.RecordBatch): left = left.to_pandas() + if "dt" in left and not getattr(left["dt"].dtype, "tz", None): left["dt"] = left["dt"].dt.tz_localize(_get_tz_str()) if isinstance(right, pa.RecordBatch): right = right.to_pandas() + if "dt" in right and not getattr(right["dt"].dtype, "tz", None): right["dt"] = right["dt"].dt.tz_localize(_get_tz_str()) pd.testing.assert_frame_equal(left, right) def test_upload_and_download_by_raw_tunnel(odps, setup): - test_table_name = tn('pyodps_test_arrow_tunnel') + test_table_name = tn("pyodps_test_arrow_tunnel") setup.create_table(test_table_name) pd_df = setup.download_data(test_table_name) assert len(pd_df) == 0 @@ -193,7 +210,7 @@ def test_upload_and_download_by_raw_tunnel(odps, setup): ) setup.upload_data(test_table_name, new_data) - data_dict['int_num'] = data.columns[data.schema.get_field_index('id')] + data_dict["int_num"] = data.columns[data.schema.get_field_index("id")] new_data = pa.RecordBatch.from_arrays( list(data_dict.values()), names=list(data_dict.keys()) ) @@ -201,7 +218,7 @@ def test_upload_and_download_by_raw_tunnel(odps, setup): setup.upload_data(test_table_name, new_data) assert "Failed to cast" in str(err_info.value) - data_dict.pop('int_num') + data_dict.pop("int_num") new_data = pa.RecordBatch.from_arrays( list(data_dict.values()), names=list(data_dict.keys()) ) @@ -213,7 +230,7 @@ def test_upload_and_download_by_raw_tunnel(odps, setup): def test_buffered_upload_and_download_by_raw_tunnel(odps, setup): from ..tabletunnel import TableUploadSession - test_table_name = tn('pyodps_test_buffered_arrow_tunnel') + test_table_name = tn("pyodps_test_buffered_arrow_tunnel") table = setup.create_table(test_table_name) pd_df = setup.download_data(test_table_name) assert len(pd_df) == 0 @@ -240,7 +257,8 @@ def 
_gen_with_error(cls, data): yield chunk with mock.patch( - "odps.tunnel.tabletunnel.TableUploadSession._iter_data_in_batches", new=_gen_with_error + "odps.tunnel.tabletunnel.TableUploadSession._iter_data_in_batches", + new=_gen_with_error, ): setup.buffered_upload_data(test_table_name, data) assert not raises[0], "error not raised" @@ -249,21 +267,57 @@ def _gen_with_error(cls, data): _assert_frame_equal(data, pd_df) +def test_download_with_retry(odps, setup, tunnel): + from ..tabletunnel import TableDownloadSession + + test_table_name = tn("pyodps_test_buffered_arrow_tunnel_retry") + setup.create_table(test_table_name) + + data = setup.gen_data() + setup.buffered_upload_data(test_table_name, data) + setup.buffered_upload_data(test_table_name, data) + + ranges = [] + original = TableDownloadSession._build_input_stream + + def new_build_input_stream(self, start, count, *args, **kw): + ranges.append((start, count)) + assert start in (0, 4) + assert start == 0 or count == session.count - 4 + return original(self, start, count, *args, **kw) + + with mock.patch( + "odps.tunnel.tabletunnel.TableDownloadSession._build_input_stream", + new=new_build_input_stream, + ): + session = tunnel.create_download_session(test_table_name) + reader = session.open_arrow_reader(0, session.count) + + reader._inject_error(4, ValueError) + pd_df = reader.to_pandas() + + assert ranges == [(0, 8), (4, 4)] + pd_data = data.to_pandas() + _assert_frame_equal(pd.concat([pd_data, pd_data], ignore_index=True), pd_df) + + def test_download_with_specified_columns(odps, setup): - test_table_name = tn('pyodps_test_arrow_tunnel_columns') + test_table_name = tn("pyodps_test_arrow_tunnel_columns") setup.create_table(test_table_name) data = setup.gen_data() setup.upload_data(test_table_name, data) - result = setup.download_data(test_table_name, columns=['id']) - _assert_frame_equal(data.to_pandas()[['id']], result) + result = setup.download_data( + test_table_name, columns=["id"], append_partitions=True + ) + _assert_frame_equal(data.to_pandas()[["id"]], result) setup.delete_table(test_table_name) def test_partition_upload_and_download_by_raw_tunnel(odps, setup): - test_table_name = tn('pyodps_test_arrow_partition_tunnel') - test_table_partition = 'ds=test' + test_table_name = tn("pyodps_test_arrow_partition_tunnel") + test_table_partition = "ds=test" odps.delete_table(test_table_name, if_exists=True) table = setup.create_partitioned_table(test_table_name) @@ -271,13 +325,20 @@ def test_partition_upload_and_download_by_raw_tunnel(odps, setup): data = setup.gen_data() setup.upload_data(test_table_name, data, partition_spec=test_table_partition) + result = setup.download_data( + test_table_name, partition_spec=test_table_partition, append_partitions=True + ) + pd_data = data.to_pandas() + pd_data["ds"] = "test" + _assert_frame_equal(pd_data, result) + result = setup.download_data(test_table_name, partition_spec=test_table_partition) _assert_frame_equal(data, result) def test_partition_download_with_specified_columns(odps, setup): - test_table_name = tn('pyodps_test_arrow_tunnel_partition_columns') - test_table_partition = 'ds=test' + test_table_name = tn("pyodps_test_arrow_tunnel_partition_columns") + test_table_partition = "ds=test" odps.delete_table(test_table_name, if_exists=True) table = setup.create_partitioned_table(test_table_name) @@ -286,18 +347,27 @@ def test_partition_download_with_specified_columns(odps, setup): setup.upload_data(test_table_name, data, partition_spec=test_table_partition) result = setup.download_data( - 
test_table_name, partition_spec=test_table_partition, columns=['int_num'] + test_table_name, partition_spec=test_table_partition, columns=["int_num"] + ) + _assert_frame_equal(data.to_pandas()[["int_num"]], result) + + result = setup.download_data( + test_table_name, partition_spec=test_table_partition, columns=["int_num", "ds"] ) - _assert_frame_equal(data.to_pandas()[['int_num']], result) + pd_data = data.to_pandas()[["int_num"]] + pd_data["ds"] = "test" + _assert_frame_equal(pd_data, result) -@pytest.mark.parametrize("compress_algo, module", [("zlib", None), ("lz4", "lz4.frame")]) +@pytest.mark.parametrize( + "compress_algo, module", [("zlib", None), ("lz4", "lz4.frame")] +) def test_upload_and_download_with_compress(odps, setup, compress_algo, module): options.chunk_size = 16 if module: pytest.importorskip(module) - test_table_name = tn('pyodps_test_arrow_zlib_tunnel') + test_table_name = tn("pyodps_test_arrow_zlib_tunnel_" + get_test_unique_name(5)) odps.delete_table(test_table_name, if_exists=True) setup.create_table(test_table_name) @@ -310,10 +380,10 @@ def test_upload_and_download_with_compress(odps, setup, compress_algo, module): setup.delete_table(test_table_name) -@pytest.mark.skipif(pytz is None and zoneinfo is None, reason='pytz not installed') -@pytest.mark.parametrize("zone", [False, True, 'Asia/Shanghai', 'America/Los_Angeles']) +@pytest.mark.skipif(pytz is None and zoneinfo is None, reason="pytz not installed") +@pytest.mark.parametrize("zone", [False, True, "Asia/Shanghai", "America/Los_Angeles"]) def test_buffered_upload_and_download_with_timezone(odps, setup, zone): - test_table_name = tn('pyodps_test_arrow_tunnel_with_tz') + test_table_name = tn("pyodps_test_arrow_tunnel_with_tz_" + get_test_unique_name(5)) odps.delete_table(test_table_name, if_exists=True) try: options.local_timezone = zone diff --git a/odps/tunnel/tests/test_hasher.py b/odps/tunnel/tests/test_hasher.py index 88eb8881..7b38853b 100644 --- a/odps/tunnel/tests/test_hasher.py +++ b/odps/tunnel/tests/test_hasher.py @@ -14,16 +14,16 @@ import itertools import pytest + try: import pandas as pd except ImportError: pd = None from ...models import Record -from ...types import OdpsSchema, Column +from ...types import Column, OdpsSchema from .. 
import hasher as py_hasher - hasher_mods = [py_hasher] try: @@ -57,14 +57,20 @@ def _build_schema_and_record(pd): datetime.datetime(2023, 6, 11, 22, 33, 11), ] if pd is not None: - columns.extend([ - Column("col8", "timestamp"), - Column("col9", "interval_day_time"), - ]) - values.extend([ - pd.Timestamp(2022, 6, 11, 22, 33, 1, 134561, 241), - pd.Timedelta(days=128, hours=10, minutes=5, seconds=17, microseconds=11), - ]) + columns.extend( + [ + Column("col8", "timestamp"), + Column("col9", "interval_day_time"), + ] + ) + values.extend( + [ + pd.Timestamp(2022, 6, 11, 22, 33, 1, 134561, 241), + pd.Timedelta( + days=128, hours=10, minutes=5, seconds=17, microseconds=11 + ), + ] + ) schema = OdpsSchema(columns) record = Record(schema=schema, values=values) return schema, record @@ -77,21 +83,31 @@ def test_default_hasher(hasher_mod, pd): assert hasher_mod.hash_value("default", "double", 15672.56271) == 1254569207 assert hasher_mod.hash_value("default", "boolean", True) == 388737479 assert hasher_mod.hash_value("default", "string", "hello".encode()) == -1259046373 - assert hasher_mod.hash_value( - "default", "date", datetime.date(2022, 12, 5) - ) == 903574500 - assert hasher_mod.hash_value( - "default", "datetime", datetime.datetime(2022, 6, 11, 22, 33, 11) - ) == -2026178719 + assert ( + hasher_mod.hash_value("default", "date", datetime.date(2022, 12, 5)) + == 903574500 + ) + assert ( + hasher_mod.hash_value( + "default", "datetime", datetime.datetime(2022, 6, 11, 22, 33, 11) + ) + == -2026178719 + ) if pd is not None: - assert hasher_mod.hash_value( - "default", "timestamp", pd.Timestamp("2023-07-05 11:24:15.145673214") - ) == -31960127 - assert hasher_mod.hash_value( - "default", "interval_day_time", pd.Timedelta( - seconds=100002, microseconds=2000, nanoseconds=1 + assert ( + hasher_mod.hash_value( + "default", "timestamp", pd.Timestamp("2023-07-05 11:24:15.145673214") ) - ) == -1088782317 + == -31960127 + ) + assert ( + hasher_mod.hash_value( + "default", + "interval_day_time", + pd.Timedelta(seconds=100002, microseconds=2000, nanoseconds=1), + ) + == -1088782317 + ) schema, rec = _build_schema_and_record(pd) col_names = [c.name for c in schema.columns] @@ -109,21 +125,31 @@ def test_legacy_hasher(hasher_mod, pd): assert hasher_mod.hash_value("legacy", "double", 15672.56271) == 1177487321 assert hasher_mod.hash_value("legacy", "boolean", False) == -978963218 assert hasher_mod.hash_value("legacy", "string", "hello".encode()) == 99162322 - assert hasher_mod.hash_value( - "legacy", "date", datetime.date(2022, 12, 5) - ) == 1670198400 - assert hasher_mod.hash_value( - "legacy", "datetime", datetime.datetime(2022, 6, 11, 22, 33, 11) - ) == 1395582425 + assert ( + hasher_mod.hash_value("legacy", "date", datetime.date(2022, 12, 5)) + == 1670198400 + ) + assert ( + hasher_mod.hash_value( + "legacy", "datetime", datetime.datetime(2022, 6, 11, 22, 33, 11) + ) + == 1395582425 + ) if pd is not None: - assert hasher_mod.hash_value( - "legacy", "timestamp", pd.Timestamp("2023-07-05 11:24:15.145673214") - ) == -779619479 - assert hasher_mod.hash_value( - "legacy", "interval_day_time", pd.Timedelta( - seconds=100002, microseconds=2000, nanoseconds=1 + assert ( + hasher_mod.hash_value( + "legacy", "timestamp", pd.Timestamp("2023-07-05 11:24:15.145673214") + ) + == -779619479 + ) + assert ( + hasher_mod.hash_value( + "legacy", + "interval_day_time", + pd.Timedelta(seconds=100002, microseconds=2000, nanoseconds=1), ) - ) == -2145458903 + == -2145458903 + ) schema, rec = _build_schema_and_record(pd) 
col_names = [c.name for c in schema.columns] diff --git a/odps/tunnel/tests/test_instancetunnel.py b/odps/tunnel/tests/test_instancetunnel.py index bba881b1..7edff39a 100644 --- a/odps/tunnel/tests/test_instancetunnel.py +++ b/odps/tunnel/tests/test_instancetunnel.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -29,8 +29,8 @@ from ... import options from ...models import TableSchema -from ...tests.core import tn, get_code_mode, py_and_c -from .. import TableTunnel, InstanceTunnel +from ...tests.core import get_code_mode, py_and_c, tn +from .. import InstanceTunnel, TableTunnel def _reloader(): @@ -40,11 +40,17 @@ def _reloader(): cfg.tunnel = TableTunnel(cfg.odps, endpoint=cfg.odps._tunnel_endpoint) -py_and_c_deco = py_and_c([ - "odps.models.record", "odps.models", "odps.tunnel.io.reader", - "odps.tunnel.io.writer", "odps.tunnel.tabletunnel", - "odps.tunnel.instancetunnel", -], _reloader) +py_and_c_deco = py_and_c( + [ + "odps.models.record", + "odps.models", + "odps.tunnel.io.reader", + "odps.tunnel.io.writer", + "odps.tunnel.tabletunnel", + "odps.tunnel.instancetunnel", + ], + _reloader, +) @pytest.fixture @@ -69,15 +75,19 @@ def _upload_data(tunnel, test_table, records, compress=False, **kw): record[i] = it writer.write(record) writer.close() - upload_ss.commit([0, ]) + upload_ss.commit([0]) -def _download_instance_data(instance_tunnel, test_instance, compress=False, columns=None, **kw): - count = kw.pop('count', 3) +def _download_instance_data( + instance_tunnel, test_instance, compress=False, columns=None, **kw +): + count = kw.pop("count", 3) download_ss = instance_tunnel.create_download_session(test_instance, **kw) # make sure session reprs work well repr(download_ss) - with download_ss.open_record_reader(0, count, compress=compress, columns=columns) as reader: + with download_ss.open_record_reader( + 0, count, compress=compress, columns=columns + ) as reader: # test use right py or c writer assert get_code_mode() == reader._mode() @@ -92,19 +102,51 @@ def _download_instance_data(instance_tunnel, test_instance, compress=False, colu def _gen_data(): return [ - ('hello \x00\x00 world', 2**63-1, math.pi, datetime(2015, 9, 19, 2, 11, 25, 33000), - True, Decimal('3.14'), ['simple', 'easy'], OrderedDict({'s': 1})), - ('goodbye', 222222, math.e, datetime(2020, 3, 10), False, Decimal('2.555555'), - ['true', None], OrderedDict({'true': 1})), - ('c'*300, -2**63+1, -2.222, datetime(1990, 5, 25, 3, 10), True, Decimal(22222), - ['false'], OrderedDict({'false': 0})), + ( + "hello \x00\x00 world", + 2**63 - 1, + math.pi, + datetime(2015, 9, 19, 2, 11, 25, 33000), + True, + Decimal("3.14"), + ["simple", "easy"], + OrderedDict({"s": 1}), + ), + ( + "goodbye", + 222222, + math.e, + datetime(2020, 3, 10), + False, + Decimal("2.555555"), + ["true", None], + OrderedDict({"true": 1}), + ), + ( + "c" * 300, + -(2**63) + 1, + -2.222, + datetime(1990, 5, 25, 3, 10), + True, + Decimal(22222), + ["false"], + OrderedDict({"false": 0}), + ), ] def _create_table(odps, table_name): - fields = ['id', 'int_num', 'float_num', 'dt', 'bool', 'dec', 'arr', 'm'] - types = ['string', 'bigint', 'double', 'datetime', 'boolean', 'decimal', - 'array', 'map'] + fields = ["id", "int_num", "float_num", "dt", "bool", "dec", "arr", "m"] + types = [ + "string", + "bigint", + 
"double", + "datetime", + "boolean", + "decimal", + "array", + "map", + ] odps.delete_table(table_name, if_exists=True) return odps.create_table( @@ -113,13 +155,23 @@ def _create_table(odps, table_name): def _create_partitioned_table(odps, table_name): - fields = ['id', 'int_num', 'float_num', 'dt', 'bool', 'dec', 'arr', 'm'] - types = ['string', 'bigint', 'double', 'datetime', 'boolean', 'decimal', - 'array', 'map'] + fields = ["id", "int_num", "float_num", "dt", "bool", "dec", "arr", "m"] + types = [ + "string", + "bigint", + "double", + "datetime", + "boolean", + "decimal", + "array", + "map", + ] odps.delete_table(table_name, if_exists=True) return odps.create_table( - table_name, TableSchema.from_lists(fields, types, ['ds'], ['string']), lifecycle=1 + table_name, + TableSchema.from_lists(fields, types, ["ds"], ["string"]), + lifecycle=1, ) @@ -129,18 +181,55 @@ def _delete_table(odps, table_name): @py_and_c_deco def test_download_by_raw_tunnel(config, instance_tunnel): - test_table_name = tn('pyodps_test_raw_inst_tunnel') + test_table_name = tn("pyodps_test_raw_inst_tunnel") _create_table(config.odps, test_table_name) data = _gen_data() _upload_data(config.tunnel, test_table_name, data) - inst = config.odps.execute_sql('select * from %s' % test_table_name) + inst = config.odps.execute_sql("select * from %s" % test_table_name) records = _download_instance_data(instance_tunnel, inst) assert list(data) == list(records) _delete_table(config.odps, test_table_name) +@py_and_c_deco +def test_tunnel_read_with_retry(config, instance_tunnel): + from ..instancetunnel import InstanceDownloadSession + + test_table_name = tn("pyodps_test_inst_tunnel_with_retry") + _create_table(config.odps, test_table_name) + data = _gen_data() + + _upload_data(config.tunnel, test_table_name, data) + inst = config.odps.execute_sql("select * from %s" % test_table_name) + + try: + ranges = [] + original = InstanceDownloadSession._build_input_stream + + def new_build_input_stream(self, start, count, *args, **kw): + ranges.append((start, count)) + assert start in (0, 2) + assert start == 0 or count == session.count - 2 + return original(self, start, count, *args, **kw) + + with mock.patch( + "odps.tunnel.instancetunnel.InstanceDownloadSession._build_input_stream", + new=new_build_input_stream, + ): + session = instance_tunnel.create_download_session(inst) + reader = session.open_record_reader(0, session.count) + + reader._inject_error(2, ValueError) + result = [tuple(r.values) for r in reader] + + assert ranges == [(0, 3), (2, 1)] + assert data == result + finally: + _delete_table(config.odps, test_table_name) + + @py_and_c_deco @pytest.mark.parametrize("algo, module", [(None, None), ("snappy", "snappy")]) def test_upload_and_download_with_compress(config, instance_tunnel, algo, module): @@ -150,14 +239,14 @@ def test_upload_and_download_with_compress(config, instance_tunnel, algo, module pytest.importorskip(module) try: - test_table_name = tn('pyodps_test_zlib_inst_tunnel') + test_table_name = tn("pyodps_test_zlib_inst_tunnel") _create_table(config.odps, test_table_name) data = _gen_data() _upload_data( config.tunnel, test_table_name, data, compress=True, compress_algo=algo ) - inst = config.odps.execute_sql('select * from %s' % test_table_name) + inst = config.odps.execute_sql("select * from %s" % test_table_name) records = _download_instance_data( instance_tunnel, inst, compress=True, compress_algo=algo ) @@ -170,15 +259,17 @@ def test_upload_and_download_with_compress(config, instance_tunnel, algo, module 
@py_and_c_deco def test_partition_upload_and_download_by_raw_tunnel(config, instance_tunnel): - test_table_name = tn('pyodps_test_raw_partition_tunnel') - test_table_partition = 'ds=test' + test_table_name = tn("pyodps_test_raw_partition_tunnel") + test_table_partition = "ds=test" config.odps.delete_table(test_table_name, if_exists=True) table = _create_partitioned_table(config.odps, test_table_name) table.create_partition(test_table_partition) data = _gen_data() - _upload_data(config.tunnel, test_table_name, data, partition_spec=test_table_partition) + _upload_data( + config.tunnel, test_table_name, data, partition_spec=test_table_partition + ) inst = config.odps.execute_sql("select * from %s where ds='test'" % test_table_name) records = _download_instance_data(instance_tunnel, inst) assert data == [r[:-1] for r in records] diff --git a/odps/tunnel/tests/test_pb.py b/odps/tunnel/tests/test_pb.py index e28b0000..9f2b4658 100644 --- a/odps/tunnel/tests/test_pb.py +++ b/odps/tunnel/tests/test_pb.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,56 +19,65 @@ import pytest from ...utils import to_binary, to_text -from ..pb.encoder import Encoder as PyEncoder +from ..pb import wire_format from ..pb.decoder import Decoder as PyDecoder -from ..pb.wire_format import * +from ..pb.encoder import Encoder as PyEncoder try: - from ..pb.encoder_c import Encoder as CEncoder from ..pb.decoder_c import Decoder as CDecoder + from ..pb.encoder_c import Encoder as CEncoder except ImportError: CEncoder = CDecoder = None -@pytest.mark.parametrize("encoder_cls, decoder_cls", [ - pytest.param(PyEncoder, PyDecoder, id="py"), - pytest.param(CEncoder, CDecoder, id="c"), -]) +@pytest.mark.parametrize( + "encoder_cls, decoder_cls", + [ + pytest.param(PyEncoder, PyDecoder, id="py"), + pytest.param(CEncoder, CDecoder, id="c"), + ], +) def test_encode_and_decode(encoder_cls, decoder_cls): if encoder_cls is None or decoder_cls is None: - pytest.skip('No Encoder or Decoder built by cython found') + pytest.skip("No Encoder or Decoder built by cython found") encoder = encoder_cls() - encoder.append_tag(0, WIRETYPE_VARINT) - encoder.append_tag(1, WIRETYPE_VARINT) - encoder.append_sint64(-2 ** 40) - encoder.append_tag(2, WIRETYPE_LENGTH_DELIMITED) + encoder.append_tag(0, wire_format.WIRETYPE_VARINT) + encoder.append_tag(1, wire_format.WIRETYPE_VARINT) + encoder.append_sint64(-(2**40)) + encoder.append_tag(2, wire_format.WIRETYPE_LENGTH_DELIMITED) encoder.append_string(to_binary("hello")) - encoder.append_tag(3, WIRETYPE_VARINT) + encoder.append_tag(3, wire_format.WIRETYPE_VARINT) encoder.append_bool(True) - encoder.append_tag(4, WIRETYPE_FIXED64) + encoder.append_tag(4, wire_format.WIRETYPE_FIXED64) encoder.append_float(3.14) encoder.append_double(0.31415926) - encoder.append_tag(5, WIRETYPE_VARINT) - encoder.append_uint32(2 ** 30) - encoder.append_tag(6, WIRETYPE_VARINT) - encoder.append_uint64(2 ** 40) + encoder.append_tag(5, wire_format.WIRETYPE_VARINT) + encoder.append_uint32(2**30) + encoder.append_tag(6, wire_format.WIRETYPE_VARINT) + encoder.append_uint64(2**40) buffer_size = len(encoder) tube = io.BytesIO(encoder.tostring()) decoder = decoder_cls(tube) - assert (0, WIRETYPE_VARINT) == decoder.read_field_number_and_wire_type() - assert (1, WIRETYPE_VARINT) == 
decoder.read_field_number_and_wire_type() - assert -2**40 == decoder.read_sint64() - assert (2, WIRETYPE_LENGTH_DELIMITED) == decoder.read_field_number_and_wire_type() + assert (0, wire_format.WIRETYPE_VARINT) == decoder.read_field_number_and_wire_type() + assert (1, wire_format.WIRETYPE_VARINT) == decoder.read_field_number_and_wire_type() + assert -(2**40) == decoder.read_sint64() + assert ( + 2, + wire_format.WIRETYPE_LENGTH_DELIMITED, + ) == decoder.read_field_number_and_wire_type() assert to_text("hello") == to_text(decoder.read_string()) - assert (3, WIRETYPE_VARINT) == decoder.read_field_number_and_wire_type() + assert (3, wire_format.WIRETYPE_VARINT) == decoder.read_field_number_and_wire_type() assert decoder.read_bool() - assert (4, WIRETYPE_FIXED64) == decoder.read_field_number_and_wire_type() + assert ( + 4, + wire_format.WIRETYPE_FIXED64, + ) == decoder.read_field_number_and_wire_type() assert pytest.approx(3.14) == decoder.read_float() assert 0.31415926 == decoder.read_double() - assert (5, WIRETYPE_VARINT) == decoder.read_field_number_and_wire_type() + assert (5, wire_format.WIRETYPE_VARINT) == decoder.read_field_number_and_wire_type() assert 2**30 == decoder.read_uint32() - assert (6, WIRETYPE_VARINT) == decoder.read_field_number_and_wire_type() + assert (6, wire_format.WIRETYPE_VARINT) == decoder.read_field_number_and_wire_type() assert 2**40 == decoder.read_uint64() assert buffer_size == len(decoder) diff --git a/odps/tunnel/tests/test_pdio.py b/odps/tunnel/tests/test_pdio.py deleted file mode 100644 index 238c5acf..00000000 --- a/odps/tunnel/tests/test_pdio.py +++ /dev/null @@ -1,258 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random - -import pytest - -from ...tests.core import tn -from ...config import options -from .. 
import TableTunnel - -try: - from ..pdio import TunnelPandasReader, TunnelPandasWriter -except ImportError: - TunnelPandasReader, TunnelPandasWriter = None, None - -try: - import numpy as np - from numpy.testing import assert_array_equal - import pandas as pd -except ImportError: - pd = None - - -@pytest.fixture(autouse=True) -def wrap_options(): - old_pd_mem_cache_size = options.tunnel.pd_mem_cache_size - try: - yield - finally: - options.tunnel.pd_mem_cache_size = old_pd_mem_cache_size - - -@pytest.mark.skipif(not TunnelPandasReader, reason='Accelerated pandas IO not available') -def test_read_into(odps): - options.tunnel.pd_mem_cache_size = 200 - - test_table_name = tn('test_pdio_read_into') - odps.delete_table(test_table_name, if_exists=True) - table = odps.create_table(test_table_name, 'col1 bigint, col2 double, col3 boolean') - - data = [table.new_record([random.randint(0, 1048576), random.random(), random.random() > 0.5]) - for _ in range(10000)] - odps.write_table(test_table_name, data) - - tunnel = TableTunnel(odps) - download_session = tunnel.create_download_session(table.name) - reader = download_session.open_pandas_reader(0, download_session.count) - - read_buffer = [np.empty(5000, dtype=np.int64), np.empty(5000, dtype=np.float64), - np.empty(5000, dtype=np.bool_)] - count = reader.readinto(read_buffer) - - assert count == 5000 - assert_array_equal(read_buffer[0], np.array([r[0] for r in data[:5000]], dtype=np.int64)) - assert_array_equal(read_buffer[1], np.array([r[1] for r in data[:5000]], dtype=np.float64)) - assert_array_equal(read_buffer[2], np.array([r[2] for r in data[:5000]], dtype=np.bool_)) - - count = reader.readinto(read_buffer) - - assert count == 5000 - assert_array_equal(read_buffer[0], np.array([r[0] for r in data[5000:]], dtype=np.int64)) - assert_array_equal(read_buffer[1], np.array([r[1] for r in data[5000:]], dtype=np.float64)) - assert_array_equal(read_buffer[2], np.array([r[2] for r in data[5000:]], dtype=np.bool_)) - - assert reader.readinto(read_buffer) == 0 - - tunnel = TableTunnel(odps) - download_session = tunnel.create_download_session(table.name) - reader = download_session.open_pandas_reader(0, download_session.count, columns=['col2', 'col3', 'col1']) - - read_buffer = [np.empty(10000, dtype=np.float64), np.empty(10000, dtype=np.bool_), - np.empty(10000, dtype=np.int64)] - count = reader.readinto(read_buffer) - - assert count == 10000 - assert_array_equal(read_buffer[0], np.array([r[1] for r in data], dtype=np.float64)) - assert_array_equal(read_buffer[1], np.array([r[2] for r in data], dtype=np.bool_)) - assert_array_equal(read_buffer[2], np.array([r[0] for r in data], dtype=np.int64)) - - tunnel = TableTunnel(odps) - download_session = tunnel.create_download_session(table.name) - reader = download_session.open_pandas_reader(0, download_session.count, columns=['col2', 'col3', 'col1']) - - read_buffer = [np.empty(10000, dtype=np.int64), np.empty(10000, dtype=np.float64), - np.empty(10000, dtype=np.bool_)] - count = reader.readinto(read_buffer, columns=['col1', 'col2', 'col3']) - - assert count == 10000 - assert_array_equal(read_buffer[0], np.array([r[0] for r in data], dtype=np.int64)) - assert_array_equal(read_buffer[1], np.array([r[1] for r in data], dtype=np.float64)) - assert_array_equal(read_buffer[2], np.array([r[2] for r in data], dtype=np.bool_)) - - try: - import pandas as pd - tunnel = TableTunnel(odps) - download_session = tunnel.create_download_session(table.name) - reader = download_session.open_pandas_reader(0, download_session.count) 
- - read_buffer = pd.DataFrame(dict(col1=np.empty(10000, dtype=np.int64), - col2=np.empty(10000, dtype=np.float64), - col3=np.empty(10000, dtype=np.bool_)), columns='col1 col2 col3'.split()) - - count = reader.readinto(read_buffer) - assert count == 10000 - - assert_array_equal(read_buffer.col1.values, np.array([r[0] for r in data], dtype=np.int64)) - assert_array_equal(read_buffer.col2.values, np.array([r[1] for r in data], dtype=np.float64)) - assert_array_equal(read_buffer.col3.values, np.array([r[2] for r in data], dtype=np.bool_)) - except ImportError: - pass - - -@pytest.mark.skipif(not pd, reason='pandas not available') -def test_read(odps): - options.tunnel.pd_mem_cache_size = 200 - options.tunnel.pd_row_cache_size = 200 - - test_table_name = tn('test_pdio_read_into') - odps.delete_table(test_table_name, if_exists=True) - table = odps.create_table(test_table_name, 'col1 bigint, col2 double, col3 boolean') - - data = [table.new_record([random.randint(0, 1048576), random.random(), random.random() > 0.5]) - for _ in range(10000)] - odps.write_table(test_table_name, data) - - tunnel = TableTunnel(odps) - download_session = tunnel.create_download_session(table.name) - reader = download_session.open_pandas_reader(0, download_session.count) - - result = reader.read() - assert_array_equal(result.col1.values, np.array([r[0] for r in data], dtype=np.int64)) - assert_array_equal(result.col2.values, np.array([r[1] for r in data], dtype=np.float64)) - assert_array_equal(result.col3.values, np.array([r[2] for r in data], dtype=np.bool_)) - - -@pytest.mark.skipif(not TunnelPandasWriter, reason='Accelerated pandas IO not available') -def test_write_array(odps): - options.tunnel.pd_mem_cache_size = 200 - - test_table_name = tn('test_pdio_write_array') - odps.delete_table(test_table_name, if_exists=True) - table = odps.create_table(test_table_name, 'col1 bigint, col2 bigint, col3 double') - - data = np.random.rand(100, 300) * 1000 - - tunnel = TableTunnel(odps) - upload_session = tunnel.create_upload_session(table.name) - writer = upload_session.open_pandas_writer(0) - - writer.write(data) - - writer.close() - upload_session.commit([0]) - - recv_data = np.empty((100, 300), dtype=np.double) - for rec in odps.read_table(test_table_name): - recv_data[rec[0], rec[1]] = rec[2] - - assert_array_equal(data, recv_data) - - table.truncate() - - tunnel = TableTunnel(odps) - upload_session = tunnel.create_upload_session(table.name) - writer = upload_session.open_pandas_writer(0) - - writer.write(data, dim_offsets=(500, 100)) - - writer.close() - upload_session.commit([0]) - - recv_data = np.empty((100, 300), dtype=np.double) - for rec in odps.read_table(test_table_name): - recv_data[rec[0] - 500, rec[1] - 100] = rec[2] - - assert_array_equal(data, recv_data) - - -@pytest.mark.skipif(not TunnelPandasWriter, reason='Accelerated pandas IO not available') -def test_write_arrays(odps): - def assert_results(): - recv_data = [np.empty((10000, ), dtype=np.int64), np.empty((10000, ), dtype=np.double), - np.empty((10000, ), dtype=np.bool_)] - - for idx, rec in enumerate(odps.read_table(test_table_name)): - recv_data[0][idx] = rec[0] - recv_data[1][idx] = rec[1] - recv_data[2][idx] = rec[2] - - assert_array_equal(raw_data[0], recv_data[0]) - assert_array_equal(raw_data[1], recv_data[1]) - assert_array_equal(raw_data[2], recv_data[2]) - - options.tunnel.pd_mem_cache_size = 200 - - test_table_name = tn('test_pdio_write_arrays') - odps.delete_table(test_table_name, if_exists=True) - table = 
odps.create_table(test_table_name, 'col1 bigint, col2 double, col3 boolean') - - raw_data = [np.random.randint(1048576, size=(10000,)), np.random.rand(10000), - np.random.rand(10000) > 0.5] - data = raw_data - - tunnel = TableTunnel(odps) - upload_session = tunnel.create_upload_session(table.name) - writer = upload_session.open_pandas_writer(0) - - writer.write(data) - - writer.close() - upload_session.commit([0]) - assert_results() - - table.truncate() - - data = dict(col1=raw_data[0], col2=raw_data[1], col3=raw_data[2]) - - tunnel = TableTunnel(odps) - upload_session = tunnel.create_upload_session(table.name) - writer = upload_session.open_pandas_writer(0) - - writer.write(data) - - writer.close() - upload_session.commit([0]) - assert_results() - - table.truncate() - - try: - import pandas as pd - data = pd.DataFrame(dict(col1=raw_data[0], col2=raw_data[1], col3=raw_data[2]), - columns='col1 col2 col3'.split()) - - tunnel = TableTunnel(odps) - upload_session = tunnel.create_upload_session(table.name) - writer = upload_session.open_pandas_writer(0) - - writer.write(data) - - writer.close() - upload_session.commit([0]) - assert_results() - except ImportError: - pass diff --git a/odps/tunnel/tests/test_streamio.py b/odps/tunnel/tests/test_streamio.py index 3bd3d965..5d04ba4c 100644 --- a/odps/tunnel/tests/test_streamio.py +++ b/odps/tunnel/tests/test_streamio.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ import threading import time import traceback + try: from string import letters except ImportError: @@ -30,7 +31,6 @@ from ..io import stream as io_stream - TEXT = u""" 上善若水。水善利万物而不争,处众人之所恶,故几於道。居善地,心善渊,与善仁,言善信,正善 @@ -59,13 +59,16 @@ def fix_config(): io_stream._FORCE_THREAD = False -@pytest.mark.parametrize("compress_algo, package", [ - (io_stream.CompressOption.CompressAlgorithm.ODPS_RAW, None), - (io_stream.CompressOption.CompressAlgorithm.ODPS_ZLIB, None), - (io_stream.CompressOption.CompressAlgorithm.ODPS_SNAPPY, "snappy"), - (io_stream.CompressOption.CompressAlgorithm.ODPS_ZSTD, "zstandard"), - (io_stream.CompressOption.CompressAlgorithm.ODPS_LZ4, "lz4.frame"), -]) +@pytest.mark.parametrize( + "compress_algo, package", + [ + (io_stream.CompressOption.CompressAlgorithm.ODPS_RAW, None), + (io_stream.CompressOption.CompressAlgorithm.ODPS_ZLIB, None), + (io_stream.CompressOption.CompressAlgorithm.ODPS_SNAPPY, "snappy"), + (io_stream.CompressOption.CompressAlgorithm.ODPS_ZSTD, "zstandard"), + (io_stream.CompressOption.CompressAlgorithm.ODPS_LZ4, "lz4.frame"), + ], +) def test_compress_and_decompress(compress_algo, package): if package is not None: pytest.importorskip(package) @@ -73,7 +76,7 @@ def test_compress_and_decompress(compress_algo, package): tube = io.BytesIO() option = io_stream.CompressOption(compress_algo) - data_bytes = TEXT.encode('utf-8') + data_bytes = TEXT.encode("utf-8") outstream = io_stream.get_compress_stream(tube, option) for pos in range(0, len(data_bytes), 128): @@ -90,21 +93,21 @@ def test_compress_and_decompress(compress_algo, package): break b += part - assert TEXT.encode('utf8') == b + assert TEXT.encode("utf8") == b tube.seek(0) instream = io_stream.get_decompress_stream(tube, option, requests=False) - b = bytearray(len(TEXT.encode('utf8'))) + b = bytearray(len(TEXT.encode("utf8"))) mv = 
memoryview(b) pos = 0 while True: - incr = instream.readinto(mv[pos:pos + 1]) + incr = instream.readinto(mv[pos : pos + 1]) if not incr: break pos += incr - assert TEXT.encode('utf8') == b + assert TEXT.encode("utf8") == b def test_class(): @@ -115,17 +118,25 @@ def test_class(): assert isinstance(req_io, io_stream.ThreadRequestsIO) else: assert isinstance(req_io, io_stream.GreenletRequestsIO) - assert isinstance(io_stream.ThreadRequestsIO(lambda c: None), io_stream.ThreadRequestsIO) + assert isinstance( + io_stream.ThreadRequestsIO(lambda c: None), io_stream.ThreadRequestsIO + ) if io_stream.GreenletRequestsIO is not None: - assert isinstance(io_stream.GreenletRequestsIO(lambda c: None), io_stream.GreenletRequestsIO) + assert isinstance( + io_stream.GreenletRequestsIO(lambda c: None), io_stream.GreenletRequestsIO + ) io_stream._FORCE_THREAD = True req_io = io_stream.RequestsIO(lambda c: None) assert isinstance(req_io, io_stream.ThreadRequestsIO) - assert isinstance(io_stream.ThreadRequestsIO(lambda c: None), io_stream.ThreadRequestsIO) + assert isinstance( + io_stream.ThreadRequestsIO(lambda c: None), io_stream.ThreadRequestsIO + ) if io_stream.GreenletRequestsIO is not None: - assert isinstance(io_stream.GreenletRequestsIO(lambda c: None), io_stream.GreenletRequestsIO) + assert isinstance( + io_stream.GreenletRequestsIO(lambda c: None), io_stream.GreenletRequestsIO + ) @pytest.fixture @@ -140,6 +151,7 @@ def semaphore_random_delay(request): if not request.param: yield else: + def new_acquire(self, *args, **kw): time.sleep(random.random() / 4.0) ret = original_acquire(self, *args, **kw) @@ -166,19 +178,19 @@ def raise_poster(it): raise AttributeError except: tb = traceback.format_exc().splitlines() - exc_trace[0] = '\n'.join(tb[-3:]) + exc_trace[0] = "\n".join(tb[-3:]) raise req_io = io_stream.RequestsIO(raise_poster, chunk_size=5) req_io.start() try: - req_io.write(b'TEST_DATA') - req_io.write(b'ANOTHER_PIECE') - req_io.write(b'THIS_SHALL_RAISE') + req_io.write(b"TEST_DATA") + req_io.write(b"ANOTHER_PIECE") + req_io.write(b"THIS_SHALL_RAISE") assert False, "Statement above not raised" except AttributeError: tb = traceback.format_exc().splitlines() - assert '\n'.join(tb[-3:]) == exc_trace[0] + assert "\n".join(tb[-3:]) == exc_trace[0] @pytest.mark.parametrize( @@ -201,7 +213,7 @@ def check_poster(it): req_io = io_stream.RequestsIO(check_poster, chunk_size=chunk_size) req_io.start() for _ in range(repeats): - req_io.write(TEXT.encode('utf-8')) + req_io.write(TEXT.encode("utf-8")) req_io.finish() - assert b"".join(recv_chunks) == TEXT.encode('utf-8') * repeats + assert b"".join(recv_chunks) == TEXT.encode("utf-8") * repeats diff --git a/odps/tunnel/tests/test_tabletunnel.py b/odps/tunnel/tests/test_tabletunnel.py index b650865b..e6c7eafd 100644 --- a/odps/tunnel/tests/test_tabletunnel.py +++ b/odps/tunnel/tests/test_tabletunnel.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -22,7 +22,7 @@ import time import warnings from collections import OrderedDict -from datetime import datetime, date +from datetime import date, datetime from multiprocessing.pool import ThreadPool import six @@ -44,14 +44,22 @@ import pytest import requests -from ... 
import types, options -from ...errors import throw_if_parsable -from ...tests.core import py_and_c, tn, pandas_case, pyarrow_case, \ - odps2_typed_case, get_code_mode, approx_list, flaky, get_test_unique_name +from ... import options, types +from ...compat import DECIMAL_TYPES, Decimal, Monthdelta, Version +from ...errors import DatetimeOverflowError, throw_if_parsable +from ...models import Record, TableSchema +from ...tests.core import ( + approx_list, + flaky, + get_code_mode, + get_test_unique_name, + odps2_typed_case, + pandas_case, + py_and_c, + pyarrow_case, + tn, +) from ...utils import get_zone_name, to_text -from ...compat import Decimal, Monthdelta, Iterable, Version, DECIMAL_TYPES -from ...errors import DatetimeOverflowError -from ...models import TableSchema, Record from .. import TableTunnel from ..errors import TunnelWriteTimeout @@ -60,8 +68,12 @@ def check_malicious_requests(odps): def _new_throw_if_parsable(resp, *args, **kw): if resp.status_code in (400, 404) and not resp.url.startswith(odps.endpoint): - raise AssertionError("Malicious request detected.") - throw_if_parsable(resp, *args, **kw) + try: + throw_if_parsable(resp, *args, **kw) + except: + raise AssertionError("Malicious request detected.") + else: + throw_if_parsable(resp, *args, **kw) with mock.patch("odps.errors.throw_if_parsable", new=_new_throw_if_parsable): yield @@ -78,10 +90,10 @@ def _gen_random_bigint(self): def _gen_random_string(self, max_length=15): gen_letter = lambda: letters[random.randint(0, 51)] - return to_text(''.join([gen_letter() for _ in range(random.randint(1, 15))])) + return to_text("".join([gen_letter() for _ in range(random.randint(1, 15))])) def _gen_random_double(self): - return random.uniform(-2 ** 32, 2 ** 32) + return random.uniform(-(2**32), 2**32) def _gen_random_datetime(self): return datetime.fromtimestamp(random.randint(0, int(time.time()))) @@ -93,14 +105,14 @@ def _gen_random_decimal(self): return Decimal(str(self._gen_random_double())) def _gen_random_array_type(self): - t = random.choice(['string', 'bigint', 'double', 'boolean']) + t = random.choice(["string", "bigint", "double", "boolean"]) return types.Array(t) gen_random_array_type = _gen_random_array_type def _gen_random_map_type(self): - random_key_type = random.choice(['bigint', 'string']) - random_value_type = random.choice(['bigint', 'string', 'double']) + random_key_type = random.choice(["bigint", "string"]) + random_value_type = random.choice(["bigint", "string", "double"]) return types.Map(random_key_type, random_value_type) @@ -110,7 +122,7 @@ def _gen_random_array(self, random_type, size=None): random_type = types.validate_data_type(random_type) if isinstance(random_type, types.Array): random_type = random_type.value_type - method = getattr(self, '_gen_random_%s' % random_type.name) + method = getattr(self, "_gen_random_%s" % random_type.name) array = [method() for _ in range(size)] return array @@ -129,11 +141,17 @@ def _gen_random_map(self, random_map_type): return m def gen_table( - self, partition=None, partition_type=None, partition_val=None, size=100, odps=None + self, + partition=None, + partition_type=None, + partition_val=None, + size=100, + odps=None, + append_partitions=True, ): def gen_name(name): - if '<' in name: - name = name.split('<', 1)[0] + if "<" in name: + name = name.split("<", 1)[0] if len(name) > 4: name = name[:4] else: @@ -141,40 +159,47 @@ def gen_name(name): return name odps = odps or self.odps - test_table_name = tn('pyodps_t_tmp_tunnel_' + get_test_unique_name()) + 
test_table_name = tn("pyodps_t_tmp_tunnel_" + get_test_unique_name()) - types = ['bigint', 'string', 'double', 'datetime', 'boolean', 'decimal'] + types = ["bigint", "string", "double", "datetime", "boolean", "decimal"] types.append(self.gen_random_array_type().name) types.append(self._gen_random_map_type().name) random.shuffle(types) names = [gen_name(t) for t in types] odps.delete_table(test_table_name, if_exists=True) - partition_names = [partition, ] if partition else None - partition_types = [partition_type, ] if partition_type else None + partition_names = [partition] if partition else None + partition_types = [partition_type] if partition_type else None table = self.last_table = odps.create_table( test_table_name, TableSchema.from_lists( - names, types, partition_names=partition_names, partition_types=partition_types + names, + types, + partition_names=partition_names, + partition_types=partition_types, ), lifecycle=1, ) if partition_val: - table.create_partition('%s=%s' % (partition, partition_val)) + table.create_partition("%s=%s" % (partition, partition_val)) data = [] for i in range(size): record = [] for t in types: - n = t.split('<', 1)[0] - method = getattr(self, '_gen_random_' + n) - if n in ('map', 'array'): + n = t.split("<", 1)[0] + method = getattr(self, "_gen_random_" + n) + if n in ("map", "array"): record.append(method(t)) - elif n == 'double' and i == 0: - record.append(float('nan')) + elif n == "double" and i == 0: + record.append(float("nan")) else: record.append(method()) - if partition is not None and partition_val is not None: + if ( + append_partitions + and partition is not None + and partition_val is not None + ): record.append(partition_val) data.append(record) @@ -188,7 +213,13 @@ def _nan_inf(v): data = [r.values if isinstance(r, Record) else r for r in data] sortable_types = ( - six.string_types, six.integer_types, bool, float, DECIMAL_TYPES, datetime, date + six.string_types, + six.integer_types, + bool, + float, + DECIMAL_TYPES, + datetime, + date, ) sort_idxes = [] if reads and data: @@ -196,8 +227,12 @@ def _nan_inf(v): if isinstance(v, sortable_types): sort_idxes.append(idx) if sort_idxes: - reads = sorted(reads, key=lambda x: tuple(_nan_inf(x[ix]) for ix in sort_idxes)) - data = sorted(data, key=lambda x: tuple(_nan_inf(x[ix]) for ix in sort_idxes)) + reads = sorted( + reads, key=lambda x: tuple(_nan_inf(x[ix]) for ix in sort_idxes) + ) + data = sorted( + data, key=lambda x: tuple(_nan_inf(x[ix]) for ix in sort_idxes) + ) try: assert len(data) == len(reads) for val1, val2 in zip(data, reads): @@ -208,8 +243,12 @@ def _nan_inf(v): assert any(it1[k] == it2[k] for k in it1) is True elif isinstance(it1, list): assert it1 == list(it2) - elif isinstance(it1, float) and math.isnan(it1) and \ - isinstance(it2, float) and math.isnan(it2): + elif ( + isinstance(it1, float) + and math.isnan(it1) + and isinstance(it2, float) + and math.isnan(it2) + ): continue else: assert it1 == it2 @@ -237,7 +276,7 @@ def upload_data(self, test_table, records, compress=False, **kw): record[i] = it writer.write(record) writer.close() - upload_ss.commit([0, ]) + upload_ss.commit([0]) def stream_upload_data(self, test_table, records, compress=False, **kw): upload_ss = self.tunnel.create_stream_upload_session(test_table, **kw) @@ -258,11 +297,15 @@ def stream_upload_data(self, test_table, records, compress=False, **kw): writer.close() upload_ss.abort() - def buffered_upload_data(self, test_table, records, buffer_size=None, compress=False, **kw): + def buffered_upload_data( + 
self, test_table, records, buffer_size=None, compress=False, **kw + ): upload_ss = self.tunnel.create_upload_session(test_table, **kw) # make sure session reprs work well repr(upload_ss) - writer = upload_ss.open_record_writer(buffer_size=buffer_size, compress=compress) + writer = upload_ss.open_record_writer( + buffer_size=buffer_size, compress=compress + ) for r in records: record = upload_ss.new_record() for i, it in enumerate(r): @@ -276,13 +319,22 @@ def buffered_upload_data(self, test_table, records, buffer_size=None, compress=F assert len(writer.get_blocks_written()) > 1 upload_ss.commit(writer.get_blocks_written()) - def download_data(self, test_table, compress=False, columns=None, **kw): - count = kw.pop('count', None) + def download_data( + self, test_table, compress=False, columns=None, append_partitions=None, **kw + ): + count = kw.pop("count", None) download_ss = self.tunnel.create_download_session(test_table, **kw) count = count or download_ss.count # make sure session reprs work well repr(download_ss) - with download_ss.open_record_reader(0, count, compress=compress, columns=columns) as reader: + down_kw = ( + {"append_partitions": append_partitions} + if append_partitions is not None + else {} + ) + with download_ss.open_record_reader( + 0, count, compress=compress, columns=columns, **down_kw + ) as reader: # test use right py or c writer assert get_code_mode() == reader._mode() @@ -296,20 +348,60 @@ def download_data(self, test_table, compress=False, columns=None, **kw): def gen_data(self): return [ - ('hello \x00\x00 world', 2**63-1, math.pi, datetime(2015, 9, 19, 2, 11, 25, 33000), - True, Decimal('3.14'), ['simple', 'easy'], OrderedDict({'s': 1})), - ('goodbye', 222222, math.e, datetime(2020, 3, 10), False, Decimal('1234567898765431'), - ['true', None], OrderedDict({'true': 1})), - ('c' * 300, -2 ** 63 + 1, -2.222, datetime(1999, 5, 25, 3, 10), True, Decimal(28318318318318318), - ['false'], OrderedDict({'false': 0})), - ('c' * 20, -2 ** 11 + 1, 2.222, datetime(1961, 10, 30, 11, 32), True, Decimal('12345678.98765431'), - ['true'], OrderedDict({'false': 0})), + ( + "hello \x00\x00 world", + 2**63 - 1, + math.pi, + datetime(2015, 9, 19, 2, 11, 25, 33000), + True, + Decimal("3.14"), + ["simple", "easy"], + OrderedDict({"s": 1}), + ), + ( + "goodbye", + 222222, + math.e, + datetime(2020, 3, 10), + False, + Decimal("1234567898765431"), + ["true", None], + OrderedDict({"true": 1}), + ), + ( + "c" * 300, + -(2**63) + 1, + -2.222, + datetime(1999, 5, 25, 3, 10), + True, + Decimal(28318318318318318), + ["false"], + OrderedDict({"false": 0}), + ), + ( + "c" * 20, + -(2**11) + 1, + 2.222, + datetime(1961, 10, 30, 11, 32), + True, + Decimal("12345678.98765431"), + ["true"], + OrderedDict({"false": 0}), + ), ] def create_table(self, table_name, odps=None): - fields = ['id', 'int_num', 'float_num', 'dt', 'bool', 'dec', 'arr', 'm'] - types = ['string', 'bigint', 'double', 'datetime', 'boolean', 'decimal', - 'array', 'map'] + fields = ["id", "int_num", "float_num", "dt", "bool", "dec", "arr", "m"] + types = [ + "string", + "bigint", + "double", + "datetime", + "boolean", + "decimal", + "array", + "map", + ] odps = odps or self.odps odps.delete_table(table_name, if_exists=True) @@ -318,15 +410,23 @@ def create_table(self, table_name, odps=None): ) def create_partitioned_table(self, table_name, odps=None): - fields = ['id', 'int_num', 'float_num', 'dt', 'bool', 'dec', 'arr', 'm'] - types = ['string', 'bigint', 'double', 'datetime', 'boolean', 'decimal', - 'array', 'map'] + fields = ["id", 
"int_num", "float_num", "dt", "bool", "dec", "arr", "m"] + types = [ + "string", + "bigint", + "double", + "datetime", + "boolean", + "decimal", + "array", + "map", + ] odps = odps or self.odps odps.delete_table(table_name, if_exists=True) return odps.create_table( table_name, - TableSchema.from_lists(fields, types, ['ds'], ['string']), + TableSchema.from_lists(fields, types, ["ds"], ["string"]), ) def delete_table(self, table_name): @@ -340,11 +440,17 @@ def _reloader(): cfg.tunnel = TableTunnel(cfg.odps, endpoint=cfg.odps._tunnel_endpoint) -py_and_c_deco = py_and_c([ - "odps.models.record", "odps.models", "odps.tunnel.io.reader", - "odps.tunnel.io.writer", "odps.tunnel.tabletunnel", - "odps.tunnel.instancetunnel", -], _reloader) +py_and_c_deco = py_and_c( + [ + "odps.models.record", + "odps.models", + "odps.tunnel.io.reader", + "odps.tunnel.io.writer", + "odps.tunnel.tabletunnel", + "odps.tunnel.instancetunnel", + ], + _reloader, +) @pytest.fixture @@ -372,7 +478,7 @@ def test_malicious_request_detection(setup): @py_and_c_deco def test_upload_and_download_by_raw_tunnel(setup): - test_table_name = tn('pyodps_test_raw_tunnel') + test_table_name = tn("pyodps_test_raw_tunnel_" + get_code_mode()) setup.create_table(test_table_name) data = setup.gen_data() @@ -385,7 +491,7 @@ def test_upload_and_download_by_raw_tunnel(setup): @py_and_c_deco def test_stream_upload_and_download_tunnel(odps, setup): - test_table_name = tn('pyodps_test_stream_upload_' + get_code_mode()) + test_table_name = tn("pyodps_test_stream_upload_" + get_code_mode()) odps.delete_table(test_table_name, if_exists=True) setup.create_table(test_table_name) data = setup.gen_data() @@ -413,10 +519,11 @@ def test_buffered_upload_and_download_by_raw_tunnel(setup): @py_and_c_deco -@pytest.mark.skipif(pytz is None and zoneinfo is None, reason='pytz not installed') -@pytest.mark.parametrize("zone", [False, True, 'Asia/Shanghai', 'America/Los_Angeles']) +@pytest.mark.skipif(pytz is None and zoneinfo is None, reason="pytz not installed") +@pytest.mark.parametrize("zone", [False, True, "Asia/Shanghai", "America/Los_Angeles"]) def test_buffered_upload_and_download_with_timezone(setup, zone): from ...utils import MillisecondsConverter + try: tz = MillisecondsConverter._get_tz(zone) options.local_timezone = zone @@ -443,13 +550,13 @@ def test_buffered_upload_and_download_with_timezone(setup, zone): @py_and_c_deco def test_download_with_specified_columns(setup): - test_table_name = tn('pyodps_test_raw_tunnel_columns') + test_table_name = tn("pyodps_test_raw_tunnel_columns_" + get_code_mode()) setup.create_table(test_table_name) data = setup.gen_data() setup.upload_data(test_table_name, data) - records = setup.download_data(test_table_name, columns=['id']) + records = setup.download_data(test_table_name, columns=["id"]) assert [r[0] for r in records] == [r[0] for r in data] for r in records: for i in range(1, len(r)): @@ -461,7 +568,7 @@ def test_download_with_specified_columns(setup): def test_download_limitation(odps, setup): old_limit = options.table_read_limit - test_table_name = tn('pyodps_test_tunnel_limit') + test_table_name = tn("pyodps_test_tunnel_limit_" + get_code_mode()) setup.create_table(test_table_name) data = setup.gen_data() setup.upload_data(test_table_name, data * 20) @@ -483,8 +590,8 @@ def test_download_limitation(odps, setup): @py_and_c_deco def test_partition_upload_and_download_by_raw_tunnel(odps, setup): - test_table_name = tn('pyodps_test_raw_partition_tunnel') - test_table_partition = 'ds=test' + test_table_name = 
tn("pyodps_test_raw_partition_tunnel_" + get_code_mode()) + test_table_partition = "ds=test" odps.delete_table(test_table_name, if_exists=True) table = setup.create_partitioned_table(test_table_name) @@ -495,13 +602,18 @@ def test_partition_upload_and_download_by_raw_tunnel(odps, setup): records = setup.download_data(test_table_name, partition_spec=test_table_partition) assert list(data) == [r[:-1] for r in records] + records = setup.download_data( + test_table_name, partition_spec=test_table_partition, append_partitions=False + ) + assert list(data) == list(records) + setup.delete_table(test_table_name) @py_and_c_deco def test_partition_download_with_specified_columns(odps, setup): - test_table_name = tn('pyodps_test_raw_tunnel_partition_columns') - test_table_partition = 'ds=test' + test_table_name = tn("pyodps_test_raw_tunnel_partition_columns_" + get_code_mode()) + test_table_partition = "ds=test" odps.delete_table(test_table_name, if_exists=True) table = setup.create_partitioned_table(test_table_name) @@ -509,8 +621,9 @@ def test_partition_download_with_specified_columns(odps, setup): data = setup.gen_data() setup.upload_data(test_table_name, data, partition_spec=test_table_partition) - records = setup.download_data(test_table_name, partition_spec=test_table_partition, - columns=['int_num']) + records = setup.download_data( + test_table_name, partition_spec=test_table_partition, columns=["int_num"] + ) assert [r[1] for r in data] == [r[0] for r in records] setup.delete_table(test_table_name) @@ -518,9 +631,8 @@ def test_partition_download_with_specified_columns(odps, setup): @py_and_c_deco @pytest.mark.parametrize( - "algo, module", [ - (None, None), ("snappy", "snappy"), ("zstd", "zstandard"), ("lz4", "lz4.frame") - ] + "algo, module", + [(None, None), ("snappy", "snappy"), ("zstd", "zstandard"), ("lz4", "lz4.frame")], ) def test_upload_and_download_with_compress(setup, algo, module): raw_chunk_size = options.chunk_size @@ -529,7 +641,7 @@ def test_upload_and_download_with_compress(setup, algo, module): pytest.importorskip(module) try: - test_table_name = tn('pyodps_test_zlib_tunnel') + test_table_name = tn("pyodps_test_zlib_tunnel_" + get_test_unique_name(5)) setup.create_table(test_table_name) data = setup.gen_data() @@ -569,8 +681,12 @@ def test_table_upload_and_download_tunnel(odps, setup): elif isinstance(it1, list): assert it1 == list(it2) else: - if isinstance(it1, float) and math.isnan(it1) \ - and isinstance(it2, float) and math.isnan(it2): + if ( + isinstance(it1, float) + and math.isnan(it1) + and isinstance(it2, float) + and math.isnan(it2) + ): continue assert it1 == it2 @@ -592,11 +708,12 @@ def test_multi_table_upload_and_download_tunnel(odps, setup): @py_and_c_deco def test_parallel_table_upload_and_download_tunnel(odps, setup): - p = 'ds=test' + p = "ds=test" table, data = setup.gen_table( - partition=p.split('=', 1)[0], partition_type='string', - partition_val=p.split('=', 1)[1] + partition=p.split("=", 1)[0], + partition_type="string", + partition_val=p.split("=", 1)[1], ) assert table.exist_partition(p) is True records = [table.new_record(values=d) for d in data] @@ -613,12 +730,13 @@ def gen_block_records(block_id): ed = int(c / n_blocks * (block_id + 1)) else: ed = c - return records[st: ed] + return records[st:ed] def write(w): def inner(arg): idx, r = arg w.write(idx, r) + return inner with table.open_writer(partition=p, blocks=blocks) as writer: @@ -649,17 +767,26 @@ def inner(arg): @odps2_typed_case @py_and_c_deco def test_primitive_types2(odps): - 
table_name = tn('test_hivetunnel_singleton_types') + table_name = tn("test_hivetunnel_singleton_types_" + get_code_mode()) odps.delete_table(table_name, if_exists=True) - table = odps.create_table(table_name, 'col1 tinyint, col2 smallint, col3 int, col4 float, col5 binary', - lifecycle=1) - assert table.table_schema.types == [types.tinyint, types.smallint, types.int_, types.float_, types.binary] + table = odps.create_table( + table_name, + "col1 tinyint, col2 smallint, col3 int, col4 float, col5 binary", + lifecycle=1, + ) + assert table.table_schema.types == [ + types.tinyint, + types.smallint, + types.int_, + types.float_, + types.binary, + ] contents = [ - [127, 32767, 1234321, 10.5432, b'Hello, world!'], - [-128, -32768, 4312324, 20.1234, b'Excited!'], - [-1, 10, 9875479, 20.1234, b'Bravo!'], + [127, 32767, 1234321, 10.5432, b"Hello, world!"], + [-128, -32768, 4312324, 20.1234, b"Excited!"], + [-1, 10, 9875479, 20.1234, b"Bravo!"], ] odps.write_table(table_name, contents) written = list(odps.read_table(table_name)) @@ -672,17 +799,13 @@ def test_primitive_types2(odps): @py_and_c_deco @odps2_typed_case def test_date(odps): - table_name = tn('test_hivetunnel_date_io') + table_name = tn("test_hivetunnel_date_io_" + get_code_mode()) odps.delete_table(table_name, if_exists=True) - table = odps.create_table(table_name, 'col1 int, col2 date', lifecycle=1) + table = odps.create_table(table_name, "col1 int, col2 date", lifecycle=1) assert table.table_schema.types == [types.int_, types.date] - contents = [ - [0, date(2020, 2, 12)], - [1, date(1900, 1, 1)], - [2, date(2000, 3, 20)] - ] + contents = [[0, date(2020, 2, 12)], [1, date(1900, 1, 1)], [2, date(2000, 3, 20)]] odps.write_table(table_name, contents) written = list(odps.read_table(table_name)) values = [list(v.values) for v in written] @@ -696,16 +819,17 @@ def test_date(odps): @odps2_typed_case def test_timestamp(odps): import pandas as pd - table_name = tn('test_hivetunnel_timestamp_io') + + table_name = tn("test_hivetunnel_timestamp_io_" + get_code_mode()) odps.delete_table(table_name, if_exists=True) - table = odps.create_table(table_name, 'col1 int, col2 timestamp', lifecycle=1) + table = odps.create_table(table_name, "col1 int, col2 timestamp", lifecycle=1) assert table.table_schema.types == [types.int_, types.timestamp] contents = [ - [0, pd.Timestamp('2013-09-21 11:23:35.196045321')], - [1, pd.Timestamp('1998-02-15 23:59:21.943829154')], - [2, pd.Timestamp('2017-10-31 00:12:39.396583106')], + [0, pd.Timestamp("2013-09-21 11:23:35.196045321")], + [1, pd.Timestamp("1998-02-15 23:59:21.943829154")], + [2, pd.Timestamp("2017-10-31 00:12:39.396583106")], ] odps.write_table(table_name, contents) written = list(odps.read_table(table_name)) @@ -723,15 +847,15 @@ def test_pandas_na(odps): if not hasattr(pd, "NA"): pytest.skip("Need pandas>1.0 to run this test") - table_name = tn('test_pandas_na_io') + table_name = tn("test_pandas_na_io_" + get_code_mode()) odps.delete_table(table_name, if_exists=True) - table = odps.create_table(table_name, 'col1 bigint, col2 string', lifecycle=1) + table = odps.create_table(table_name, "col1 bigint, col2 string", lifecycle=1) contents = [ - [0, 'agdesfdr'], + [0, "agdesfdr"], [1, pd.NA], - [pd.NA, 'aetlkakls;dfj'], - [3, 'aetlkakls;dfj'], + [pd.NA, "aetlkakls;dfj"], + [3, "aetlkakls;dfj"], ] odps.write_table(table_name, contents) written = list(odps.read_table(table_name)) @@ -744,10 +868,12 @@ def test_pandas_na(odps): @py_and_c_deco @odps2_typed_case def test_length_limit_types(odps): - table_name = 
tn('test_hivetunnel_length_limit_io') + table_name = tn("test_hivetunnel_length_limit_io_" + get_code_mode()) odps.delete_table(table_name, if_exists=True) - table = odps.create_table(table_name, 'col1 int, col2 varchar(20), col3 char(30)', lifecycle=1) + table = odps.create_table( + table_name, "col1 int, col2 varchar(20), col3 char(30)", lifecycle=1 + ) assert table.table_schema.types[0] == types.int_ assert isinstance(table.table_schema.types[1], types.Varchar) assert table.table_schema.types[1].size_limit == 20 @@ -755,14 +881,14 @@ def test_length_limit_types(odps): assert table.table_schema.types[2].size_limit == 30 contents = [ - [0, 'agdesfdr', 'sadfklaslkjdvvn'], - [1, 'sda;fkd', 'asdlfjjls;admc'], - [2, 'aetlkakls;dfj', 'sadffafafsafsaf'], + [0, "agdesfdr", "sadfklaslkjdvvn"], + [1, "sda;fkd", "asdlfjjls;admc"], + [2, "aetlkakls;dfj", "sadffafafsafsaf"], ] odps.write_table(table_name, contents) written = list(odps.read_table(table_name)) - contents = [r[:2] + [r[2] + ' ' * (30 - len(r[2]))] for r in contents] + contents = [r[:2] + [r[2] + " " * (30 - len(r[2]))] for r in contents] values = [list(v.values) for v in written] assert contents == values @@ -772,11 +898,14 @@ def test_length_limit_types(odps): @py_and_c_deco @odps2_typed_case def test_decimal2(odps): - table_name = tn('test_hivetunnel_decimal_io') + table_name = tn("test_hivetunnel_decimal_io_" + get_code_mode()) odps.delete_table(table_name, if_exists=True) - table = odps.create_table(table_name, 'col1 int, col2 decimal(6,2), ' - 'col3 decimal(10), col4 decimal(10,3)', lifecycle=1) + table = odps.create_table( + table_name, + "col1 int, col2 decimal(6,2), " "col3 decimal(10), col4 decimal(10,3)", + lifecycle=1, + ) assert table.table_schema.types[0] == types.int_ assert isinstance(table.table_schema.types[1], types.Decimal) # comment out due to behavior change of ODPS SQL @@ -789,9 +918,9 @@ def test_decimal2(odps): assert table.table_schema.types[3].scale == 3 contents = [ - [0, Decimal('2.34'), Decimal('34567'), Decimal('56.789')], - [1, Decimal('11.76'), Decimal('9321'), Decimal('19.125')], - [2, Decimal('134.21'), Decimal('1642'), Decimal('999.214')], + [0, Decimal("2.34"), Decimal("34567"), Decimal("56.789")], + [1, Decimal("11.76"), Decimal("9321"), Decimal("19.125")], + [2, Decimal("134.21"), Decimal("1642"), Decimal("999.214")], ] odps.write_table(table_name, contents) written = list(odps.read_table(table_name)) @@ -806,19 +935,24 @@ def test_decimal2(odps): @odps2_typed_case def test_intervals(odps): import pandas as pd - empty_table_name = tn('test_hivetunnel_interval_empty') + + empty_table_name = tn("test_hivetunnel_interval_empty_" + get_code_mode()) odps.delete_table(empty_table_name, if_exists=True) - empty_table = odps.create_table(empty_table_name, 'col1 int', if_not_exists=True) + empty_table = odps.create_table(empty_table_name, "col1 int", if_not_exists=True) - table_name = tn('test_hivetunnel_interval_io') + table_name = tn("test_hivetunnel_interval_io_" + get_code_mode()) odps.delete_table(table_name, if_exists=True) - odps.execute_sql("create table %s lifecycle 1 as\n" - "select interval_day_time('2 1:2:3') as col1," - " interval_year_month('10-11') as col2\n" - "from %s" % - (table_name, empty_table_name)) + odps.execute_sql( + "create table %s lifecycle 1 as\n" + "select interval_day_time('2 1:2:3') as col1," + " interval_year_month('10-11') as col2\n" + "from %s" % (table_name, empty_table_name) + ) table = odps.get_table(table_name) - assert table.table_schema.types == 
[types.interval_day_time, types.interval_year_month] + assert table.table_schema.types == [ + types.interval_day_time, + types.interval_year_month, + ] contents = [ [pd.Timedelta(seconds=1048576, nanoseconds=428571428), Monthdelta(13)], @@ -838,23 +972,25 @@ def test_intervals(odps): @odps2_typed_case @pytest.mark.parametrize("struct_as_dict", [False, True]) def test_struct(odps, struct_as_dict): - table_name = tn('test_hivetunnel_struct_io') + table_name = tn("test_hivetunnel_struct_io_" + get_test_unique_name(5)) odps.delete_table(table_name, if_exists=True) try: options.struct_as_dict = struct_as_dict - col_def = 'col1 int, col2 struct,hobbies:array>' + col_def = ( + "col1 int, col2 struct,hobbies:array>" + ) table = odps.create_table(table_name, col_def, lifecycle=1) assert table.table_schema.types[0] == types.int_ struct_type = table.table_schema.types[1] assert isinstance(struct_type, types.Struct) contents = [ - [0, ('user1', 20, {'fa': 5, 'mo': 6}, ['worship', 'yacht'])], - [1, ('user2', 65, {'fa': 2, 'mo': 7}, ['ukelele', 'chess'])], - [2, ('user3', 32, {'fa': 1, 'mo': 3}, ['poetry', 'calligraphy'])], + [0, ("user1", 20, {"fa": 5, "mo": 6}, ["worship", "yacht"])], + [1, ("user2", 65, {"fa": 2, "mo": 7}, ["ukelele", "chess"])], + [2, ("user3", 32, {"fa": 1, "mo": 3}, ["poetry", "calligraphy"])], ] if struct_as_dict: for c in contents: @@ -904,19 +1040,16 @@ def _patched(*_, **__): @py_and_c_deco @odps2_typed_case def test_decimal_with_complex_types(odps): - table_name = tn("test_decimal_with_complex_types") + table_name = tn("test_decimal_with_complex_types_" + get_code_mode()) odps.delete_table(table_name, if_exists=True) table = odps.create_table( table_name, - "col1 array, col3 struct" + "col1 array, col3 struct", + lifecycle=1, ) try: - data_to_write = [ - [ - [Decimal("12.345"), Decimal("18.41")], (Decimal("514.321"),) - ] - ] + data_to_write = [[[Decimal("12.345"), Decimal("18.41")], (Decimal("514.321"),)]] with table.open_writer() as writer: writer.write(data_to_write) @@ -931,11 +1064,10 @@ def test_decimal_with_complex_types(odps): @py_and_c_deco @odps2_typed_case @pandas_case -def test_json_timestamp_types(odps_daily): +def test_json_timestamp_types(odps): import pandas as pd - odps = odps_daily - table_name = tn("test_json_types") + table_name = tn("test_json_types_" + get_code_mode()) odps.delete_table(table_name, if_exists=True) hints = {"odps.sql.type.json.enable": "true"} table = odps.create_table( @@ -956,14 +1088,17 @@ def test_json_timestamp_types(odps_daily): @py_and_c_deco def test_antique_datetime(odps): - table_name = tn("test_datetime_overflow_table") + table_name = tn("test_datetime_overflow_table_" + get_code_mode()) odps.delete_table(table_name, if_exists=True) table = odps.create_table(table_name, "col datetime", lifecycle=1) options.allow_antique_date = False options.tunnel.overflow_date_as_none = False try: - odps.execute_sql("INSERT INTO %s(col) VALUES (cast('1900-01-01 00:00:00' as datetime))" % table_name) + odps.execute_sql( + "INSERT INTO %s(col) VALUES (cast('1900-01-01 00:00:00' as datetime))" + % table_name + ) with pytest.raises(DatetimeOverflowError): with table.open_reader() as reader: _ = next(reader) @@ -974,7 +1109,10 @@ def test_antique_datetime(odps): assert rec[0].year == 1900 table.truncate() - odps.execute_sql("INSERT INTO %s(col) VALUES (cast('0000-01-01 00:00:00' as datetime))" % table_name) + odps.execute_sql( + "INSERT INTO %s(col) VALUES (cast('0000-01-01 00:00:00' as datetime))" + % table_name + ) with 
pytest.raises(DatetimeOverflowError): with table.open_reader() as reader: _ = next(reader) @@ -990,17 +1128,58 @@ def test_antique_datetime(odps): table.drop() +@py_and_c_deco +def test_tunnel_read_with_retry(odps, setup): + from ..tabletunnel import TableDownloadSession + + test_table_name = tn("pyodps_t_tmp_tunnel_read_with_retry_" + get_code_mode()) + odps.delete_table(test_table_name, if_exists=True) + + table = odps.create_table(test_table_name, "col string") + + try: + data = [["str%d" % idx] for idx in range(10)] + with table.open_writer() as writer: + writer.write(data) + + ranges = [] + original = TableDownloadSession._build_input_stream + + def new_build_input_stream(self, start, count, *args, **kw): + ranges.append((start, count)) + assert start in (0, 2) + assert start == 0 or count == session.count - 2 + return original(self, start, count, *args, **kw) + + with mock.patch( + "odps.tunnel.tabletunnel.TableDownloadSession._build_input_stream", + new=new_build_input_stream, + ): + tunnel = TableTunnel(odps) + session = tunnel.create_download_session(table) + reader = session.open_record_reader(0, session.count) + + reader._inject_error(2, ValueError) + result = [r.values for r in reader] + + assert ranges == [(0, 10), (2, 8)] + setup.assert_reads_data_equal(result, data) + finally: + table.drop() + + @pyarrow_case -def test_tunnel_preview_table_simple_types(odps_daily, setup): +def test_tunnel_preview_table_simple_types(odps, setup): import pyarrow as pa - odps = odps_daily tunnel = TableTunnel(odps) - test_table_name = tn('pyodps_test_tunnel_preview_table_simple_types') + test_table_name = tn("pyodps_test_tunnel_preview_table_simple_types") odps.delete_table(test_table_name, if_exists=True) table = setup.create_partitioned_table(test_table_name, odps=odps) - with tunnel.open_preview_reader(table, limit=3, arrow=False) as reader: + with tunnel.open_preview_reader( + table, limit=3, arrow=False, tags="elephant" + ) as reader: records = list(reader) assert len(records) == 0 @@ -1019,7 +1198,7 @@ def test_tunnel_preview_table_simple_types(odps_daily, setup): kw = {"compress_algo": "zstd"} with tunnel.open_preview_reader( - table, limit=2, columns=['id', 'int_num', 'float_num'], arrow=True, **kw + table, limit=2, columns=["id", "int_num", "float_num"], arrow=True, **kw ) as reader: arrow_table = reader.read() result_rows = [tuple(x.as_py() for x in tp) for tp in zip(*arrow_table.columns)] @@ -1043,7 +1222,7 @@ def test_tunnel_preview_odps_extended_datetime(odps): tunnel = TableTunnel(odps) - test_table_name = tn('pyodps_test_tunnel_preview_odps_extended_types') + test_table_name = tn("pyodps_test_tunnel_preview_odps_extended_types") odps.delete_table(test_table_name, if_exists=True) odps.execute_sql( "create table " + test_table_name + " as " @@ -1057,7 +1236,12 @@ def test_tunnel_preview_odps_extended_datetime(odps): record = list(reader)[0] assert record["ts_col"] == pd.Timestamp("2023-10-12 11:05:11.123451231") assert record["intv_col"] == pd.Timedelta( - days=3, hours=12, minutes=30, seconds=11, microseconds=134512, nanoseconds=345 + days=3, + hours=12, + minutes=30, + seconds=11, + microseconds=134512, + nanoseconds=345, ) finally: odps.delete_table(test_table_name) @@ -1068,7 +1252,9 @@ def test_tunnel_preview_odps_extended_datetime(odps): def test_tunnel_preview_legacy_decimal(odps): tunnel = TableTunnel(odps) - test_table_name = tn('pyodps_test_tunnel_preview_odps_legacy_decimal') + test_table_name = tn( + "pyodps_test_tunnel_preview_odps_legacy_decimal_" + get_code_mode() + 
) odps.delete_table(test_table_name, if_exists=True) values = [ @@ -1103,14 +1289,16 @@ def test_tunnel_preview_table_complex_types(odps, struct_as_dict): import pandas as pd tunnel = TableTunnel(odps) - test_table_name = tn('pyodps_test_tunnel_preview_table_complex_types') + test_table_name = tn( + "pyodps_test_tunnel_preview_table_complex_types_" + get_test_unique_name(5) + ) odps.delete_table(test_table_name, if_exists=True) table = odps.create_table( test_table_name, "col1 decimal(10, 2), col2 timestamp, col3 map>, " "col4 array>, col5 struct>", - lifecycle=1 + lifecycle=1, ) data = [ [ @@ -1148,15 +1336,18 @@ def test_tunnel_preview_table_complex_types(odps, struct_as_dict): @flaky(max_runs=3) @py_and_c_deco -def test_upsert_table(odps_daily): - table_name = tn("test_upsert_table") - odps_daily.delete_table(table_name, if_exists=True) - table = odps_daily.create_table( - table_name, "key string not null, value string", - transactional=True, primary_key="key", lifecycle=1, +def test_upsert_table(odps): + table_name = tn("test_upsert_table_" + get_code_mode()) + odps.delete_table(table_name, if_exists=True) + table = odps.create_table( + table_name, + "key string not null, value string", + transactional=True, + primary_key="key", + lifecycle=1, ) - tunnel = TableTunnel(odps_daily, endpoint=odps_daily._tunnel_endpoint) + tunnel = TableTunnel(odps, endpoint=odps._tunnel_endpoint) try: upsert_session = tunnel.create_upsert_session(table) @@ -1177,7 +1368,7 @@ def test_upsert_table(odps_daily): upsert_session.commit() - inst = odps_daily.execute_sql("SELECT * FROM %s" % table_name) + inst = odps.execute_sql("SELECT * FROM %s" % table_name) with inst.open_reader() as reader: records = [list(rec.values) for rec in reader] assert sorted(records) == [["0", "v3"], ["1", "v1"]] @@ -1224,3 +1415,38 @@ def patch_request(self, *args, **kw): assert list(reader)[0][0] == "data" tb.drop() + + +@py_and_c_deco +def test_read_write_long_binary(odps_with_long_string): + odps = odps_with_long_string + + data_len = 16 * 1024**2 + maxsize_prop = ( + int(odps.get_project().get_property("odps.sql.cfile2.field.maxsize", None) or 0) + * 1024 + ) + if data_len > maxsize_prop: + pytest.skip("maxsize on project %s not configured." % odps.project) + + test_str = "abcd" * (data_len // 4) + + test_table_name = tn("pyodps_t_tmp_long_binary_test_" + get_code_mode()) + odps.delete_table(test_table_name, if_exists=True) + + table = odps.create_table(test_table_name, "col1 string", lifecycle=1) + + try: + rec = table.new_record([test_str]) + assert rec[0] == test_str + + rec = table.new_record() + rec[0] = test_str + assert rec[0] == test_str + + with table.open_writer() as writer: + writer.write([test_str]) + with table.open_reader() as reader: + assert next(reader)[0] == test_str + finally: + table.drop() diff --git a/odps/tunnel/tests/test_volumetunnel.py b/odps/tunnel/tests/test_volumetunnel.py index 5a6bdc6e..23c44a96 100644 --- a/odps/tunnel/tests/test_volumetunnel.py +++ b/odps/tunnel/tests/test_volumetunnel.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -24,8 +24,8 @@ from .. 
import CompressOption from ..volumetunnel import VolumeTunnel -TEST_PARTITION_NAME = 'pyodps_test_partition' -TEST_FILE_NAME = 'test_output_file' +TEST_PARTITION_NAME = "pyodps_test_partition" +TEST_FILE_NAME = "test_output_file" TEST_BLOCK_SIZE = 1048500 TEST_MODULUS = 251 @@ -63,9 +63,7 @@ def wrapped(self, *args, **kwargs): _old_create_download_session = VolumeTunnel.create_download_session _old_create_upload_session = VolumeTunnel.create_upload_session - VolumeTunnel.create_download_session = wrap_fun( - _old_create_download_session - ) + VolumeTunnel.create_download_session = wrap_fun(_old_create_download_session) VolumeTunnel.create_upload_session = wrap_fun(_old_create_upload_session) test_funcs = namedtuple( @@ -87,9 +85,14 @@ def wrapped(self, *args, **kwargs): def test_text_upload_download(odps, setup): - text_content = 'Life is short, \r\n Java is tedious. \n\n\r\nI use PyODPS. \n\n' + text_content = "Life is short, \r\n Java is tedious. \n\n\r\nI use PyODPS. \n\n" expect_lines = [ - 'Life is short, \n', ' Java is tedious. \n', '\n', '\n', 'I use PyODPS. \n', '\n' + "Life is short, \n", + " Java is tedious. \n", + "\n", + "\n", + "I use PyODPS. \n", + "\n", ] partition = setup.get_test_partition() @@ -114,6 +117,7 @@ def test_raw_upload_download_greenlet(odps, setup): def test_raw_upload_download_thread(odps, setup): from .. import io + io._FORCE_THREAD = True block = setup.gen_byte_block() @@ -137,9 +141,7 @@ def test_z_lib_upload_download(odps, setup): with partition.open_writer(compress_option=comp_option) as writer: writer.write(TEST_FILE_NAME, block, compress=True) - with partition.open_reader( - TEST_FILE_NAME, compress_option=comp_option - ) as reader: + with partition.open_reader(TEST_FILE_NAME, compress_option=comp_option) as reader: assert reader.read() == block diff --git a/odps/tunnel/volumetunnel.py b/odps/tunnel/volumetunnel.py index a482118b..5f435059 100644 --- a/odps/tunnel/volumetunnel.py +++ b/odps/tunnel/volumetunnel.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,19 +15,19 @@ # limitations under the License. import logging -import sys import struct +import sys from requests.exceptions import StreamConsumedError +from .. import options, serializers +from ..compat import Enum, irange, six +from ..models import errors +from ..utils import to_binary, to_text from . import io from .base import BaseTunnel from .checksum import Checksum from .errors import TunnelError -from .. 
import serializers, options -from ..models import errors -from ..compat import irange, Enum, six -from ..utils import to_binary, to_text logger = logging.getLogger(__name__) @@ -35,42 +35,83 @@ MIN_CHUNK_SIZE = 1 CHECKSUM_SIZE = 4 -CHECKSUM_PACKER = '>i' if six.PY2 else '>I' +CHECKSUM_PACKER = ">i" if six.PY2 else ">I" class VolumeTunnel(BaseTunnel): - def create_download_session(self, volume, partition_spec, file_name, download_id=None, compress_option=None, - compress_algo=None, compress_level=None, compress_strategy=None): + def create_download_session( + self, + volume, + partition_spec, + file_name, + download_id=None, + compress_option=None, + compress_algo=None, + compress_level=None, + compress_strategy=None, + ): if not isinstance(volume, six.string_types): volume = volume.name volume = self._project.volumes[volume] if compress_option is None and compress_algo is not None: compress_option = io.CompressOption( - compress_algo=compress_algo, level=compress_level, strategy=compress_strategy) + compress_algo=compress_algo, + level=compress_level, + strategy=compress_strategy, + ) return VolumeDownloadSession( - self.tunnel_rest, volume, partition_spec, file_name, download_id=download_id, - compress_option=compress_option, quota_name=self._quota_name + self.tunnel_rest, + volume, + partition_spec, + file_name, + download_id=download_id, + compress_option=compress_option, + quota_name=self._quota_name, ) - def create_upload_session(self, volume, partition_spec, upload_id=None, compress_option=None, - compress_algo=None, compress_level=None, compress_strategy=None): + def create_upload_session( + self, + volume, + partition_spec, + upload_id=None, + compress_option=None, + compress_algo=None, + compress_level=None, + compress_strategy=None, + ): if not isinstance(volume, six.string_types): volume = volume.name volume = self._project.volumes[volume] if compress_option is None and compress_algo is not None: compress_option = io.CompressOption( - compress_algo=compress_algo, level=compress_level, strategy=compress_strategy) + compress_algo=compress_algo, + level=compress_level, + strategy=compress_strategy, + ) return VolumeUploadSession( - self.tunnel_rest, volume, partition_spec, upload_id=upload_id, - compress_option=compress_option, quota_name=self._quota_name + self.tunnel_rest, + volume, + partition_spec, + upload_id=upload_id, + compress_option=compress_option, + quota_name=self._quota_name, ) class VolumeFSTunnel(BaseTunnel): - def open_reader(self, volume, path, start=None, length=None, compress_option=None, compress_algo=None, - compress_level=None, compress_strategy=None): + def open_reader( + self, + volume, + path, + start=None, + length=None, + compress_option=None, + compress_algo=None, + compress_level=None, + compress_strategy=None, + ): if not isinstance(volume, six.string_types): volume = volume.name volume = self._project.volumes[volume] @@ -81,16 +122,25 @@ def open_reader(self, volume, path, start=None, length=None, compress_option=Non file_obj = volume[path] length = file_obj.length - headers = { - 'Range': 'bytes={0}-{1}'.format(start, start + length - 1), - 'x-odps-volume-fs-path': '/' + volume.name + '/' + path.lstrip('/'), - } + headers = VolumeDownloadSession.get_common_headers() + headers.update( + { + "Range": "bytes={0}-{1}".format(start, start + length - 1), + "x-odps-volume-fs-path": "/" + volume.name + "/" + path.lstrip("/"), + } + ) if compress_option is not None: - if compress_option.algorithm == io.CompressOption.CompressAlgorithm.ODPS_ZLIB: - 
headers['Accept-Encoding'] = 'deflate' - elif compress_option.algorithm != io.CompressOption.CompressAlgorithm.ODPS_RAW: - raise TunnelError('invalid compression option') + if ( + compress_option.algorithm + == io.CompressOption.CompressAlgorithm.ODPS_ZLIB + ): + headers["Accept-Encoding"] = "deflate" + elif ( + compress_option.algorithm + != io.CompressOption.CompressAlgorithm.ODPS_RAW + ): + raise TunnelError("invalid compression option") url = volume.resource(client=self.tunnel_rest) resp = self.tunnel_rest.get(url, headers=headers, stream=True) @@ -100,9 +150,12 @@ def open_reader(self, volume, path, start=None, length=None, compress_option=Non if compress_option is None and compress_algo is not None: compress_option = io.CompressOption( - compress_algo=compress_algo, level=compress_level, strategy=compress_strategy) + compress_algo=compress_algo, + level=compress_level, + strategy=compress_strategy, + ) - content_encoding = resp.headers.get('Content-Encoding') + content_encoding = resp.headers.get("Content-Encoding") if content_encoding is not None: compress = True else: @@ -111,63 +164,114 @@ def open_reader(self, volume, path, start=None, length=None, compress_option=Non option = compress_option if compress else None return VolumeReader(self.tunnel_rest, resp, option) - def open_writer(self, volume, path, replication=None, compress_option=None, compress_algo=None, - compress_level=None, compress_strategy=None): + def open_writer( + self, + volume, + path, + replication=None, + compress_option=None, + compress_algo=None, + compress_level=None, + compress_strategy=None, + ): if not isinstance(volume, six.string_types): volume = volume.name volume = self._project.volumes[volume] - headers = { - 'Content-Type': 'application/octet-stream', - 'Transfer-Encoding': 'chunked', - 'x-odps-volume-fs-path': '/' + volume.name + '/' + path.lstrip('/'), - } + headers = VolumeUploadSession.get_common_headers() + headers.update( + { + "Content-Type": "application/octet-stream", + "Transfer-Encoding": "chunked", + "x-odps-volume-fs-path": "/" + volume.name + "/" + path.lstrip("/"), + } + ) params = {} if compress_option is None and compress_algo is not None: compress_option = io.CompressOption( - compress_algo=compress_algo, level=compress_level, strategy=compress_strategy) + compress_algo=compress_algo, + level=compress_level, + strategy=compress_strategy, + ) if compress_option is not None: - if compress_option.algorithm == io.CompressOption.CompressAlgorithm.ODPS_ZLIB: - headers['Content-Encoding'] = 'deflate' - elif compress_option.algorithm != io.CompressOption.CompressAlgorithm.ODPS_RAW: - raise TunnelError('invalid compression option') + if ( + compress_option.algorithm + == io.CompressOption.CompressAlgorithm.ODPS_ZLIB + ): + headers["Content-Encoding"] = "deflate" + elif ( + compress_option.algorithm + != io.CompressOption.CompressAlgorithm.ODPS_RAW + ): + raise TunnelError("invalid compression option") if replication: - params['replication'] = replication + params["replication"] = replication url = volume.resource(client=self.tunnel_rest) - chunk_upload = lambda data: self.tunnel_rest.post(url, data=data, params=params, headers=headers) + chunk_upload = lambda data: self.tunnel_rest.post( + url, data=data, params=params, headers=headers + ) if compress_option is None and compress_algo is not None: compress_option = io.CompressOption( - compress_algo=compress_algo, level=compress_level, strategy=compress_strategy) - return VolumeFSWriter(self.tunnel_rest, chunk_upload, volume, path, 
compress_option) + compress_algo=compress_algo, + level=compress_level, + strategy=compress_strategy, + ) + return VolumeFSWriter( + self.tunnel_rest, chunk_upload, volume, path, compress_option + ) -class VolumeDownloadSession(serializers.JSONSerializableModel): +class BaseVolumeTunnelSession(serializers.JSONSerializableModel): + @staticmethod + def get_common_headers(content_length=None, tags=None): + header = {} + if content_length is not None: + header["Content-Length"] = content_length + tags = tags or options.tunnel.tags + if tags: + if isinstance(tags, six.string_types): + tags = tags.split(",") + header["odps-tunnel-tags"] = ",".join(tags) + return header + + +class VolumeDownloadSession(BaseVolumeTunnelSession): __slots__ = ( - '_client', 'project_name', '_compress_option', '_quota_name', + "_client", + "project_name", + "_compress_option", + "_quota_name", ) class Status(Enum): - UNKNOWN = 'UNKNOWN' - NORMAL = 'NORMAL' - CLOSED = 'CLOSED' - EXPIRED = 'EXPIRED' + UNKNOWN = "UNKNOWN" + NORMAL = "NORMAL" + CLOSED = "CLOSED" + EXPIRED = "EXPIRED" - id = serializers.JSONNodeField('DownloadID') + id = serializers.JSONNodeField("DownloadID") status = serializers.JSONNodeField( - 'Status', parse_callback=lambda v: VolumeDownloadSession.Status(v.upper()) + "Status", parse_callback=lambda v: VolumeDownloadSession.Status(v.upper()) ) - file_name = serializers.JSONNodeField('File', 'FileName') - file_length = serializers.JSONNodeField('File', 'FileLength') - volume_name = serializers.JSONNodeField('Partition', 'Volume') - partition_spec = serializers.JSONNodeField('Partition', 'Partition') + file_name = serializers.JSONNodeField("File", "FileName") + file_length = serializers.JSONNodeField("File", "FileLength") + volume_name = serializers.JSONNodeField("Partition", "Volume") + partition_spec = serializers.JSONNodeField("Partition", "Partition") def __init__( - self, client, volume, partition_spec, file_name=None, download_id=None, - compress_option=None, quota_name=None + self, + client, + volume, + partition_spec, + file_name=None, + download_id=None, + compress_option=None, + quota_name=None, + tags=None, ): super(VolumeDownloadSession, self).__init__() @@ -180,10 +284,10 @@ def __init__( self.file_name = file_name if download_id is None: - self._init() + self._init(tags=tags) else: self.id = download_id - self.reload() + self.reload(tags=tags) logger.info("Tunnel session created: %r", self) if options.tunnel_session_create_callback: @@ -196,16 +300,26 @@ def __repr__(self): ) def resource(self, client=None, endpoint=None): - endpoint = endpoint if endpoint is not None else (client or self._client).endpoint - return endpoint + '/projects/%s/tunnel/downloads' % self.project_name - - def _init(self): - headers = {'Content-Length': '0'} - params = dict(type='volumefile', target='/'.join( - [self.project_name, self.volume_name, self.partition_spec, self.file_name] - )) + endpoint = ( + endpoint if endpoint is not None else (client or self._client).endpoint + ) + return endpoint + "/projects/%s/tunnel/downloads" % self.project_name + + def _init(self, tags=None): + headers = self.get_common_headers(content_length=0, tags=tags) + params = dict( + type="volumefile", + target="/".join( + [ + self.project_name, + self.volume_name, + self.partition_spec, + self.file_name, + ] + ), + ) if self._quota_name is not None: - params['quotaName'] = self._quota_name + params["quotaName"] = self._quota_name url = self.resource() resp = self._client.post(url, {}, params=params, headers=headers) @@ -215,16 
+329,16 @@ def _init(self): e = TunnelError.parse(resp) raise e - def reload(self): - headers = {'Content-Length': '0'} + def reload(self, tags=None): + headers = self.get_common_headers(content_length=0, tags=tags) params = {} if self.partition_spec is not None and len(self.partition_spec) > 0: - params['partition'] = self.partition_spec + params["partition"] = self.partition_spec if self._quota_name is not None: - params['quotaName'] = self._quota_name + params["quotaName"] = self._quota_name - url = self.resource() + '/' + str(self.id) + url = self.resource() + "/" + str(self.id) resp = self._client.get(url, params=params, headers=headers) if self._client.is_ok(resp): self.parse(resp, obj=self) @@ -237,30 +351,33 @@ def open(self, start=0, length=sys.maxsize): params = {} - headers = {'Content-Length': 0, 'x-odps-tunnel-version': 4} + headers = {"Content-Length": 0, "x-odps-tunnel-version": 4} if compress_option.algorithm == io.CompressOption.CompressAlgorithm.ODPS_ZLIB: - headers['Accept-Encoding'] = 'deflate' + headers["Accept-Encoding"] = "deflate" elif compress_option.algorithm != io.CompressOption.CompressAlgorithm.ODPS_RAW: - raise TunnelError('invalid compression option') + raise TunnelError("invalid compression option") - params['data'] = '' - params['range'] = '(%s,%s)' % (start, length) + params["data"] = "" + params["range"] = "(%s,%s)" % (start, length) if self._quota_name is not None: - params['quotaName'] = self._quota_name + params["quotaName"] = self._quota_name url = self.resource() - resp = self._client.get(url + '/' + self.id, params=params, headers=headers, stream=True) + resp = self._client.get( + url + "/" + self.id, params=params, headers=headers, stream=True + ) if not self._client.is_ok(resp): e = TunnelError.parse(resp) raise e - content_encoding = resp.headers.get('Content-Encoding') + content_encoding = resp.headers.get("Content-Encoding") if content_encoding is not None: - if content_encoding == 'deflate': + if content_encoding == "deflate": self._compress_option = io.CompressOption( - io.CompressOption.CompressAlgorithm.ODPS_ZLIB, -1, 0) + io.CompressOption.CompressAlgorithm.ODPS_ZLIB, -1, 0 + ) else: - raise TunnelError('Invalid content encoding') + raise TunnelError("Invalid content encoding") compress = True else: compress = False @@ -274,7 +391,7 @@ def __init__(self, client, response, compress_option): self._client = client self._response = io.RequestsInputStream(response) self._compress_option = compress_option - self._crc = Checksum(method='crc32') + self._crc = Checksum(method="crc32") self._buffer_size = 0 self._initialized = False self._last_line_ending = None @@ -287,17 +404,20 @@ def __init__(self, client, response, compress_option): # left part of checksum block when chunked, see _read_buf() self._chunk_left = None - def _raw_read(self, l): - return self._response.read(l) + def _raw_read(self, size): + return self._response.read(size) def _init_buf(self): size_buf = self._raw_read(4) if not size_buf: - raise IOError('Tunnel reader breaks unexpectedly.') + raise IOError("Tunnel reader breaks unexpectedly.") self._crc.update(size_buf) - chunk_size = struct.unpack('>I', size_buf)[0] + chunk_size = struct.unpack(">I", size_buf)[0] if chunk_size > MAX_CHUNK_SIZE or chunk_size < MIN_CHUNK_SIZE: - raise IOError("ChunkSize should be in [%d, %d], now is %d." % (MIN_CHUNK_SIZE, MAX_CHUNK_SIZE, chunk_size)) + raise IOError( + "ChunkSize should be in [%d, %d], now is %d." 
+ % (MIN_CHUNK_SIZE, MAX_CHUNK_SIZE, chunk_size) + ) self._buffer_size = CHECKSUM_SIZE + chunk_size def _read_buf(self): @@ -328,17 +448,20 @@ def _read_buf(self): buf = data_buffer.getvalue() else: buf_all = data_buffer.getvalue() - buf, self._chunk_left = buf_all[:self._buffer_size], buf_all[self._buffer_size:] + buf, self._chunk_left = ( + buf_all[: self._buffer_size], + buf_all[self._buffer_size :], + ) if len(buf) >= CHECKSUM_SIZE: self._data_size = len(buf) - CHECKSUM_SIZE - self._crc.update(buf[:self._data_size]) + self._crc.update(buf[: self._data_size]) checksum = struct.unpack_from(CHECKSUM_PACKER, buf, self._data_size)[0] if checksum != self._crc.getvalue(): - raise IOError('CRC check error in VolumeReader.') + raise IOError("CRC check error in VolumeReader.") else: - raise IOError('Invalid VolumeReader.') - return bytearray(buf[:self._data_size]) + raise IOError("Invalid VolumeReader.") + return bytearray(buf[: self._data_size]) def read(self, size=None, break_line=False): if size is None: @@ -358,27 +481,29 @@ def read(self, size=None, break_line=False): if self._left_part: if break_line: # deal with Windows line endings - if self._left_part[self._left_part_pos] == ord('\n') and self._last_line_ending == ord('\r'): + if self._left_part[self._left_part_pos] == ord( + "\n" + ) and self._last_line_ending == ord("\r"): self._last_line_ending = None self._left_part_pos += 1 for idx in irange(self._left_part_pos, len(self._left_part)): - if self._left_part[idx] not in (ord('\r'), ord('\n')): + if self._left_part[idx] not in (ord("\r"), ord("\n")): continue self._last_line_ending = self._left_part[idx] - self._left_part[idx] = ord('\n') - ret = self._left_part[self._left_part_pos:idx + 1] + self._left_part[idx] = ord("\n") + ret = self._left_part[self._left_part_pos : idx + 1] self._left_part_pos = idx + 1 if self._left_part_pos == len(self._left_part): self._left_part = None self._left_part_pos = 0 return bytes(ret) if len(self._left_part) - self._left_part_pos >= size: - ret = self._left_part[self._left_part_pos:self._left_part_pos + size] + ret = self._left_part[self._left_part_pos : self._left_part_pos + size] self._left_part_pos += size return bytes(ret) else: - out_buf.write(bytes(self._left_part[self._left_part_pos:])) + out_buf.write(bytes(self._left_part[self._left_part_pos :])) self._left_part = None self._left_part_pos = 0 has_stuff = True @@ -391,42 +516,42 @@ def read(self, size=None, break_line=False): has_stuff = True start_pos = 0 if break_line: - if buf[0] == ord('\n') and self._last_line_ending == ord('\r'): + if buf[0] == ord("\n") and self._last_line_ending == ord("\r"): start_pos = 1 for idx in irange(start_pos, len(buf)): - if buf[idx] not in (ord('\r'), ord('\n')): + if buf[idx] not in (ord("\r"), ord("\n")): continue self._last_line_ending = buf[idx] - buf[idx] = ord('\n') - out_buf.write(bytes(buf[start_pos:idx + 1])) + buf[idx] = ord("\n") + out_buf.write(bytes(buf[start_pos : idx + 1])) if idx + 1 < len(buf): - self._left_part = buf[idx + 1:] + self._left_part = buf[idx + 1 :] self._left_part_pos = 0 return out_buf.getvalue() if len(buf) >= length_left: - out_buf.write(bytes(buf[start_pos:start_pos + length_left])) + out_buf.write(bytes(buf[start_pos : start_pos + length_left])) if len(buf) > length_left: - self._left_part = buf[start_pos + length_left:] + self._left_part = buf[start_pos + length_left :] self._left_part_pos = 0 length_left = 0 else: - out_buf.write(bytes(buf[start_pos:start_pos + self._data_size])) + out_buf.write(bytes(buf[start_pos : 
start_pos + self._data_size])) length_left -= self._data_size return out_buf.getvalue() if has_stuff else None - def _it(self, size=sys.maxsize, encoding='utf-8'): + def _it(self, size=sys.maxsize, encoding="utf-8"): while True: line = self.readline(size, encoding=encoding) if line is None: break yield line - def readline(self, size=sys.maxsize, encoding='utf-8'): + def readline(self, size=sys.maxsize, encoding="utf-8"): line = self.read(size, break_line=True) return to_text(line, encoding=encoding) - def readlines(self, size=sys.maxsize, encoding='utf-8'): + def readlines(self, size=sys.maxsize, encoding="utf-8"): return [line for line in self._it(size, encoding=encoding)] def __iter__(self): @@ -439,33 +564,44 @@ def __exit__(self, exc_type, exc_val, exc_tb): pass -class VolumeUploadSession(serializers.JSONSerializableModel): +class VolumeUploadSession(BaseVolumeTunnelSession): __slots__ = ( - '_client', '_compress_option', 'project_name', 'volume_name', - 'partition_spec', '_quota_name', + "_client", + "_compress_option", + "project_name", + "volume_name", + "partition_spec", + "_quota_name", ) class Status(Enum): - UNKNOWN = 'UNKNOWN' - NORMAL = 'NORMAL' - CLOSING = 'CLOSING' - CLOSED = 'CLOSED' - CANCELED = 'CANCELED' - EXPIRED = 'EXPIRED' - CRITICAL = 'CRITICAL' + UNKNOWN = "UNKNOWN" + NORMAL = "NORMAL" + CLOSING = "CLOSING" + CLOSED = "CLOSED" + CANCELED = "CANCELED" + EXPIRED = "EXPIRED" + CRITICAL = "CRITICAL" class UploadFile(serializers.JSONSerializableModel): - file_name = serializers.JSONNodeField('FileName') - file_length = serializers.JSONNodeField('FileLength') + file_name = serializers.JSONNodeField("FileName") + file_length = serializers.JSONNodeField("FileLength") - id = serializers.JSONNodeField('UploadID') - status = serializers.JSONNodeField('Status', - parse_callback=lambda v: VolumeUploadSession.Status(v.upper())) - file_list = serializers.JSONNodesReferencesField(UploadFile, 'FileList') + id = serializers.JSONNodeField("UploadID") + status = serializers.JSONNodeField( + "Status", parse_callback=lambda v: VolumeUploadSession.Status(v.upper()) + ) + file_list = serializers.JSONNodesReferencesField(UploadFile, "FileList") def __init__( - self, client, volume, partition_spec, upload_id=None, - compress_option=None, quota_name=None + self, + client, + volume, + partition_spec, + upload_id=None, + compress_option=None, + quota_name=None, + tags=None, ): super(VolumeUploadSession, self).__init__() @@ -477,10 +613,10 @@ def __init__( self.partition_spec = partition_spec if upload_id is None: - self._init() + self._init(tags=tags) else: self.id = upload_id - self.reload() + self.reload(tags=tags) self._compress_option = compress_option logger.info("Tunnel session created: %r", self) @@ -494,17 +630,20 @@ def __repr__(self): ) def resource(self, client=None, endpoint=None): - endpoint = endpoint if endpoint is not None else (client or self._client).endpoint - return endpoint + '/projects/%s/tunnel/uploads' % self.project_name + endpoint = ( + endpoint if endpoint is not None else (client or self._client).endpoint + ) + return endpoint + "/projects/%s/tunnel/uploads" % self.project_name - def _init(self): - headers = {'Content-Length': '0'} + def _init(self, tags=None): + headers = self.get_common_headers(content_length=0, tags=tags) params = dict( - type='volumefile', - target='/'.join([self.project_name, self.volume_name, self.partition_spec]) + '/' + type="volumefile", + target="/".join([self.project_name, self.volume_name, self.partition_spec]) + + "/", ) if 
self._quota_name is not None: - params['quotaName'] = self._quota_name + params["quotaName"] = self._quota_name url = self.resource() resp = self._client.post(url, {}, params=params, headers=headers) @@ -514,11 +653,11 @@ def _init(self): e = TunnelError.parse(resp) raise e - def reload(self): - headers = {'Content-Length': '0'} + def reload(self, tags=None): + headers = self.get_common_headers(content_length=0, tags=tags) params = {} - url = self.resource() + '/' + str(self.id) + url = self.resource() + "/" + str(self.id) resp = self._client.get(url, params=params, headers=headers) if self._client.is_ok(resp): self.parse(resp, obj=self) @@ -529,11 +668,13 @@ def reload(self): @staticmethod def _format_file_name(file_name): buf = six.StringIO() - if file_name and file_name[0] == '/': - raise TunnelError("FileName cannot start with '/', file name is " + file_name) + if file_name and file_name[0] == "/": + raise TunnelError( + "FileName cannot start with '/', file name is " + file_name + ) pre_slash = False for ch in file_name: - if ch == '/': + if ch == "/": if not pre_slash: buf.write(ch) pre_slash = True @@ -544,52 +685,70 @@ def _format_file_name(file_name): def open(self, file_name, compress=False, append=False): compress_option = self._compress_option or io.CompressOption() - headers = {'Content-Type': 'test/plain', 'Transfer-Encoding': 'chunked', 'x-odps-tunnel-version': 4} + headers = self.get_common_headers() + headers.update( + { + "Content-Type": "test/plain", + "Transfer-Encoding": "chunked", + "x-odps-tunnel-version": 4, + } + ) params = {} if compress: - if compress_option.algorithm == io.CompressOption.CompressAlgorithm.ODPS_ZLIB: - headers['Content-Encoding'] = 'deflate' - elif compress_option.algorithm != io.CompressOption.CompressAlgorithm.ODPS_RAW: - raise TunnelError('invalid compression option') + if ( + compress_option.algorithm + == io.CompressOption.CompressAlgorithm.ODPS_ZLIB + ): + headers["Content-Encoding"] = "deflate" + elif ( + compress_option.algorithm + != io.CompressOption.CompressAlgorithm.ODPS_RAW + ): + raise TunnelError("invalid compression option") file_name = self._format_file_name(file_name) - params['blockid'] = file_name + params["blockid"] = file_name if append: - params['resume'] = '' + params["resume"] = "" if self._quota_name is not None: - params['quotaName'] = self._quota_name + params["quotaName"] = self._quota_name - url = self.resource() + '/' + self.id + url = self.resource() + "/" + self.id - chunk_uploader = lambda data: self._client.post(url, data=data, params=params, headers=headers) + chunk_uploader = lambda data: self._client.post( + url, data=data, params=params, headers=headers + ) option = compress_option if compress else None return VolumeWriter(self._client, chunk_uploader, option) def commit(self, files): if not files: - raise ValueError('`files` not supplied') + raise ValueError("`files` not supplied") if isinstance(files, six.string_types): - files = [files, ] + files = [files] formatted = [self._format_file_name(fn) for fn in files] self.reload() files_uploading = set(f.file_name for f in self.file_list) if len(files_uploading) != len(formatted): - raise TunnelError("File number not match, server: %d, client: %d" % (len(files_uploading), len(formatted))) + raise TunnelError( + "File number not match, server: %d, client: %d" + % (len(files_uploading), len(formatted)) + ) for fn in (fn for fn in formatted if fn not in files_uploading): raise TunnelError("File not exits on server, file name is " + fn) self._complete_upload() def 
_complete_upload(self): - headers = {'Content-Length': '0'} + headers = self.get_common_headers(content_length=0) params = {} if self._quota_name is not None: - params['quotaName'] = self._quota_name + params["quotaName"] = self._quota_name - url = self.resource() + '/' + self.id + url = self.resource() + "/" + self.id resp = self._client.put(url, {}, params=params, headers=headers) if self._client.is_ok(resp): self.parse(resp, obj=self) @@ -608,29 +767,27 @@ def __init__(self, client, uploader, compress_option): if compress_option is None: self._writer = self._req_io - elif compress_option.algorithm == \ - io.CompressOption.CompressAlgorithm.ODPS_RAW: + elif compress_option.algorithm == io.CompressOption.CompressAlgorithm.ODPS_RAW: self._writer = self._req_io - elif compress_option.algorithm == \ - io.CompressOption.CompressAlgorithm.ODPS_ZLIB: + elif compress_option.algorithm == io.CompressOption.CompressAlgorithm.ODPS_ZLIB: self._writer = io.DeflateOutputStream(self._req_io) else: - raise errors.InvalidArgument('Invalid compression algorithm.') + raise errors.InvalidArgument("Invalid compression algorithm.") - self._crc = Checksum(method='crc32') + self._crc = Checksum(method="crc32") self._initialized = False self._chunk_offset = 0 def _init_writer(self): - chunk_bytes = struct.pack('>I', self.CHUNK_SIZE) + chunk_bytes = struct.pack(">I", self.CHUNK_SIZE) self._writer.write(chunk_bytes) self._crc.update(chunk_bytes) self._chunk_offset = 0 - def write(self, buf, encoding='utf-8'): + def write(self, buf, encoding="utf-8"): buf = to_binary(buf, encoding=encoding) if isinstance(buf, six.integer_types): - buf = bytes(bytearray([buf, ])) + buf = bytes(bytearray([buf])) elif isinstance(buf, six.BytesIO): buf = buf.getvalue() if not self._initialized: @@ -639,7 +796,7 @@ def write(self, buf, encoding='utf-8'): self._req_io.start() if not buf: - raise IOError('Invalid data buffer!') + raise IOError("Invalid data buffer!") processed = 0 while processed < len(buf): if self._chunk_offset == self.CHUNK_SIZE: @@ -647,9 +804,12 @@ def write(self, buf, encoding='utf-8'): self._writer.write(struct.pack(CHECKSUM_PACKER, checksum)) self._chunk_offset = 0 else: - size = self.CHUNK_SIZE - self._chunk_offset if len(buf) - processed > self.CHUNK_SIZE - self._chunk_offset\ + size = ( + self.CHUNK_SIZE - self._chunk_offset + if len(buf) - processed > self.CHUNK_SIZE - self._chunk_offset else len(buf) - processed - write_chunk = buf[processed:processed + size] + ) + write_chunk = buf[processed : processed + size] self._writer.write(write_chunk) self._crc.update(write_chunk) processed += size @@ -666,7 +826,7 @@ def close(self): self._writer.flush() result = self._req_io.finish() if result is None: - raise TunnelError('No results returned in VolumeWriter.') + raise TunnelError("No results returned in VolumeWriter.") if not self._client.is_ok(result): e = TunnelError.parse(result) raise e @@ -690,13 +850,23 @@ def __init__(self, client, uploader, volume, path, compress_option): def close(self): result = super(VolumeFSWriter, self).close() - if 'x-odps-volume-sessionid' not in result.headers: - raise TunnelError('No session id returned in response.') - headers = { - 'x-odps-volume-fs-path': '/' + self._volume.name + '/' + self._path.lstrip('/'), - 'x-odps-volume-sessionid': result.headers.get('x-odps-volume-sessionid'), - } - commit_result = self._client.put(self._volume.resource(client=self._client), None, headers=headers) + if "x-odps-volume-sessionid" not in result.headers: + raise TunnelError("No session id 
returned in response.") + headers = VolumeUploadSession.get_common_headers() + headers.update( + { + "x-odps-volume-fs-path": "/" + + self._volume.name + + "/" + + self._path.lstrip("/"), + "x-odps-volume-sessionid": result.headers.get( + "x-odps-volume-sessionid" + ), + } + ) + commit_result = self._client.put( + self._volume.resource(client=self._client), None, headers=headers + ) if not self._client.is_ok(commit_result): e = TunnelError.parse(commit_result) raise e diff --git a/odps/types.py b/odps/types.py index 58dfde1a..35208bbd 100644 --- a/odps/types.py +++ b/odps/types.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,15 +18,20 @@ import json as _json import warnings from collections import OrderedDict -from datetime import datetime as _datetime, timedelta as _timedelta, date as _date +from datetime import date as _date +from datetime import datetime as _datetime +from datetime import timedelta as _timedelta from . import compat, utils -from .compat import six, DECIMAL_TYPES, decimal as _decimal, east_asian_len, Monthdelta +from .compat import DECIMAL_TYPES, Monthdelta +from .compat import decimal as _decimal +from .compat import east_asian_len, six from .config import options from .lib.xnamedtuple import xnamedtuple try: from pandas import NA as _pd_na + pd_na_type = type(_pd_na) except (ImportError, ValueError): pd_na_type = None @@ -34,6 +39,8 @@ force_py = options.force_py force_c = options.force_c +_date_allow_int_conversion = False + class Column(object): def __init__( @@ -51,7 +58,7 @@ def __init__( def __repr__(self): not_null_str = ", not null" if not self.nullable else "" - return ''.format( + return "".format( utils.to_str(self.name), self.type.name.lower(), not_null_str ) @@ -61,11 +68,14 @@ def __hash__(self): class Partition(Column): def __repr__(self): - return ''.format(utils.to_str(self.name), self.type.name.lower()) + return "".format( + utils.to_str(self.name), self.type.name.lower() + ) class _CallableList(list): """Make sure keys and values properties also callable""" + def __call__(self): return self @@ -79,22 +89,22 @@ def __init__(self, spec=None): elif isinstance(spec, dict): self.kv = OrderedDict(spec) elif isinstance(spec, six.string_types): - splits = spec.split(',') + splits = spec.split(",") for sp in splits: - kv = sp.split('=') + kv = sp.split("=") if len(kv) != 2: raise ValueError( - 'Invalid partition spec: a partition spec should ' + "Invalid partition spec: a partition spec should " 'look like "part1=v1,part2=v2"' ) k, v = kv[0].strip(), kv[1].strip().strip('\'"') if len(k) == 0 or len(v) == 0: - raise ValueError('Invalid partition spec') + raise ValueError("Invalid partition spec") if k in self.kv: raise ValueError( - 'Invalid partition spec: found duplicate partition key ' + k + "Invalid partition spec: found duplicate partition key " + k ) self.kv[k] = v @@ -130,10 +140,10 @@ def __contains__(self, key): return key in self.kv def __str__(self): - return ','.join("%s='%s'" % (k, v) for k, v in six.iteritems(self.kv)) + return ",".join("%s='%s'" % (k, v) for k, v in six.iteritems(self.kv)) def __repr__(self): - return '' % str(self) + return "" % str(self) def __hash__(self): return hash(str(self)) @@ -159,7 +169,7 @@ def _init(self, names, types): if len(self._name_indexes) < 
len(self.names): duplicates = [n for n in self._name_indexes if self.names.count(n) > 1] - raise ValueError('Duplicate column names: %s' % ', '.join(duplicates)) + raise ValueError("Duplicate column names: %s" % ", ".join(duplicates)) self._snapshot = None @@ -177,9 +187,9 @@ def _repr(self): names = [self._to_printable(n) for n in self.names] space = 2 * max(len(it) for it in names) for name, tp in zip(names, self.types): - buf.write('\n{0}{1}'.format(name.ljust(space), repr(tp))) + buf.write("\n{0}{1}".format(name.ljust(space), repr(tp))) - return 'Schema {{{0}\n}}'.format(utils.indent(buf.getvalue(), 2)) + return "Schema {{{0}\n}}".format(utils.indent(buf.getvalue(), 2)) def __hash__(self): return hash((type(self), tuple(self.names), tuple(self.types))) @@ -193,8 +203,8 @@ def get_type(self, name): return self.types[self._name_indexes[utils.to_str(name)]] def append(self, name, typo): - names = self.names + [name, ] - types = self.types + [validate_data_type(typo), ] + names = self.names + [name] + types = self.types + [validate_data_type(typo)] return Schema(names, types) def extend(self, schema): @@ -210,13 +220,15 @@ def __init__(self, columns=None, partitions=None): if self._columns: super(OdpsSchema, self).__init__( - *compat.lzip(*[(c.name, c.type) for c in self._columns])) + *compat.lzip(*[(c.name, c.type) for c in self._columns]) + ) else: super(OdpsSchema, self).__init__([], []) if self._partitions: self._partition_schema = Schema( - *compat.lzip(*[(c.name, c.type) for c in self._partitions])) + *compat.lzip(*[(c.name, c.type) for c in self._partitions]) + ) else: self._partition_schema = Schema([], []) @@ -224,30 +236,40 @@ def __len__(self): return super(OdpsSchema, self).__len__() + len(self._partition_schema) def __setattr__(self, key, value): - if key == '_columns' and value and not getattr(self, 'names', None) and \ - not getattr(self, 'types', None): + if ( + key == "_columns" + and value + and not getattr(self, "names", None) + and not getattr(self, "types", None) + ): names = [c.name for c in value] types = [c.type for c in value] self._init(names, types) - elif key == '_partitions' and value: + elif key == "_partitions" and value: self._partition_schema = Schema( - *compat.lzip(*[(c.name, c.type) for c in value])) + *compat.lzip(*[(c.name, c.type) for c in value]) + ) object.__setattr__(self, key, value) def __contains__(self, name): - return super(OdpsSchema, self).__contains__(name) or \ - utils.to_str(name) in self._partition_schema + return ( + super(OdpsSchema, self).__contains__(name) + or utils.to_str(name) in self._partition_schema + ) def __eq__(self, other): if not isinstance(other, OdpsSchema): return False - return super(OdpsSchema, self).__eq__(other) and \ - self._partition_schema == other._partition_schema + return ( + super(OdpsSchema, self).__eq__(other) + and self._partition_schema == other._partition_schema + ) def __hash__(self): - return hash((type(self), tuple(self.names), tuple(self.types), - self._partition_schema)) + return hash( + (type(self), tuple(self.names), tuple(self.types), self._partition_schema) + ) def __getitem__(self, item): if isinstance(item, six.integer_types): @@ -255,9 +277,9 @@ def __getitem__(self, item): if item < n_columns: return self._columns[item] elif item < len(self): - return self._partitions[item-n_columns] + return self._partitions[item - n_columns] else: - raise IndexError('Index out of range') + raise IndexError("Index out of range") elif isinstance(item, six.string_types): item = utils.to_str(item) if item in 
self._name_indexes: @@ -266,9 +288,9 @@ def __getitem__(self, item): elif item in self._partition_schema: idx = self._partition_schema._name_indexes[item] n_columns = len(self._name_indexes) - return self[n_columns+idx] + return self[n_columns + idx] else: - raise ValueError('Unknown column name: %s' % item) + raise ValueError("Unknown column name: %s" % item) elif isinstance(item, (list, tuple)): return [self[it] for it in item] else: @@ -277,43 +299,57 @@ def __getitem__(self, item): def _repr(self): buf = six.StringIO() - name_dict = dict([(col.name, utils.str_to_printable(col.name)) for col in self.columns]) - name_display_lens = dict([(k, east_asian_len(utils.to_text(v), encoding=options.display.encoding)) - for k, v in six.iteritems(name_dict)]) + name_dict = dict( + [(col.name, utils.str_to_printable(col.name)) for col in self.columns] + ) + name_display_lens = dict( + [ + (k, east_asian_len(utils.to_text(v), encoding=options.display.encoding)) + for k, v in six.iteritems(name_dict) + ] + ) name_space = 2 * max(six.itervalues(name_display_lens)) type_space = 2 * max(len(repr(col.type)) for col in self.columns) has_not_null = any(not col.nullable for col in self.columns) not_empty = lambda field: field is not None and len(field.strip()) > 0 - buf.write('odps.Schema {\n') + buf.write("odps.Schema {\n") cols_strs = [] for col in self._columns: pad_spaces = name_space - name_display_lens[col.name] not_null = "not null" if not col.nullable else " " * 8 - cols_strs.append('{0}{1}{2}{3}'.format( - utils.to_str(name_dict[col.name] + ' ' * pad_spaces), - repr(col.type).ljust(type_space), - not_null + " " * 4 if has_not_null else "", - '# {0}'.format(utils.to_str(col.comment)) if not_empty(col.comment) else '' - )) - buf.write(utils.indent('\n'.join(cols_strs), 2)) - buf.write('\n') - buf.write('}\n') + cols_strs.append( + "{0}{1}{2}{3}".format( + utils.to_str(name_dict[col.name] + " " * pad_spaces), + repr(col.type).ljust(type_space), + not_null + " " * 4 if has_not_null else "", + "# {0}".format(utils.to_str(col.comment)) + if not_empty(col.comment) + else "", + ) + ) + buf.write(utils.indent("\n".join(cols_strs), 2)) + buf.write("\n") + buf.write("}\n") if self._partitions: - buf.write('Partitions {\n') + buf.write("Partitions {\n") partition_strs = [] for partition in self._partitions: - partition_strs.append('{0}{1}{2}'.format( - utils.to_str(name_dict[partition.name].ljust(name_space)), - repr(partition.type).ljust(type_space), - '# {0}'.format(utils.to_str(partition.comment)) if not_empty(partition.comment) else '' - )) - buf.write(utils.indent('\n'.join(partition_strs), 2)) - buf.write('\n') - buf.write('}\n') + partition_strs.append( + "{0}{1}{2}".format( + utils.to_str(name_dict[partition.name].ljust(name_space)), + repr(partition.type).ljust(type_space), + "# {0}".format(utils.to_str(partition.comment)) + if not_empty(partition.comment) + else "", + ) + ) + buf.write(utils.indent("\n".join(partition_strs), 2)) + buf.write("\n") + buf.write("}\n") return buf.getvalue() @@ -324,6 +360,7 @@ def build_snapshot(self): try: from .src.types_c import SchemaSnapshot + self._snapshot = SchemaSnapshot(self) except ImportError: pass @@ -345,24 +382,24 @@ def partitions(self): except AttributeError: return [] - @utils.deprecated('use simple_columns property instead') + @utils.deprecated("use simple_columns property instead") def get_columns(self): return self._columns - @utils.deprecated('use partitions property instead') + @utils.deprecated("use partitions property instead") def 
get_partitions(self): return self._partitions def get_column(self, name): index = self._name_indexes.get(utils.to_str(name)) if index is None: - raise ValueError('Column %s does not exists' % name) + raise ValueError("Column %s does not exists" % name) return self._columns[index] def get_partition(self, name): index = self._partition_schema._name_indexes.get(utils.to_str(name)) if index is None: - raise ValueError('Partition %s does not exists' % name) + raise ValueError("Partition %s does not exists" % name) return self._partitions[index] def is_partition(self, name): @@ -377,7 +414,7 @@ def get_type(self, name): return super(OdpsSchema, self).get_type(name) elif name in self._partition_schema: return self._partition_schema.get_type(name) - raise ValueError('Column does not exist: %s' % name) + raise ValueError("Column does not exist: %s" % name) def update(self, columns, partitions): self._columns = columns @@ -389,7 +426,8 @@ def update(self, columns, partitions): self._init(names, types) if self._partitions: self._partition_schema = Schema( - *compat.lzip(*[(c.name, c.type) for c in self._partitions])) + *compat.lzip(*[(c.name, c.type) for c in self._partitions]) + ) else: self._partition_schema = Schema([], []) @@ -406,12 +444,16 @@ def extend(self, schema): ) def to_ignorecase_schema(self): - cols = [Column(col.name.lower(), col.type, col.comment, col.label) - for col in self._columns] + cols = [ + Column(col.name.lower(), col.type, col.comment, col.label) + for col in self._columns + ] parts = None if self._partitions: - parts = [Partition(part.name.lower(), part.type, part.comment, part.label) - for part in self._partitions] + parts = [ + Partition(part.name.lower(), part.type, part.comment, part.label) + for part in self._partitions + ] return type(self)(columns=cols, partitions=parts) @@ -419,8 +461,10 @@ def to_ignorecase_schema(self): def from_lists(cls, names, types, partition_names=None, partition_types=None): columns = [Column(name=name, typo=typo) for name, typo in zip(names, types)] if partition_names is not None and partition_types is not None: - partitions = [Partition(name=name, typo=typo) - for name, typo in zip(partition_names, partition_types)] + partitions = [ + Partition(name=name, typo=typo) + for name, typo in zip(partition_names, partition_types) + ] else: partitions = None return cls(columns=columns, partitions=partitions) @@ -429,18 +473,26 @@ def from_lists(cls, names, types, partition_names=None, partition_types=None): def from_dict(cls, fields_dict, partitions_dict=None): fields = compat.lkeys(fields_dict) fields_types = compat.lvalues(fields_dict) - partitions = compat.lkeys(partitions_dict) \ - if partitions_dict is not None else None - partitions_types = compat.lvalues(partitions_dict) \ - if partitions_dict is not None else None + partitions = ( + compat.lkeys(partitions_dict) if partitions_dict is not None else None + ) + partitions_types = ( + compat.lvalues(partitions_dict) if partitions_dict is not None else None + ) - return cls.from_lists(fields, fields_types, - partition_names=partitions, - partition_types=partitions_types) + return cls.from_lists( + fields, + fields_types, + partition_names=partitions, + partition_types=partitions_types, + ) - def get_table_ddl(self, table_name='table_name', with_comments=True): + def get_table_ddl(self, table_name="table_name", with_comments=True): from .models.table import Table - return Table.gen_create_table_sql(table_name, self, with_column_comments=with_comments) + + return Table.gen_create_table_sql( + 
table_name, self, with_column_comments=with_comments + ) class RecordMeta(type): @@ -460,9 +512,8 @@ def is_record(obj): class BaseRecord(object): - # set __slots__ to save memory in the situation that records' size may be quite large - __slots__ = '_values', '_columns', '_name_indexes', '_max_field_size' + __slots__ = "_values", "_columns", "_name_indexes", "_max_field_size" def __init__(self, columns=None, schema=None, values=None, max_field_size=None): if columns is not None: @@ -475,14 +526,14 @@ def __init__(self, columns=None, schema=None, values=None, max_field_size=None): self._max_field_size = max_field_size if self._columns is None: - raise ValueError('Either columns or schema should not be provided') + raise ValueError("Either columns or schema should not be provided") - self._values = [None, ] * len(self._columns) + self._values = [None] * len(self._columns) if values is not None: self._sets(values) def _mode(self): - return 'py' + return "py" def _exclude_partition_columns(self): return [col for col in self._columns if not isinstance(col, Partition)] @@ -499,10 +550,13 @@ def _set(self, i, value): set = _set # to keep compatible def _sets(self, values): - if len(values) != len(self._columns) and \ - len(values) != len(self._exclude_partition_columns()): - raise ValueError('The values set to records are against the schema, ' - 'expect len %s, got len %s' % (len(self._columns), len(values))) + if len(values) != len(self._columns) and len(values) != len( + self._exclude_partition_columns() + ): + raise ValueError( + "The values set to records are against the schema, " + "expect len %s, got len %s" % (len(self._columns), len(values)) + ) [self._set(i, value) for i, value in enumerate(values)] def __getitem__(self, item): @@ -519,14 +573,14 @@ def __setitem__(self, key, value): self._set(key, value) def __getattr__(self, item): - if item == '_name_indexes': + if item == "_name_indexes": return object.__getattribute__(self, item) - if hasattr(self, '_name_indexes') and item in self._name_indexes: + if hasattr(self, "_name_indexes") and item in self._name_indexes: return self.get_by_name(item) return object.__getattribute__(self, item) def __setattr__(self, key, value): - if hasattr(self, '_name_indexes') and key in self._name_indexes: + if hasattr(self, "_name_indexes") and key in self._name_indexes: self.set_by_name(key, value) else: object.__setattr__(self, key, value) @@ -565,15 +619,18 @@ class RecordReprMixin(object): def __repr__(self): buf = six.StringIO() - buf.write('odps.Record {\n') + buf.write("odps.Record {\n") space = 2 * max(len(it.name) for it in self._columns) - content = '\n'.join( - ['{0}{1}'.format(col.name.ljust(space), repr(value)) - for col, value in zip(self._columns, self._values)]) + content = "\n".join( + [ + "{0}{1}".format(col.name.ljust(space), repr(value)) + for col, value in zip(self._columns, self._values) + ] + ) buf.write(utils.indent(content, 2)) - buf.write('\n}') + buf.write("\n}") return buf.getvalue() @@ -620,13 +677,14 @@ class DataType(object): """ Abstract data type """ + _singleton = True _type_id = -1 - __slots__ = 'nullable', + __slots__ = ("nullable",) def __new__(cls, *args, **kwargs): if cls._singleton: - if not hasattr(cls, '_instance'): + if not hasattr(cls, "_instance"): cls._instance = object.__new__(cls) cls._hash = hash(cls) return cls._instance @@ -673,7 +731,7 @@ def name(self): def __repr__(self): if self.nullable: return self.name - return '{0}[non-nullable]'.format(self.name) + return "{0}[non-nullable]".format(self.name) 
def __str__(self): return self.name.upper() @@ -693,8 +751,10 @@ def validate_value(self, val, max_field_size=None): def _can_cast_or_throw(self, value, data_type): if not self.can_implicit_cast(data_type): - raise ValueError('Cannot cast value(%s) from type(%s) to type(%s)' % ( - value, data_type, self)) + raise ValueError( + "Cannot cast value(%s) from type(%s) to type(%s)" + % (value, data_type, self) + ) def cast_value(self, value, data_type): raise NotImplementedError @@ -727,7 +787,7 @@ def validate_value(self, val, max_field_size=None): smallest, largest = self._bounds if smallest <= val <= largest: return True - raise ValueError('InvalidData: Bigint(%s) out of range' % val) + raise ValueError("InvalidData: Bigint(%s) out of range" % val) def cast_value(self, value, data_type): self._can_cast_or_throw(value, data_type) @@ -806,15 +866,15 @@ def validate_value(self, val, max_field_size=None): if len(val) <= max_field_size: return True raise ValueError( - "InvalidData: Length of string(%s) is more than %sM.'" % - (val, max_field_size / (1024 ** 2)) + "InvalidData: Length of string(%s) is more than %sM.'" + % (val, max_field_size / (1024**2)) ) def cast_value(self, value, data_type): self._can_cast_or_throw(value, data_type) if isinstance(data_type, Datetime): - return value.strftime('%Y-%m-%d %H:%M:%S') + return value.strftime("%Y-%m-%d %H:%M:%S") if options.tunnel.string_as_binary: val = utils.to_binary(value) else: @@ -830,7 +890,10 @@ def can_implicit_cast(self, other): if isinstance(other, six.string_types): other = validate_data_type(other) - if isinstance(other, (BaseTimestamp, Datetime, String)): + from_types = (BaseTimestamp, Datetime, String) + if _date_allow_int_conversion: + from_types += (Bigint,) + if isinstance(other, from_types): return True return super(Datetime, self).can_implicit_cast(other) @@ -838,9 +901,11 @@ def cast_value(self, value, data_type): self._can_cast_or_throw(value, data_type) if isinstance(data_type, String): - return _datetime.strptime(value, '%Y-%m-%d %H:%M:%S') + return _datetime.strptime(value, "%Y-%m-%d %H:%M:%S") elif isinstance(data_type, BaseTimestamp): return value.to_pydatetime() + elif _date_allow_int_conversion and isinstance(data_type, Bigint): + return utils.to_datetime(value) return value @@ -852,7 +917,10 @@ def can_implicit_cast(self, other): if isinstance(other, six.string_types): other = validate_data_type(other) - if isinstance(other, (BaseTimestamp, Datetime, String)): + from_types = (BaseTimestamp, Datetime, String) + if _date_allow_int_conversion: + from_types += (Bigint,) + if isinstance(other, from_types): return True return super(Date, self).can_implicit_cast(other) @@ -860,10 +928,12 @@ def cast_value(self, value, data_type): self._can_cast_or_throw(value, data_type) if isinstance(data_type, String): - datetime = _datetime.strptime(value, '%Y-%m-%d') + datetime = _datetime.strptime(value, "%Y-%m-%d") return _date(datetime.year, datetime.month, datetime.day) elif isinstance(data_type, BaseTimestamp): return value.to_pydatetime().date() + elif _date_allow_int_conversion and isinstance(data_type, Bigint): + return utils.to_date(value) return value @@ -896,15 +966,15 @@ def validate_value(self, val, max_field_size=None): if len(val) <= max_field_size: return True raise ValueError( - "InvalidData: Length of binary(%s) is more than %sM.'" % - (val, max_field_size / (1024 ** 2)) + "InvalidData: Length of binary(%s) is more than %sM.'" + % (val, max_field_size / (1024**2)) ) def cast_value(self, value, data_type): 
self._can_cast_or_throw(value, data_type) if isinstance(data_type, Datetime): - return value.strftime('%Y-%m-%d %H:%M:%S') + return value.strftime("%Y-%m-%d %H:%M:%S") return utils.to_binary(value) @@ -925,7 +995,7 @@ def cast_value(self, value, data_type): try: import pandas as pd except (ImportError, ValueError): - raise ImportError('To use TIMESTAMP in pyodps, you need to install pandas.') + raise ImportError("To use TIMESTAMP in pyodps, you need to install pandas.") if isinstance(data_type, String): return pd.to_datetime(value) @@ -960,14 +1030,16 @@ def can_implicit_cast(self, other): @property def name(self): - return 'interval_day_time' + return "interval_day_time" def cast_value(self, value, data_type): self._can_cast_or_throw(value, data_type) try: import pandas as pd except (ImportError, ValueError): - raise ImportError('To use INTERVAL_DAY_TIME in pyodps, you need to install pandas.') + raise ImportError( + "To use INTERVAL_DAY_TIME in pyodps, you need to install pandas." + ) if isinstance(value, float): return pd.Timedelta(seconds=value) @@ -990,7 +1062,7 @@ def can_implicit_cast(self, other): @property def name(self): - return 'interval_year_month' + return "interval_year_month" def cast_value(self, value, data_type): self._can_cast_or_throw(value, data_type) @@ -1013,30 +1085,30 @@ def validate_composite_values(self, value): class SizeLimitedString(String, CompositeMixin): _singleton = False - __slots__ = 'nullable', 'size_limit', '_hash' + __slots__ = "nullable", "size_limit", "_hash" _max_length = 65535 def __init__(self, size_limit, nullable=True): super(SizeLimitedString, self).__init__(nullable=nullable) if size_limit > self._max_length: - raise ValueError("InvalidData: Length of varchar(%d) is larger than %d." % - (size_limit, self._max_length)) + raise ValueError( + "InvalidData: Length of varchar(%d) is larger than %d." + % (size_limit, self._max_length) + ) self.size_limit = size_limit @property def name(self): - return '{0}({1})'.format(type(self).__name__.lower(), - self.size_limit) + return "{0}({1})".format(type(self).__name__.lower(), self.size_limit) def _equals(self, other): if isinstance(other, six.string_types): other = validate_data_type(other) - return DataType._equals(self, other) and \ - self.size_limit == other.size_limit + return DataType._equals(self, other) and self.size_limit == other.size_limit def __hash__(self): - if not hasattr(self, '_hash'): + if not hasattr(self, "_hash"): self._hash = hash((type(self), self.nullable, self.size_limit)) return self._hash @@ -1046,17 +1118,22 @@ def validate_value(self, val, max_field_size=None): if len(val) <= self.size_limit: return True raise ValueError( - "InvalidData: Length of string(%d) is more than %sM.'" % - (len(val), self.size_limit)) + "InvalidData: Length of string(%d) is more than %sM.'" + % (len(val), self.size_limit) + ) @classmethod def parse_composite(cls, args): if len(args) != 1: - raise ValueError('%s() only accept one length argument.' % cls.__name__.upper()) + raise ValueError( + "%s() only accept one length argument." % cls.__name__.upper() + ) try: return cls(int(args[0])) except TypeError: - raise ValueError('%s() only accept an integer length argument.' % cls.__name__.upper()) + raise ValueError( + "%s() only accept an integer length argument." 
% cls.__name__.upper() + ) def validate_composite_values(self, value): self.validate_value(value) @@ -1068,11 +1145,15 @@ def can_implicit_cast(self, other): if isinstance(other, six.string_types): other = validate_data_type(other) - if isinstance(other, (BaseInteger, BaseFloat, Datetime, Decimal, String, Binary)): + if isinstance( + other, (BaseInteger, BaseFloat, Datetime, Decimal, String, Binary) + ): return True - return isinstance(other, (Char, Varchar)) and \ - self.size_limit >= other.size_limit and \ - self.nullable == other.nullable + return ( + isinstance(other, (Char, Varchar)) + and self.size_limit >= other.size_limit + and self.nullable == other.nullable + ) class Char(SizeLimitedString): @@ -1080,15 +1161,19 @@ def can_implicit_cast(self, other): if isinstance(other, six.string_types): other = validate_data_type(other) - if isinstance(other, (BaseInteger, BaseFloat, Datetime, Decimal, String, Binary)): + if isinstance( + other, (BaseInteger, BaseFloat, Datetime, Decimal, String, Binary) + ): return True - return isinstance(other, (Char, Varchar)) and \ - self.size_limit >= other.size_limit and \ - self.nullable == other.nullable + return ( + isinstance(other, (Char, Varchar)) + and self.size_limit >= other.size_limit + and self.nullable == other.nullable + ) class Decimal(CompositeMixin): - __slots__ = 'nullable', 'precision', 'scale', '_hash' + __slots__ = "nullable", "precision", "scale", "_hash" _type_id = 5 _has_other_decimal_type = len(DECIMAL_TYPES) > 1 @@ -1109,7 +1194,9 @@ def __init__(self, precision=None, scale=None, nullable=True): "InvalidData: Scale(%d) is larger than %d." % (scale, self._max_scale) ) if precision is None and scale is not None: - raise ValueError('InvalidData: Scale should be provided along with precision.') + raise ValueError( + "InvalidData: Scale should be provided along with precision." + ) self.precision = precision self.scale = scale self._scale_decimal = _decimal.Decimal( @@ -1121,14 +1208,14 @@ def name(self): if self.precision is None: return type(self).__name__.lower() elif self.scale is None: - return '{0}({1})'.format(type(self).__name__.lower(), - self.precision) + return "{0}({1})".format(type(self).__name__.lower(), self.precision) else: - return '{0}({1},{2})'.format(type(self).__name__.lower(), - self.precision, self.scale) + return "{0}({1},{2})".format( + type(self).__name__.lower(), self.precision, self.scale + ) def __hash__(self): - if not hasattr(self, '_hash'): + if not hasattr(self, "_hash"): self._hash = hash((type(self), self.nullable, self.precision, self.scale)) return self._hash @@ -1161,7 +1248,9 @@ def validate_value(self, val, max_field_size=None): ): val = _decimal.Decimal(str(val)) - precision = self.precision if self.precision is not None else self._max_precision + precision = ( + self.precision if self.precision is not None else self._max_precision + ) scale = self.scale if self.scale is not None else self._max_scale scaled_val = val.quantize( self._scale_decimal, _decimal.ROUND_HALF_UP, self._decimal_ctx @@ -1169,8 +1258,8 @@ def validate_value(self, val, max_field_size=None): int_len = len(str(scaled_val)) - scale - 1 if int_len > precision: raise ValueError( - 'decimal value %s overflow, max integer digit number is %s.' % - (val, precision) + "decimal value %s overflow, max integer digit number is %s." 
+ % (val, precision) ) return True @@ -1178,17 +1267,21 @@ def cast_value(self, value, data_type): self._can_cast_or_throw(value, data_type) if six.PY3 and isinstance(value, six.binary_type): - value = value.decode('utf-8') + value = value.decode("utf-8") return _builtin_decimal.Decimal(value) @classmethod def parse_composite(cls, args): if len(args) > 2: - raise ValueError('%s() accepts no more than two arguments.' % cls.__name__.upper()) + raise ValueError( + "%s() accepts no more than two arguments." % cls.__name__.upper() + ) try: return cls(*[int(v) for v in args]) except TypeError: - raise ValueError('%s() only accept integers as arguments.' % cls.__name__.upper()) + raise ValueError( + "%s() only accept integers as arguments." % cls.__name__.upper() + ) def validate_composite_values(self, value): if value is None and self.nullable: @@ -1200,7 +1293,7 @@ def validate_composite_values(self, value): class Array(CompositeMixin): - __slots__ = 'nullable', 'value_type', '_hash' + __slots__ = "nullable", "value_type", "_hash" def __init__(self, value_type, nullable=True): super(Array, self).__init__(nullable=nullable) @@ -1209,18 +1302,16 @@ def __init__(self, value_type, nullable=True): @property def name(self): - return '{0}<{1}>'.format(type(self).__name__.lower(), - self.value_type.name) + return "{0}<{1}>".format(type(self).__name__.lower(), self.value_type.name) def _equals(self, other): if isinstance(other, six.string_types): other = validate_data_type(other) - return DataType._equals(self, other) and \ - self.value_type == other.value_type + return DataType._equals(self, other) and self.value_type == other.value_type def __hash__(self): - if not hasattr(self, '_hash'): + if not hasattr(self, "_hash"): self._hash = hash((type(self), self.nullable, hash(self.value_type))) return self._hash @@ -1228,9 +1319,11 @@ def can_implicit_cast(self, other): if isinstance(other, six.string_types): other = validate_data_type(other) - return isinstance(other, Array) and \ - self.value_type == other.value_type and \ - self.nullable == other.nullable + return ( + isinstance(other, Array) + and self.value_type == other.value_type + and self.nullable == other.nullable + ) def cast_value(self, value, data_type): self._can_cast_or_throw(value, data_type) @@ -1239,20 +1332,22 @@ def cast_value(self, value, data_type): @classmethod def parse_composite(cls, args): if len(args) != 1: - raise ValueError('%s<> should be supplied with exactly one type.' % cls.__name__.upper()) + raise ValueError( + "%s<> should be supplied with exactly one type." 
% cls.__name__.upper() + ) return cls(args[0]) def validate_composite_values(self, value): if value is None and self.nullable: return value if not isinstance(value, list): - raise ValueError('Array data type requires `list`, instead of %s' % value) + raise ValueError("Array data type requires `list`, instead of %s" % value) element_data_type = self.value_type return [validate_value(element, element_data_type) for element in value] class Map(CompositeMixin): - __slots__ = 'nullable', 'key_type', 'value_type', '_hash' + __slots__ = "nullable", "key_type", "value_type", "_hash" def __init__(self, key_type, value_type, nullable=True): super(Map, self).__init__(nullable=nullable) @@ -1263,32 +1358,37 @@ def __init__(self, key_type, value_type, nullable=True): @property def name(self): - return '{0}<{1},{2}>'.format(type(self).__name__.lower(), - self.key_type.name, - self.value_type.name) + return "{0}<{1},{2}>".format( + type(self).__name__.lower(), self.key_type.name, self.value_type.name + ) def _equals(self, other): if isinstance(other, six.string_types): other = validate_data_type(other) - return DataType._equals(self, other) and \ - self.key_type == other.key_type and \ - self.value_type == other.value_type + return ( + DataType._equals(self, other) + and self.key_type == other.key_type + and self.value_type == other.value_type + ) def __hash__(self): - if not hasattr(self, '_hash'): - self._hash = hash((type(self), self.nullable, - hash(self.key_type), hash(self.value_type))) + if not hasattr(self, "_hash"): + self._hash = hash( + (type(self), self.nullable, hash(self.key_type), hash(self.value_type)) + ) return self._hash def can_implicit_cast(self, other): if isinstance(other, six.string_types): other = validate_data_type(other) - return isinstance(other, Map) and \ - self.key_type == other.key_type and \ - self.value_type == other.value_type and \ - self.nullable == other.nullable + return ( + isinstance(other, Map) + and self.key_type == other.key_type + and self.value_type == other.value_type + and self.nullable == other.nullable + ) def cast_value(self, value, data_type): self._can_cast_or_throw(value, data_type) @@ -1297,24 +1397,28 @@ def cast_value(self, value, data_type): @classmethod def parse_composite(cls, args): if len(args) != 2: - raise ValueError('%s<> should be supplied with exactly two types.' % cls.__name__.upper()) + raise ValueError( + "%s<> should be supplied with exactly two types." % cls.__name__.upper() + ) return cls(*args) def validate_composite_values(self, value): if value is None and self.nullable: return value if not isinstance(value, dict): - raise ValueError('Map data type requires `dict`, instead of %s' % value) + raise ValueError("Map data type requires `dict`, instead of %s" % value) key_data_type = self.key_type value_data_type = self.value_type - convert = lambda k, v: (validate_value(k, key_data_type), - validate_value(v, value_data_type)) + convert = lambda k, v: ( + validate_value(k, key_data_type), + validate_value(v, value_data_type), + ) return OrderedDict(convert(k, v) for k, v in six.iteritems(value)) class Struct(CompositeMixin): - __slots__ = 'nullable', 'field_types', '_hash' + __slots__ = "nullable", "field_types", "_hash" def __init__(self, field_types, nullable=True): super(Struct, self).__init__(nullable=nullable) @@ -1332,26 +1436,36 @@ def __init__(self, field_types, nullable=True): warnings.warn( "Representing struct values as dicts is now deprecated. 
Try config " "`options.struct_as_dict=False` and return structs as named tuples " - "instead.", DeprecationWarning + "instead.", + DeprecationWarning, ) utils.add_survey_call("options.struct_as_dict") @property def name(self): - parts = ','.join('`%s`:%s' % (k, v.name) for k, v in six.iteritems(self.field_types)) - return '{0}<{1}>'.format(type(self).__name__.lower(), parts) + parts = ",".join( + "`%s`:%s" % (k, v.name) for k, v in six.iteritems(self.field_types) + ) + return "{0}<{1}>".format(type(self).__name__.lower(), parts) def _equals(self, other): if isinstance(other, six.string_types): other = validate_data_type(other) - return isinstance(other, Struct) and \ - len(self.field_types) == len(other.field_types) and \ - all(self.field_types[k] == other.field_types[k] for k in six.iterkeys(self.field_types)) + return ( + isinstance(other, Struct) + and len(self.field_types) == len(other.field_types) + and all( + self.field_types[k] == other.field_types[k] + for k in six.iterkeys(self.field_types) + ) + ) def __hash__(self): - if not hasattr(self, '_hash'): - fields_hash = hash(tuple((hash(k), hash(v)) for k, v in six.iteritems(self.field_types))) + if not hasattr(self, "_hash"): + fields_hash = hash( + tuple((hash(k), hash(v)) for k, v in six.iteritems(self.field_types)) + ) self._hash = hash((type(self), self.nullable, fields_hash)) return self._hash @@ -1359,8 +1473,11 @@ def can_implicit_cast(self, other): if isinstance(other, six.string_types): other = validate_data_type(other) - return isinstance(other, Struct) and self == other and \ - self.nullable == other.nullable + return ( + isinstance(other, Struct) + and self == other + and self.nullable == other.nullable + ) def cast_value(self, value, data_type): self._can_cast_or_throw(value, data_type) @@ -1368,15 +1485,14 @@ def cast_value(self, value, data_type): @classmethod def parse_composite(cls, args): - if any(not isinstance(a, tuple) and ':' not in a - for a in args): - raise ValueError('Every field defined in STRUCT should be given a name.') + if any(not isinstance(a, tuple) and ":" not in a for a in args): + raise ValueError("Every field defined in STRUCT should be given a name.") def conv_type_tuple(type_tuple): if isinstance(type_tuple, tuple): return type_tuple else: - return tuple(v.strip().strip('`') for v in type_tuple.split(':', 1)) + return tuple(v.strip().strip("`") for v in type_tuple.split(":", 1)) return cls(conv_type_tuple(a) for a in args) @@ -1395,7 +1511,10 @@ def validate_composite_values(self, value): else: if isinstance(value, tuple): return self.namedtuple_type( - *(validate_value(v, t) for v, t in zip(value, self.field_types.values())) + *( + validate_value(v, t) + for v, t in zip(value, self.field_types.values()) + ) ) elif isinstance(value, dict): list_val = [ @@ -1404,7 +1523,7 @@ def validate_composite_values(self, value): ] return self.namedtuple_type(*list_val) raise ValueError( - 'Struct data type requires `tuple` or `dict`, instead of %s' % type(value) + "Struct data type requires `tuple` or `dict`, instead of %s" % type(value) ) @@ -1427,10 +1546,12 @@ def validate_value(self, val, max_field_size=None): max_field_size = max_field_size or self._max_length if len(val) > max_field_size: raise ValueError( - "InvalidData: Length of string(%s) is more than %sM.'" % - (val, max_field_size / (1024 ** 2)) + "InvalidData: Length of string(%s) is more than %sM.'" + % (val, max_field_size / (1024**2)) ) - if not isinstance(val, (six.string_types, list, dict, six.integer_types, float)): + if not isinstance( + 
val, (six.string_types, list, dict, six.integer_types, float) + ): raise ValueError("InvalidData: cannot accept %r as json", val) return True @@ -1460,10 +1581,27 @@ def cast_value(self, value, data_type): json = Json() _odps_primitive_data_types = dict( - [(t.name, t) for t in ( - tinyint, smallint, int_, bigint, float_, double, string, datetime, date, boolean, - binary, timestamp, timestamp_ntz, interval_day_time, interval_year_month, json, - )] + [ + (t.name, t) + for t in ( + tinyint, + smallint, + int_, + bigint, + float_, + double, + string, + datetime, + date, + boolean, + binary, + timestamp, + timestamp_ntz, + interval_day_time, + interval_year_month, + json, + ) + ] ) @@ -1482,12 +1620,12 @@ def parse_composite_types(type_str, handlers=None): def _create_composite_type(typ, *args): prefix = None - if ':' in typ: - prefix, typ = typ.split(':') - prefix = prefix.strip().strip('`') + if ":" in typ: + prefix, typ = typ.split(":") + prefix = prefix.strip().strip("`") typ = typ.strip() if typ not in handlers: - raise ValueError('Composite type %s not supported.' % typ.upper()) + raise ValueError("Composite type %s not supported." % typ.upper()) ctype = handlers[typ].parse_composite(args) if prefix is None: @@ -1502,15 +1640,15 @@ def _create_composite_type(typ, *args): quoted = False for idx, ch in enumerate(type_str): - if ch == '`': + if ch == "`": quoted = not quoted elif not quoted: - if ch == '<' or ch == '(': + if ch == "<" or ch == "(": bracket_stack.append(len(token_stack)) token = type_str[token_start:idx].strip() token_stack.append(token) token_start = idx + 1 - elif ch == '>' or ch == ')': + elif ch == ">" or ch == ")": token = type_str[token_start:idx].strip() if token: token_stack.append(token) @@ -1519,7 +1657,7 @@ def _create_composite_type(typ, *args): token_stack = token_stack[:bracket_pos] token_stack.append(ctype) token_start = idx + 1 - elif ch == ',': + elif ch == ",": token = type_str[token_start:idx].strip() if token: token_stack.append(token) @@ -1545,8 +1683,10 @@ def validate_data_type(data_type): composite_err_msg = str(ex) if composite_err_msg is not None: - raise ValueError('Invalid data type: %s. %s' % (repr(data_type), composite_err_msg)) - raise ValueError('Invalid data type: %s' % repr(data_type)) + raise ValueError( + "Invalid data type: %s. 
%s" % (repr(data_type), composite_err_msg) + ) + raise ValueError("Invalid data type: %s" % repr(data_type)) integer_builtins = six.integer_types @@ -1559,21 +1699,23 @@ def validate_data_type(data_type): except ImportError: pass -_odps_primitive_to_builtin_types = OrderedDict(( - (bigint, integer_builtins), - (tinyint, integer_builtins), - (smallint, integer_builtins), - (int_, integer_builtins), - (double, float_builtins), - (float_, float_builtins), - (string, (six.text_type, six.binary_type)), - (binary, six.binary_type), - (datetime, _datetime), - (boolean, bool), - (interval_year_month, Monthdelta), - (date, _date), - (json, (list, dict, six.string_types, six.integer_types, float)), -)) +_odps_primitive_to_builtin_types = OrderedDict( + ( + (bigint, integer_builtins), + (tinyint, integer_builtins), + (smallint, integer_builtins), + (int_, integer_builtins), + (double, float_builtins), + (float_, float_builtins), + (string, (six.text_type, six.binary_type)), + (binary, six.binary_type), + (datetime, _datetime), + (boolean, bool), + (interval_year_month, Monthdelta), + (date, _date), + (json, (list, dict, six.string_types, six.integer_types, float)), + ) +) integer_types = (tinyint, smallint, int_, bigint) @@ -1593,12 +1735,14 @@ def _patch_pd_types(data_type): ) and isinstance(data_type, (BaseTimestamp, IntervalDayTime)): try: import pandas as pd + _odps_primitive_to_builtin_types[timestamp] = pd.Timestamp _odps_primitive_to_builtin_types[timestamp_ntz] = pd.Timestamp _odps_primitive_to_builtin_types[interval_day_time] = pd.Timedelta except (ImportError, ValueError): raise ImportError( - 'To use %s in pyodps, you need to install pandas.', data_type.name.upper() + "To use %s in pyodps, you need to install pandas.", + data_type.name.upper(), ) @@ -1608,10 +1752,10 @@ def _validate_primitive_value(value, data_type): if options.tunnel.string_as_binary: if isinstance(value, six.text_type): - value = value.encode('utf-8') + value = value.encode("utf-8") else: if isinstance(value, (bytearray, six.binary_type)): - value = value.decode('utf-8') + value = value.decode("utf-8") builtin_types = _odps_primitive_to_builtin_types[data_type] if isinstance(value, builtin_types): @@ -1620,7 +1764,9 @@ def _validate_primitive_value(value, data_type): inferred_data_type = infer_primitive_data_type(value) if inferred_data_type is None: raise ValueError( - 'Unknown value type, cannot infer from value: %s, type: %s' % (value, type(value))) + "Unknown value type, cannot infer from value: %s, type: %s" + % (value, type(value)) + ) return data_type.cast_value(value, inferred_data_type) @@ -1633,7 +1779,7 @@ def validate_value(value, data_type, max_field_size=None): elif isinstance(data_type, CompositeMixin): res = data_type.validate_composite_values(value) else: - raise ValueError('Unknown data type: %s' % data_type) + raise ValueError("Unknown data type: %s" % data_type) data_type.validate_value(res, max_field_size=max_field_size) return res diff --git a/odps/udf/__init__.py b/odps/udf/__init__.py index 8e9a3973..1626c318 100644 --- a/odps/udf/__init__.py +++ b/odps/udf/__init__.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,8 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-"""UDF support -""" +"""UDF debug support""" from __future__ import absolute_import diff --git a/odps/udf/runtime.py b/odps/udf/runtime.py index 83007261..bd442160 100644 --- a/odps/udf/runtime.py +++ b/odps/udf/runtime.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,15 +11,34 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import odps.counters -_origin_int = int +import sys + +from ..compat import six +from ..counters import Counters + +PY2 = sys.version_info[0] == 2 +if PY2: + long_type = long +else: + long_type = type("DummyLong", (object,), {}) + +_original_int = int _annotated_classes = {} -__all__ = ['get_execution_context', 'ExecutionContext', 'BaseUDAF', 'BaseUDTF', 'int', 'annotate', 'get_annotation'] +__all__ = [ + "get_execution_context", + "ExecutionContext", + "BaseUDAF", + "BaseUDTF", + "int", + "annotate", + "get_annotation", +] + class ExecutionContext(object): - counters = odps.counters.Counters() + counters = Counters() def get_counters(self): return self.counters @@ -27,9 +46,11 @@ def get_counters(self): def get_counters_as_json_string(self): return self.counters.to_json_string() + class BaseUDAF(object): pass + class BaseUDTF(object): def forward(self, *args): pass @@ -37,23 +58,36 @@ def forward(self, *args): def close(self): pass + def get_execution_context(): return ExecutionContext() + def int(v, silent=True): - v = _int(v) - if type(v) is long: + v = _original_int(v) + try: + if not PY2: + # when in python 3, check long value by bytes conversion + v.to_bytes(8, byteorder="little", signed=True) + elif type(v) is long_type: + raise OverflowError + except OverflowError: if silent: return None else: - raise OverflowError('Python int too large to convert to bigint: %s' % v) + six.raise_from( + OverflowError("Python int too large to convert to bigint: %s" % v), None + ) return v + def annotate(prototype): def ann(clz): _annotated_classes[clz] = prototype return clz + return ann + def get_annotation(clz): return _annotated_classes.get(clz) diff --git a/odps/udf/tests/test_executioncontext.py b/odps/udf/tests/test_executioncontext.py index f9dceabb..1779ed9c 100644 --- a/odps/udf/tests/test_executioncontext.py +++ b/odps/udf/tests/test_executioncontext.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .. import get_execution_context, ExecutionContext +from .. import ExecutionContext, get_execution_context def test_get_counter(): diff --git a/odps/udf/tests/test_resource.py b/odps/udf/tests/test_resource.py index 447960b9..43dc40b2 100644 --- a/odps/udf/tests/test_resource.py +++ b/odps/udf/tests/test_resource.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -16,13 +16,13 @@ def test_get_cache_archive(): - assert distcache.get_cache_archive('x') is None - assert distcache.get_cache_archive('x', 'y') is None + assert distcache.get_cache_archive("x") is None + assert distcache.get_cache_archive("x", "y") is None def test_get_cache_tabledesc(): - assert distcache.get_cache_tabledesc('x') is None + assert distcache.get_cache_tabledesc("x") is None def test_get_cache_tableinfo(): - assert distcache.get_cache_tableinfo('x') is None + assert distcache.get_cache_tableinfo("x") is None diff --git a/odps/udf/tests/test_runners.py b/odps/udf/tests/test_runners.py new file mode 100644 index 00000000..fcf177f3 --- /dev/null +++ b/odps/udf/tests/test_runners.py @@ -0,0 +1,105 @@ +# Copyright 1999-2024 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...compat import StringIO +from ...config import options +from ...tests.core import tn +from ..tools.runners import DirectCollector, get_csv_runner, get_table_runner +from .udf_examples import CatStrings, Plus, ZipArray + +csv_data = """ +0,1 +3,4 +2,1 +""".strip() + + +def test_csv_runner(): + text_io = StringIO(csv_data) + runner = get_csv_runner(Plus, stdin=text_io, collector_cls=DirectCollector) + runner.run() + assert runner.collector.results == [1, 7, 3] + + +def test_table_runner(odps): + table_name = tn("pyodps_t_tmp_test_udf_table_runner") + try: + options.sql.use_odps2_extension = True + odps.delete_table(table_name, if_exists=True) + table = odps.create_table( + table_name, + "col1 array, col2 array, col3 string", + lifecycle=1, + ) + + with table.open_writer() as writer: + writer.write([[["abcd", "efgh"], [1342, 5412], "uvfw"]]) + writer.write([[["alkf"], [1261], "asfd"]]) + writer.write([[["uvews", "asdfsaf"], [3245, 2345], "poes"]]) + writer.write([[["kslazd", "fdsal"], [342, 244], "poes"]]) + + runner = get_table_runner( + ZipArray, odps, table_name + ".c(col1, col2)", collector_cls=DirectCollector + ) + runner.run() + assert [ + {"abcd": 1342, "efgh": 5412}, + {"alkf": 1261}, + {"uvews": 3245, "asdfsaf": 2345}, + {"kslazd": 342, "fdsal": 244}, + ] == runner.collector.results + + runner = get_table_runner( + ZipArray, + odps, + table_name + ".c(col1, col2)", + record_limit=2, + collector_cls=DirectCollector, + ) + runner.run() + assert [ + {"abcd": 1342, "efgh": 5412}, + {"alkf": 1261}, + ] == runner.collector.results + finally: + odps.delete_table(table_name, if_exists=True) + options.sql.use_odps2_extension = False + + +def test_table_runner_with_parts(odps): + table_name = tn("pyodps_t_tmp_test_udf_table_runner_with_part") + try: + odps.delete_table(table_name, if_exists=True) + table = odps.create_table( + table_name, + ("col1 bigint, col2 bigint", "pt string"), + lifecycle=1, + ) + + with table.open_writer("pt=abcd", create_partition=True) as writer: + writer.write([[123, 541], [11, 92]]) + + table_spec = table_name + ".p(pt=abcd).c(col1,col2)" + runner = get_table_runner(Plus, odps, table_spec, collector_cls=DirectCollector) + runner.run() + assert [664, 103] 
== runner.collector.results + + table_spec = table_name + ".p(pt=abcd).c(col1,pt)" + runner = get_table_runner( + CatStrings, odps, table_spec, collector_cls=DirectCollector + ) + runner.run() + assert ["123abcd", "11abcd"] == runner.collector.results + finally: + odps.delete_table(table_name, if_exists=True) diff --git a/odps/udf/tests/test_simple_run.py b/odps/udf/tests/test_simple_run.py index e7b874c8..f76c5400 100644 --- a/odps/udf/tests/test_simple_run.py +++ b/odps/udf/tests/test_simple_run.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,24 +14,24 @@ import pytest +from ... import distcache from ..tools import runners from .udf_examples import * def test_udf(): - assert [2,3] == runners.simple_run(Plus, [(1,1), (2,1)]) - assert [None] == runners.simple_run(Plus, [(None,1) ]) + assert [2, 3] == runners.simple_run(Plus, [(1, 1), (2, 1)]) + assert [None] == runners.simple_run(Plus, [(None, 1)]) def test_udaf(): - assert [2] == runners.simple_run(Avg, [(1,),(2,),(3,)]) + assert [2] == runners.simple_run(Avg, [(1,), (2,), (3,)]) def test_udtf(): - assert ['a', 'b', 'ok'] == runners.simple_run(Explode, [('a|b',),]) + assert ["a", "b", "ok"] == runners.simple_run(Explode, [("a|b",)]) @pytest.mark.skip("Not implemented yet") def test_get_cache_table(): - from odps import distcache - assert distcache.get_cache_table('dual') == ('0',) + assert distcache.get_cache_table("dual") == ("0",) diff --git a/odps/udf/tests/test_types_py2.py b/odps/udf/tests/test_types_py2.py new file mode 100644 index 00000000..56602b12 --- /dev/null +++ b/odps/udf/tests/test_types_py2.py @@ -0,0 +1,43 @@ +# Copyright 1999-2024 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import time + +import pytest + +from ... 
import types as odps_types +from ..tools.runners import _convert_value + +pytestmark = pytest.mark.skipif( + sys.version_info[0] != 2, reason="Only needed for Python 2" +) + + +def test_py2_convert(): + cur_mills = int(1000 * time.time()) + assert 123 == _convert_value("123", odps_types.bigint) + assert cur_mills == _convert_value(cur_mills, odps_types.datetime) + assert [cur_mills] == _convert_value( + [cur_mills], odps_types.Array(odps_types.datetime) + ) + assert {"key": cur_mills} == _convert_value( + {"key": cur_mills}, odps_types.Map(odps_types.string, odps_types.datetime) + ) + + struct_type = odps_types.Struct({"key": odps_types.datetime}) + assert (cur_mills,) == _convert_value( + struct_type.namedtuple_type(cur_mills), struct_type + ) + assert (cur_mills,) == _convert_value({"key": cur_mills}, struct_type) diff --git a/odps/udf/tests/test_usercounter.py b/odps/udf/tests/test_usercounter.py index d61f1c6f..63f5f284 100644 --- a/odps/udf/tests/test_usercounter.py +++ b/odps/udf/tests/test_usercounter.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -43,37 +43,38 @@ def test_counters(): def _normalize_counter(json_str): obj = json.loads(json_str) for v in six.itervalues(obj): - if 'counters' not in v: + if "counters" not in v: continue - v['counters'] = sorted(v['counters'], key=lambda item: item['name']) + v["counters"] = sorted(v["counters"], key=lambda item: item["name"]) return json.dumps(obj, sort_keys=True) - result_json = ''' - { - "group1" : { - "name" : "group1", - "counters" : [ - { - "name" : "test1", - "value" : 1 - }, - { - "name" : "test2", - "value" : 2 - } - ]}, - "group2" : { - "name" : "group2", - "counters" : [ - { - "name" : "test3", - "value" : 3 - } - ] - } - } - ''' + result_json = """ + { + "group1": { + "name": "group1", + "counters": [ + { + "name": "test1", + "value": 1 + }, + { + "name": "test2", + "value": 2 + } + ] + }, + "group2": { + "name": "group2", + "counters": [ + { + "name": "test3", + "value": 3 + } + ] + } + } + """ counters = Counters() c1 = counters.get_group("group1").get_counter("test1") @@ -84,4 +85,6 @@ def _normalize_counter(json_str): c3.increment(3) assert 2 == counters.size() - assert _normalize_counter(result_json) == _normalize_counter(counters.to_json_string()) + assert _normalize_counter(result_json) == _normalize_counter( + counters.to_json_string() + ) diff --git a/odps/udf/tests/udf_examples.py b/odps/udf/tests/udf_examples.py index 6e0306d2..b21b3aa2 100644 --- a/odps/udf/tests/udf_examples.py +++ b/odps/udf/tests/udf_examples.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,21 +12,35 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from odps.udf import (annotate, BaseUDAF, BaseUDTF) +from odps.udf import BaseUDAF, BaseUDTF, annotate -@annotate(' bigint, bigint -> bigint ') +@annotate(" bigint, bigint -> bigint ") class Plus(object): - def evaluate(self, a, b): if None in (a, b): return None return a + b -@annotate('bigint->double') -class Avg(BaseUDAF): +@annotate(" * -> string ") +class CatStrings(object): + def evaluate(self, *args): + if None in args: + return None + return "".join(str(x) for x in args) + +@annotate(" array, array -> map ") +class ZipArray(object): + def evaluate(self, a, b): + if None in (a, b): + return None + return dict(zip(a, b)) + + +@annotate("bigint->double") +class Avg(BaseUDAF): def new_buffer(self): return [0, 0] @@ -45,35 +59,32 @@ def terminate(self, buffer): return float(buffer[0]) / buffer[1] -@annotate('string -> string') +@annotate("string -> string") class Explode(BaseUDTF): - def process(self, arg): if arg is None: return - props = arg.split('|') + props = arg.split("|") for p in props: self.forward(p) def close(self): - self.forward('ok') + self.forward("ok") -@annotate('*-> string') +@annotate("*-> string") class Star(BaseUDTF): - def process(self, *args): [self.forward(arg) for arg in args] def close(self): - self.forward('ok') + self.forward("ok") -@annotate('-> string') +@annotate("-> string") class Empty(BaseUDTF): - def process(self): self.forward("empty") def close(self): - self.forward('ok') \ No newline at end of file + self.forward("ok") diff --git a/odps/udf/tools/runners.py b/odps/udf/tools/runners.py index 1556db7e..0fe02ff4 100644 --- a/odps/udf/tools/runners.py +++ b/odps/udf/tools/runners.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,38 +12,59 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""UDF runners implementing the local run framework. -""" +"""UDF runners implementing the local run framework.""" -import sys import csv -from datetime import datetime +import re +import sys -from ... import udf from ... import distcache -from ...compat import six +from ... import types as odps_types +from ... import udf +from ...utils import to_date, to_milliseconds from . import utils +__all__ = ["get_csv_runner", "get_table_runner"] -__all__ = ['get_default_runner'] +PY2 = sys.version_info[0] == 2 +_table_bracket_re = re.compile(r"[^\(]+\([^\)]+\)") -def get_default_runner(udf_class, input_col_delim=',', null_indicator='NULL', stdin=None): - """Create a default runner with specified udf class. 
- """ +def get_csv_runner( + udf_class, + input_col_delim=",", + null_indicator="NULL", + stdin=None, + collector_cls=None, +): + """Create a runner to read csv with specified udf class.""" proto = udf.get_annotation(udf_class) in_types, out_types = parse_proto(proto) stdin = stdin or sys.stdin arg_parser = ArgParser(in_types, stdin, input_col_delim, null_indicator) - stdin_feed = make_feed(arg_parser) - collector = StdoutCollector(out_types) + stdin_feed = arg_parser.parse() + + collector_cls = collector_cls or StdoutCollector + collector = collector_cls(out_types) ctor = _get_runner_class(udf_class) return ctor(udf_class, stdin_feed, collector) +def get_table_runner( + udf_class, odps_entry, table_desc, record_limit=None, collector_cls=None +): + """Create a runner to read table with specified udf class.""" + proto = udf.get_annotation(udf_class) + in_types, out_types = parse_proto(proto) + tb_feed = table_feed(odps_entry, table_desc, in_types, record_limit) + + collector_cls = collector_cls or StdoutCollector + collector = collector_cls(out_types) + ctor = _get_runner_class(udf_class) + return ctor(udf_class, tb_feed, collector) + + def simple_run(udf_class, args): - """ - """ proto = udf.get_annotation(udf_class) in_types, out_types = parse_proto(proto) feed = direct_feed(args) @@ -55,25 +76,38 @@ def simple_run(udf_class, args): def initialize(): - """Initialize the local run environment. - """ + """Initialize the local run environment.""" distcache.get_cache_table = utils.get_cache_table -def _get_types(types): +def _split_data_types(types_str): + bracket_level = 0 + ret_types = [""] + for ch in types_str: + if bracket_level == 0 and ch == ",": + ret_types[-1] = ret_types[-1].strip() + ret_types.append("") + else: + ret_types[-1] += ch + if ch in ("<", "("): + bracket_level += 1 + elif ch in (">", ")"): + bracket_level -= 1 + return [s for s in ret_types if s] + + +def _get_types(types_str): entries = [] - for t in types.split(','): + for t in _split_data_types(types_str): t = t.strip() - if t not in _allowed_data_types: - raise Exception('type not in '+ ','.join(_allowed_data_types)) - entries.append(_type_registry[t]) + entries.append(odps_types.validate_data_type(t)) return entries def _get_in_types(types): - if types == '': + if types == "": return [] - return _get_types(types) if types != '*' else [_type_registry[types], ] + return _get_types(types) if types != "*" else ["*"] def _get_runner_class(udf_class): @@ -87,27 +121,76 @@ def _get_runner_class(udf_class): def parse_proto(proto): - tokens = proto.lower().split('->') + tokens = proto.lower().split("->") if len(tokens) != 2: - raise Exception('@annotate(' + proto + ') error') + raise ValueError("Illegal format of @annotate(%s)" % proto) return _get_in_types(tokens[0].strip()), _get_types(tokens[1].strip()) -def make_feed(arg_parser): - for r in arg_parser.parse(): - yield r - - def direct_feed(args): for a in args: yield a -class ArgParser(object): +def _convert_value(value, tp): + try: + odps_types._date_allow_int_conversion = True + value = odps_types.validate_value(value, tp) + finally: + odps_types._date_allow_int_conversion = False + + if not PY2: + return value + + if isinstance(tp, odps_types.Datetime): + return to_milliseconds(value) + elif isinstance(tp, odps_types.Date): + return to_date(value) + elif isinstance(tp, odps_types.Array): + return [_convert_value(v, tp.value_type) for v in value] + elif isinstance(tp, odps_types.Map): + return { + _convert_value(k, tp.key_type): _convert_value(v, tp.value_type) + for 
k, v in value.items() + } + elif isinstance(tp, odps_types.Struct): + if isinstance(value, dict): + vals = { + k: _convert_value(value[k], ftp) for k, ftp in tp.field_types.items() + } + else: + vals = { + k: _convert_value(getattr(value, k), ftp) + for k, ftp in tp.field_types.items() + } + return tp.namedtuple_type(**vals) + else: + return value + + +def _validate_values(values, types): + if types == ["*"]: + return values + if len(values) != len(types): + raise ValueError( + "Input length mismatch: %d expected, %d provided" + % (len(types), len(values)) + ) + ret_vals = [None] * len(values) + for idx, (tp, d) in enumerate(zip(types, values)): + if d is None: + continue + try: + ret_vals[idx] = _convert_value(d, tp) + except: + raise ValueError("Input type mismatch: expected %s, received %r" % (tp, d)) + return ret_vals - NULL_INDICATOR = 'NULL' - def __init__(self, types, fileobj, delim=',', null_indicator='NULL'): +class ArgParser(object): + NULL_INDICATOR = "NULL" + + def __init__(self, types, fileobj, delim=",", null_indicator="NULL"): self.types = types self.delim = delim self.null_indicator = null_indicator @@ -122,22 +205,46 @@ def parse(self): tokens.append(None) else: tokens.append(token) - if len(self.types) == 1 and self.types[0].typestr == '*': - yield tokens - continue + if len(self.types) == 0 and len(tokens) == 0: - yield '' + yield "" continue + yield _validate_values(tokens, self.types) + + +def _get_table_or_partition(odps_entry, table_desc): + table_names = [] + table_part = None + table_cols = None + for part in table_desc.split("."): + part = part.strip() + if not _table_bracket_re.match(part): + table_names.append(part) + elif part.startswith("p("): + table_part = part[2:-1] + elif part.startswith("c("): + table_cols = [s.strip() for s in part[2:-1].split(",")] + data_obj = odps_entry.get_table(".".join(table_names)) + if table_part is not None: + data_obj = data_obj.get_partition(table_part) + return data_obj, table_cols + + +def table_feed(odps_entry, table_desc, in_types, record_limit): + data_obj, cols = _get_table_or_partition(odps_entry, table_desc) + with data_obj.open_reader(columns=cols) as reader: + if record_limit is not None: + data_src = reader[:record_limit] + else: + data_src = reader - if len(tokens) != len(self.types): - raise Exception('Schema error: %r' % record) - yield map(lambda tp, data: tp.converter(data), self.types, tokens) - + for row in data_src: + yield _validate_values(row.values, in_types) -class ArgFormater(object): - DELIM = '\t' - NULL_INDICATOR = 'NULL' +class ArgFormatter(object): + DELIM = "\t" + NULL_INDICATOR = "NULL" def __init__(self, types): self.types = types @@ -148,40 +255,32 @@ def format(self, *args): class BaseCollector(object): - """Basic common logic of collector. - """ + """Basic common logic of collector.""" + def __init__(self, schema): self.schema = schema - def _validate_records(self, *args): - if len(args) != len(self.schema): - raise Exception('Schema error: ' + repr(args)) - for i, a in enumerate(args): - if a is None: - continue - elif not isinstance(a, self.schema[i].type): - raise Exception('Schema error: ' + repr(args)) - def collect(self, *args): - self._validate_records(*args) + _validate_values(args, self.schema) self.handle_collect(*args) + def handle_collect(self, *args): + raise NotImplementedError + class StdoutCollector(BaseCollector): - """Collect the results to stdout. 
- """ + """Collect the results to stdout.""" def __init__(self, schema): super(StdoutCollector, self).__init__(schema) - self.formater = ArgFormater(schema) + self.formatter = ArgFormatter(schema) def handle_collect(self, *args): - print(self.formater.format(*args)) + print(self.formatter.format(*args)) class DirectCollector(BaseCollector): - """Collect results which can be fetched via self.results into memory. - """ + """Collect results which can be fetched via self.results into memory.""" def __init__(self, schema): super(DirectCollector, self).__init__(schema) @@ -195,7 +294,6 @@ def handle_collect(self, *args): class BaseRunner(object): - def __init__(self, udf_class, feed, collector): self.udf_class = udf_class self.feed = feed @@ -205,7 +303,6 @@ def __init__(self, udf_class, feed, collector): class UDFRunner(BaseRunner): - def run(self): obj = self.obj collector = self.collector @@ -215,12 +312,13 @@ def run(self): class UDTFRunner(BaseRunner): - def run(self): obj = self.obj collector = self.collector + def local_forward(*r): collector.collect(*r) + obj.forward = local_forward for args in self.feed: obj.process(*args) @@ -228,7 +326,6 @@ def local_forward(*r): class UDAFRunner(BaseRunner): - def run(self): obj = self.obj collector = self.collector @@ -249,53 +346,4 @@ def run(self): collector.collect(obj.terminate(merge_buf)) - -########################################### -### Static type registry ### - -def register_type(enum, typestr, tp): - type_obj = TypeEntry(typestr, tp) - globals()[enum] = type_obj - _type_registry[typestr] = type_obj - - -_type_registry = { -} - - -def _gen_converter(typestr, tp): - def f(v): - if v == "NULL" or v is None: - return None - if typestr in ('bigint', 'datetime'): - return int(v) - elif typestr == 'string': - return str(v) - return tp(v) - return f - - -class TypeEntry(object): - - def __init__(self, typestr, tp): - self.typestr = typestr - self.type = tp - self.converter = _gen_converter(typestr, tp) - -register_type('TP_BIGINT', 'bigint', six.integer_types) -register_type('TP_STRING', 'string', six.string_types) -if sys.version_info[0] == 2: - register_type('TP_DATETIME', 'datetime', six.integer_types) -else: - register_type('TP_DATETIME', 'datetime', datetime) -register_type('TP_DOUBLE', 'double', float) -register_type('TP_BOOLEAN', 'boolean', bool) -register_type('TP_STAR', '*', lambda x: x) - -_allowed_data_types = [k for k in _type_registry.keys() if k != '*'] - -########################################### - - initialize() - diff --git a/odps/udf/tools/utils.py b/odps/udf/tools/utils.py index ecc81625..700a0862 100644 --- a/odps/udf/tools/utils.py +++ b/odps/udf/tools/utils.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,14 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os -import sys import logging +import os -from odps import (core, accounts) - +from ...compat import ConfigParser +from ...core import ODPS -CONF_FILENAME = '.pyouconfig' +CONF_FILENAME = "~/.pyouconfig" logger = logging.getLogger(__name__) _odps = None @@ -34,31 +33,27 @@ def f(*args, **kwargs): if _odps is None: get_conf() return func(*args, **kwargs) + return f - + def get_conf(): - import ConfigParser global _odps - home_dir = os.environ.get('HOME') - if not home_dir: - raise UDFToolError('Cannot find home dir, ' - 'perhaps you are using windowns :(') - f = os.path.join(home_dir, CONF_FILENAME) + f = os.path.expanduser(CONF_FILENAME) if not os.path.exists(f): - access_id = raw_input('access_id:') - + return + config = ConfigParser.RawConfigParser() config.read(f) - access_id = config.get('access_id') - access_key = config.get('secret_access_key') - end_point = config.get('endpoint') - project = config.get('project') - _odps = core.ODPS(access_id, access_key, - project, end_point) + access_id = config.get("access_id") + access_key = config.get("secret_access_key") + end_point = config.get("endpoint") + project = config.get("project") + _odps = ODPS(access_id, access_key, project, end_point) @require_conf def get_cache_table(name): - return _odps.read_table(name) + odps_entry = _odps or ODPS.from_global() + return odps_entry.read_table(name) diff --git a/odps/ui/__init__.py b/odps/ui/__init__.py index 92413084..88ef1dbe 100644 --- a/odps/ui/__init__.py +++ b/odps/ui/__init__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,8 +15,8 @@ from .common import html_notify from .progress import ( + ProgressGroupUI, create_instance_group, fetch_instance_group, reload_instance_status, - ProgressGroupUI, ) diff --git a/odps/ui/common.py b/odps/ui/common.py index fb733838..b434ee0d 100644 --- a/odps/ui/common.py +++ b/odps/ui/common.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
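# --- Editor's note (not part of the patch): a short sketch of the bracket-aware
# annotation parsing added to odps/udf/tools/runners.py earlier in this diff.
# parse_proto() must not split on commas nested inside composite types; assuming the
# import path shown in this patch, the following holds:
from odps.udf.tools.runners import parse_proto

in_types, out_types = parse_proto("string, array<bigint> -> map<string,bigint>")
assert [t.name for t in in_types] == ["string", "array<bigint>"]
assert [t.name for t in out_types] == ["map<string,bigint>"]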
@@ -15,23 +15,29 @@ import json - MAX_SCRIPT_LOAD_SEC = 5 SCRIPT_LOAD_CHECK_INTERVAL = 0.01 -COMMON_JS = 'common' +COMMON_JS = "common" CSS_REGISTER_JS = """ require(['pyodps'], function(p) { p.register_css('##CSS_STR##'); }); """.strip() try: - from ..console import widgets, ipython_major_version, in_ipython_frontend, is_widgets_available + from ..console import ( + in_ipython_frontend, + ipython_major_version, + is_widgets_available, + widgets, + ) + if any(v is None for v in (widgets, ipython_major_version, in_ipython_frontend)): raise ImportError if ipython_major_version < 4: - from IPython.utils.traitlets import Unicode, List + from IPython.utils.traitlets import List, Unicode + traitlets_version = (3, 0) else: from traitlets import Unicode, List # noqa: F401 @@ -59,9 +65,9 @@ def build_trait(trait_cls, default_value=None, **metadata): return trait_cls().tag(**metadata) class HTMLNotifier(widgets.DOMWidget): - _view_name = build_trait(Unicode, 'HTMLNotifier', sync=True) - _view_module = build_trait(Unicode, 'pyodps/html-notify', sync=True) - msg = build_trait(Unicode, 'msg', sync=True) + _view_name = build_trait(Unicode, "HTMLNotifier", sync=True) + _view_module = build_trait(Unicode, "pyodps/html-notify", sync=True) + msg = build_trait(Unicode, "msg", sync=True) def notify(self, msg): self.msg = json.dumps(dict(body=msg)) diff --git a/odps/ui/progress.py b/odps/ui/progress.py index b50f5f1f..3e0c1f42 100644 --- a/odps/ui/progress.py +++ b/odps/ui/progress.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -20,11 +20,11 @@ import uuid from collections import OrderedDict -from ..config import options from ..compat import six +from ..config import options from ..errors import InternalServerError, RequestTimeTooSkewed from ..models.instance import Instance -from ..serializers import JSONSerializableModel, JSONNodeField, JSONNodesReferencesField +from ..serializers import JSONNodeField, JSONNodesReferencesField, JSONSerializableModel from .common import build_trait PROGRESS_RETRY = 3 @@ -40,75 +40,95 @@ class _StageProgressJSON(JSONSerializableModel): - name = JSONNodeField('name') - backup_workers = JSONNodeField('backup_workers', parse_callback=int, default=0) - terminated_workers = JSONNodeField('terminated_workers', parse_callback=int, default=0) - running_workers = JSONNodeField('running_workers', parse_callback=int, default=0) - total_workers = JSONNodeField('total_workers', parse_callback=int, default=0) - input_records = JSONNodeField('input_records', parse_callback=int, default=0) - output_records = JSONNodeField('output_records', parse_callback=int, default=0) - finished_percentage = JSONNodeField('finished_percentage', parse_callback=int, default=0) + name = JSONNodeField("name") + backup_workers = JSONNodeField("backup_workers", parse_callback=int, default=0) + terminated_workers = JSONNodeField( + "terminated_workers", parse_callback=int, default=0 + ) + running_workers = JSONNodeField("running_workers", parse_callback=int, default=0) + total_workers = JSONNodeField("total_workers", parse_callback=int, default=0) + input_records = JSONNodeField("input_records", parse_callback=int, default=0) + output_records = JSONNodeField("output_records", parse_callback=int, default=0) + finished_percentage = JSONNodeField( + "finished_percentage", 
parse_callback=int, default=0 + ) def __init__(self, **kwargs): super(_StageProgressJSON, self).__init__(**kwargs) class _TaskProgressJSON(JSONSerializableModel): - name = JSONNodeField('name') - status = JSONNodeField('status', parse_callback=lambda v: Instance.Task.TaskStatus(v.upper()), - serialize_callback=lambda v: v.value) - stages = JSONNodesReferencesField(_StageProgressJSON, 'stages') + name = JSONNodeField("name") + status = JSONNodeField( + "status", + parse_callback=lambda v: Instance.Task.TaskStatus(v.upper()), + serialize_callback=lambda v: v.value, + ) + stages = JSONNodesReferencesField(_StageProgressJSON, "stages") def get_stage_progress_formatted_string(self): buf = six.StringIO() - buf.write(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')) - buf.write(' ') + buf.write(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + buf.write(" ") for stage in self.stages: - buf.write('{0}:{1}/{2}/{3}{4}[{5}%]\t'.format( - stage.name, - stage.running_workers, - stage.terminated_workers, - stage.total_workers, - '(+%s backups)' % stage.backup_workers if stage.backup_workers > 0 else '', - stage.finished_percentage - )) + buf.write( + "{0}:{1}/{2}/{3}{4}[{5}%]\t".format( + stage.name, + stage.running_workers, + stage.terminated_workers, + stage.total_workers, + "(+%s backups)" % stage.backup_workers + if stage.backup_workers > 0 + else "", + stage.finished_percentage, + ) + ) return buf.getvalue() class _InstanceProgressJSON(JSONSerializableModel): - id = JSONNodeField('id') - logview = JSONNodeField('logview') - status = JSONNodeField('status', parse_callback=lambda v: Instance.Status(v.upper()), - serialize_callback=lambda v: v.value) - tasks = JSONNodeField('tasks', parse_callback=lambda v: _InstanceProgressJSON._parse_tasks(v), - serialize_callback=lambda v: [d.serial() for d in six.itervalues(v)]) + id = JSONNodeField("id") + logview = JSONNodeField("logview") + status = JSONNodeField( + "status", + parse_callback=lambda v: Instance.Status(v.upper()), + serialize_callback=lambda v: v.value, + ) + tasks = JSONNodeField( + "tasks", + parse_callback=lambda v: _InstanceProgressJSON._parse_tasks(v), + serialize_callback=lambda v: [d.serial() for d in six.itervalues(v)], + ) @staticmethod def _parse_tasks(obj): - return OrderedDict([(o['name'], _TaskProgressJSON.parse(o)) for o in obj]) + return OrderedDict([(o["name"], _TaskProgressJSON.parse(o)) for o in obj]) class _InstancesProgressJSON(JSONSerializableModel): - name = JSONNodeField('name') - key = JSONNodeField('key') - gen_time = JSONNodeField('gen_time') - logview = JSONNodeField('logview') - instances = JSONNodeField('instances', parse_callback=lambda v: _InstancesProgressJSON._parse_instances(v), - serialize_callback=lambda v: [d.serial() for d in six.itervalues(v)]) + name = JSONNodeField("name") + key = JSONNodeField("key") + gen_time = JSONNodeField("gen_time") + logview = JSONNodeField("logview") + instances = JSONNodeField( + "instances", + parse_callback=lambda v: _InstancesProgressJSON._parse_instances(v), + serialize_callback=lambda v: [d.serial() for d in six.itervalues(v)], + ) @staticmethod def _parse_instances(obj): - return OrderedDict([(o['id'], _InstanceProgressJSON.parse(o)) for o in obj]) + return OrderedDict([(o["id"], _InstanceProgressJSON.parse(o)) for o in obj]) def update_instance(self, inst): self.instances[inst.id] = inst def create_instance_group(name): - key = '%x_%s' % (int(time.time()), str(uuid.uuid4()).lower()) + key = "%x_%s" % (int(time.time()), str(uuid.uuid4()).lower()) group_json = 
_InstancesProgressJSON(name=name, key=key, instances=OrderedDict()) group_json.gen_time = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()) PROGRESS_REPO[key] = group_json @@ -120,7 +140,7 @@ def create_instance_group(name): def _reload_instance_status(odps, group_id, instance_id): if group_id not in PROGRESS_REPO: - raise KeyError('Instance group ID not exist.') + raise KeyError("Instance group ID not exist.") group_json = PROGRESS_REPO[group_id] if instance_id in group_json.instances: @@ -145,10 +165,14 @@ def _reload_instance_status(odps, group_id, instance_id): if task_name in inst_json.tasks: task_json = inst_json.tasks[task_name] task_json.status = task.status - if task.status not in set([Instance.Task.TaskStatus.RUNNING, Instance.Task.TaskStatus.WAITING]): + if task.status not in set( + [Instance.Task.TaskStatus.RUNNING, Instance.Task.TaskStatus.WAITING] + ): continue else: - task_json = _TaskProgressJSON(name=task_name, status=task.status, stages=[]) + task_json = _TaskProgressJSON( + name=task_name, status=task.status, stages=[] + ) inst_json.tasks[task_name] = task_json task_json.stages = [] @@ -180,7 +204,7 @@ def reload_instance_status(odps, group_id, instance_id): def fetch_instance_group(group_id): if group_id not in PROGRESS_REPO: - raise KeyError('Instance group ID not exist.') + raise KeyError("Instance group ID not exist.") return PROGRESS_REPO[group_id] @@ -194,9 +218,15 @@ def exist_instance_group(group_id): try: - from ..console import widgets, ipython_major_version, in_ipython_frontend, is_widgets_available + from ..console import ( + in_ipython_frontend, + ipython_major_version, + is_widgets_available, + widgets, + ) + if ipython_major_version < 4: - from IPython.utils.traitlets import Unicode, List + from IPython.utils.traitlets import List, Unicode else: from traitlets import Unicode, List # noqa: F401 from IPython.display import display @@ -204,11 +234,12 @@ def exist_instance_group(group_id): InstancesProgress = None else: if widgets and in_ipython_frontend(): + class InstancesProgress(widgets.DOMWidget): - _view_name = build_trait(Unicode, 'InstancesProgress', sync=True) - _view_module = build_trait(Unicode, 'pyodps/progress', sync=True) - prefix = build_trait(Unicode, 'prefix', sync=True) - suffix = build_trait(Unicode, 'suffix', sync=True) + _view_name = build_trait(Unicode, "InstancesProgress", sync=True) + _view_module = build_trait(Unicode, "pyodps/progress", sync=True) + prefix = build_trait(Unicode, "prefix", sync=True) + suffix = build_trait(Unicode, "suffix", sync=True) def __init__(self, **kwargs): """Constructor""" @@ -218,29 +249,30 @@ def __init__(self, **kwargs): self.errors = widgets.CallbackDispatcher(accepted_nargs=[0, 1]) def update(self): - self.send(json.dumps(dict(action='update', content=[]))) + self.send(json.dumps(dict(action="update", content=[]))) def update_group(self, group_jsons): if isinstance(group_jsons, six.string_types): - group_jsons = [group_jsons, ] + group_jsons = [group_jsons] try: - self.send(json.dumps(dict(action='update', content=group_jsons))) + self.send(json.dumps(dict(action="update", content=group_jsons))) except: pass def delete_group(self, group_keys): if isinstance(group_keys, six.string_types): - group_keys = [group_keys, ] + group_keys = [group_keys] try: - self.send(json.dumps(dict(action='delete', content=group_keys))) + self.send(json.dumps(dict(action="delete", content=group_keys))) except: pass def clear_groups(self): try: - self.send(json.dumps(dict(action='clear'))) + 
self.send(json.dumps(dict(action="clear"))) except: pass + else: InstancesProgress = None @@ -249,11 +281,11 @@ class ProgressGroupUI(object): def __init__(self, ipython_widget=False): self._ipython_widget = ipython_widget if ipython_widget and InstancesProgress is None: - raise RuntimeError('Cannot create group ui when InstancesProgress is None') + raise RuntimeError("Cannot create group ui when InstancesProgress is None") self._widget = None self._group_keys = set() - self._prefix = '' - self._suffix = '' + self._prefix = "" + self._suffix = "" @property def prefix(self): @@ -275,18 +307,18 @@ def suffix(self, value): def has_keys(self, keys): if isinstance(keys, six.string_types): - keys = [keys, ] + keys = [keys] return all(k in self._group_keys for k in keys) def add_keys(self, keys): if isinstance(keys, six.string_types): - keys = [keys, ] + keys = [keys] self._group_keys.update(keys) self._update_group(keys) def remove_keys(self, keys): if isinstance(keys, six.string_types): - keys = [keys, ] + keys = [keys] self._group_keys -= set(keys) self._widget.delete_group(keys) @@ -311,13 +343,21 @@ def _update_group(self, keys): if is_widgets_available(): display(self._widget) if isinstance(keys, six.string_types): - keys = [keys, ] - data = [fetch_instance_group(key).serialize() for key in keys if exist_instance_group(key)] + keys = [keys] + data = [ + fetch_instance_group(key).serialize() + for key in keys + if exist_instance_group(key) + ] self._widget.update_group(data) def update(self): self._update_text() - data = [fetch_instance_group(key).serialize() for key in self._group_keys if exist_instance_group(key)] + data = [ + fetch_instance_group(key).serialize() + for key in self._group_keys + if exist_instance_group(key) + ] self._widget.update_group(data) def close(self): diff --git a/odps/ui/tests/base.py b/odps/ui/tests/base.py index d02b0bf1..d0f45d4c 100644 --- a/odps/ui/tests/base.py +++ b/odps/ui/tests/base.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,17 +15,16 @@ import os import sys import time - from contextlib import contextmanager -from subprocess import Popen, PIPE +from subprocess import PIPE, Popen from ...compat import Empty from ...tests.core import ignore_case try: - from jupyter_core import paths - from jupyter_client import BlockingKernelClient from ipython_genutils import py3compat + from jupyter_client import BlockingKernelClient + from jupyter_core import paths _has_jupyter = True except ImportError: @@ -46,21 +45,24 @@ def ui_case(func): if _has_jupyter: return func else: - return ignore_case(func, "UI case skipped, since no Jupyter installation found.") + return ignore_case( + func, "UI case skipped, since no Jupyter installation found." 
+ ) def grab_iopub_messages(client, msg_id): try: iopub_msg = {} - while (not iopub_msg or - iopub_msg['parent_header']['msg_id'] != msg_id or - iopub_msg['msg_type'] != 'status' or - 'execution_state' not in iopub_msg['content'] or - iopub_msg['content']['execution_state'] != "idle"): - + while ( + not iopub_msg + or iopub_msg["parent_header"]["msg_id"] != msg_id + or iopub_msg["msg_type"] != "status" + or "execution_state" not in iopub_msg["content"] + or iopub_msg["content"]["execution_state"] != "idle" + ): iopub_msg = client.get_iopub_msg(timeout=TIMEOUT) - if iopub_msg['parent_header']['msg_id'] != msg_id: + if iopub_msg["parent_header"]["msg_id"] != msg_id: continue yield iopub_msg except Empty: @@ -70,21 +72,21 @@ def grab_iopub_messages(client, msg_id): def grab_iopub_comm(client, msg_id): iopub_data = {} for iopub_msg in grab_iopub_messages(client, msg_id): - content = iopub_msg['content'] - if 'comm_id' not in content: + content = iopub_msg["content"] + if "comm_id" not in content: continue - comm_id = content['comm_id'] - if iopub_msg['msg_type'] == 'comm_open': + comm_id = content["comm_id"] + if iopub_msg["msg_type"] == "comm_open": iopub_data[comm_id] = [] - elif iopub_msg['msg_type'] == 'comm_msg' and comm_id in iopub_data: - iopub_data[comm_id].append(content['data']) + elif iopub_msg["msg_type"] == "comm_msg" and comm_id in iopub_data: + iopub_data[comm_id].append(content["data"]) return iopub_data def grab_execute_result(client, msg_id): for iopub_msg in grab_iopub_messages(client, msg_id): - content = iopub_msg['content'] - if iopub_msg['msg_type'] == 'execute_result': + content = iopub_msg["content"] + if iopub_msg["msg_type"] == "execute_result": return content @@ -96,16 +98,18 @@ def setup_kernel(cmd=DEFAULT_CMD): ------- kernel_manager: connected KernelManager instance """ - kernel = Popen([sys.executable, '-c', cmd], stdout=PIPE, stderr=PIPE) + kernel = Popen([sys.executable, "-c", cmd], stdout=PIPE, stderr=PIPE) connection_file = os.path.join( paths.jupyter_runtime_dir(), - 'kernel-%i.json' % kernel.pid, + "kernel-%i.json" % kernel.pid, ) # wait for connection file to exist, timeout after 5s tic = time.time() - while not os.path.exists(connection_file) \ - and kernel.poll() is None \ - and time.time() < tic + SETUP_TIMEOUT: + while ( + not os.path.exists(connection_file) + and kernel.poll() is None + and time.time() < tic + SETUP_TIMEOUT + ): time.sleep(0.1) if kernel.poll() is not None: diff --git a/odps/ui/tests/test_ui.py b/odps/ui/tests/test_ui.py index b8207583..57465bf7 100644 --- a/odps/ui/tests/test_ui.py +++ b/odps/ui/tests/test_ui.py @@ -1,4 +1,4 @@ -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -15,7 +15,7 @@ import json from ...compat import six -from ..tests.base import setup_kernel, ui_case, grab_iopub_comm +from ..tests.base import grab_iopub_comm, setup_kernel, ui_case TIMEOUT = 10 @@ -23,16 +23,15 @@ @ui_case def test_html_notify(): with setup_kernel() as client: - client.execute('from odps.ui import html_notify') + client.execute("from odps.ui import html_notify") shell_msg = client.get_shell_msg(timeout=TIMEOUT) - content = shell_msg['content'] - assert content['status'] == 'ok' + content = shell_msg["content"] + assert content["status"] == "ok" msg_id = client.execute('html_notify("TestMessage")') iopub_data = grab_iopub_comm(client, msg_id) - assert any(u"TestMessage" in json.dumps(l) - for l in six.itervalues(iopub_data)) + assert any(u"TestMessage" in json.dumps(l) for l in six.itervalues(iopub_data)) shell_msg = client.get_shell_msg(timeout=TIMEOUT) - content = shell_msg['content'] - assert content['status'] == 'ok' + content = shell_msg["content"] + assert content["status"] == "ok" diff --git a/odps/utils.py b/odps/utils.py index 5f136330..bf7e24ee 100644 --- a/odps/utils.py +++ b/odps/utils.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 1999-2022 Alibaba Group Holding Ltd. +# Copyright 1999-2024 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -40,17 +40,18 @@ import warnings import xml.dom.minidom from base64 import b64encode -from datetime import datetime, date, timedelta -from email.utils import parsedate_tz, formatdate -from hashlib import sha1, md5 +from datetime import date, datetime, timedelta +from email.utils import formatdate, parsedate_tz +from hashlib import md5, sha1 try: - from collections.abc import Hashable, Mapping, Iterable + from collections.abc import Hashable, Iterable, Mapping except ImportError: from collections import Hashable, Mapping, Iterable from . import compat, options -from .compat import six, getargspec, FixedOffset, parsedate_to_datetime, utc +from .compat import FixedOffset, getargspec, parsedate_to_datetime, six, utc +from .lib.monotonic import monotonic try: import zoneinfo @@ -82,11 +83,12 @@ def _decorator(func): """This is a decorator which can be used to mark functions as deprecated. It will result in a warning being emmitted when the function is used.""" + @six.wraps(func) def _new_func(*args, **kwargs): warn_msg = "Call to deprecated function %s." % func.__name__ if isinstance(msg, six.string_types): - warn_msg += ' ' + msg + warn_msg += " " + msg if cond is None or cond(): warnings.warn(warn_msg, category=DeprecationWarning, stacklevel=2) return func(*args, **kwargs) @@ -105,22 +107,31 @@ class ExperimentalNotAllowed(Exception): def experimental(msg, cond=None): warn_cache = set() + real_cond = cond + if callable(cond): + cond_spec = getargspec(cond) + if not cond_spec.args and not cond_spec.varargs: + real_cond = lambda *_, **__: cond() + def _decorator(func): @six.wraps(func) def _new_func(*args, **kwargs): - warn_msg = "Call to experimental function %s." % func.__name__ - if isinstance(msg, six.string_types): - warn_msg += ' ' + msg - - if not str_to_bool(os.environ.get('PYODPS_EXPERIMENTAL', 'true')): - err_msg = "Calling to experimental method %s is denied." 
% func.__name__ - if isinstance(msg, six.string_types): - err_msg += ' ' + msg - raise ExperimentalNotAllowed(err_msg) - - if func not in warn_cache and (cond is None or cond()): - warnings.warn(warn_msg, category=FutureWarning, stacklevel=2) - warn_cache.add(func) + if real_cond is None or real_cond(*args, **kwargs): + if not str_to_bool(os.environ.get("PYODPS_EXPERIMENTAL", "true")): + err_msg = ( + "Calling to experimental method %s is denied." % func.__name__ + ) + if isinstance(msg, six.string_types): + err_msg += " " + msg + raise ExperimentalNotAllowed(err_msg) + + if func not in warn_cache: + warn_msg = "Call to experimental function %s." % func.__name__ + if isinstance(msg, six.string_types): + warn_msg += " " + msg + + warnings.warn(warn_msg, category=FutureWarning, stacklevel=2) + warn_cache.add(func) return func(*args, **kwargs) # intentionally eliminate __doc__ for Volume 2 @@ -147,18 +158,22 @@ def fixed_writexml(self, writer, indent="", addindent="", newl=""): xml.dom.minidom._write_data(writer, attrs[a_name].value) writer.write("\"") if self.childNodes: - if len(self.childNodes) == 1 \ - and self.childNodes[0].nodeType == xml.dom.minidom.Node.TEXT_NODE: + if ( + len(self.childNodes) == 1 + and self.childNodes[0].nodeType == xml.dom.minidom.Node.TEXT_NODE + ): writer.write(">") self.childNodes[0].writexml(writer, "", "", "") writer.write("</%s>%s" % (self.tagName, newl)) return - writer.write(">%s"%(newl)) + writer.write(">%s" % (newl)) for node in self.childNodes: - node.writexml(writer,indent+addindent,addindent,newl) - writer.write("%s</%s>%s" % (indent,self.tagName,newl)) + node.writexml(writer, indent + addindent, addindent, newl) + writer.write("%s</%s>%s" % (indent, self.tagName, newl)) else: - writer.write("/>%s"%(newl)) + writer.write("/>%s" % (newl)) + + # replace minidom's function with ours xml.dom.minidom.Element.writexml = fixed_writexml xml_fixed = lambda: None @@ -173,8 +188,7 @@ def md5_hexdigest(data): def rshift(val, n): - return val >> n if val >= 0 else \ - (val+0x100000000) >> n + return val >> n if val >= 0 else (val + 0x100000000) >> n def long_bits_to_double(bits): @@ -186,7 +200,7 @@ def long_bits_to_double(bits): @return: the double-precision floating-point value corresponding to the given bit pattern C{bits}. """ - return struct.unpack('d', struct.pack('Q', bits))[0] + return struct.unpack("d", struct.pack("Q", bits))[0] def double_to_raw_long_bits(value): @@ -199,21 +213,21 @@ of the given double-precision floating-point value.
""" # pack double into 64 bits, then unpack as long int - return struct.unpack('Q', struct.pack('d', float(value)))[0] + return struct.unpack("Q", struct.pack("d", float(value)))[0] def camel_to_underline(name): - s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) - return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() + s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name) + return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower() def underline_to_capitalized(name): - return "".join([s[0].upper() + s[1:len(s)] for s in name.strip('_').split('_')]) + return "".join([s[0].upper() + s[1 : len(s)] for s in name.strip("_").split("_")]) def underline_to_camel(name): - parts = name.split('_') - return parts[0] + ''.join(v.title() for v in parts[1:]) + parts = name.split("_") + return parts[0] + "".join(v.title() for v in parts[1:]) def long_to_int(value): @@ -236,27 +250,38 @@ def long_to_uint(value): def stringify_expt(): lines = traceback.format_exception(*sys.exc_info()) - return '\n'.join(lines) + return "\n".join(lines) def str_to_printable(field_name, auto_quote=True): if not field_name: return field_name - escapes = {'\\': '\\\\', '\'': '\\\'', '"': '\\"', '\a': '\\a', '\b': '\\b', '\f': '\\f', - '\n': '\\n', '\r': '\\r', '\t': '\\t', '\v': '\\v', ' ': ' '} + escapes = { + "\\": "\\\\", + '\'': '\\\'', + '"': '\\"', + "\a": "\\a", + "\b": "\\b", + "\f": "\\f", + "\n": "\\n", + "\r": "\\r", + "\t": "\\t", + "\v": "\\v", + " ": " ", + } def _escape_char(c): if c in escapes: return escapes[c] - elif c < ' ': - return '\\x%02x' % ord(c) + elif c < " ": + return "\\x%02x" % ord(c) else: return c - need_escape = lambda c: c <= ' ' or c in escapes + need_escape = lambda c: c <= " " or c in escapes if any(need_escape(c) for c in field_name): - ret = ''.join(_escape_char(ch) for ch in field_name) + ret = "".join(_escape_char(ch) for ch in field_name) return '"' + ret + '"' if auto_quote else ret return field_name @@ -264,9 +289,8 @@ def _escape_char(c): def indent(text, n_spaces): if n_spaces <= 0: return text - block = ' ' * n_spaces - return '\n'.join((block + it) if len(it) > 0 else it - for it in text.split('\n')) + block = " " * n_spaces + return "\n".join((block + it) if len(it) > 0 else it for it in text.split("\n")) def parse_rfc822(s, use_legacy_parsedate=None): @@ -274,7 +298,8 @@ def parse_rfc822(s, use_legacy_parsedate=None): return None use_legacy_parsedate = ( - use_legacy_parsedate if use_legacy_parsedate is not None + use_legacy_parsedate + if use_legacy_parsedate is not None else options.use_legacy_parsedate ) if use_legacy_parsedate: @@ -300,17 +325,22 @@ def gen_rfc822(dt=None, localtime=False, usegmt=False): try: _antique_mills = time.mktime(datetime(1928, 1, 1).timetuple()) * 1000 except OverflowError: - _antique_mills = int( - (datetime(1928, 1, 1) - datetime.utcfromtimestamp(0)).total_seconds() - ) * 1000 + _antique_mills = ( + int((datetime(1928, 1, 1) - datetime.utcfromtimestamp(0)).total_seconds()) + * 1000 + ) _min_datetime_mills = int( (datetime.min - datetime.utcfromtimestamp(0)).total_seconds() * 1000 ) -_antique_errmsg = 'Date older than 1928-01-01 and may contain errors. ' \ - 'Ignore this error by configuring `options.allow_antique_date` to True.' -_min_datetime_errmsg = 'Date exceed range Python can handle. If you are reading data with tunnel, read '\ - 'the value as None by setting options.tunnel.overflow_date_as_none to True, ' \ - 'or convert the value into strings with SQL before processing them with Python.' 
+_antique_errmsg = ( + "Date older than 1928-01-01 and may contain errors. " + "Ignore this error by configuring `options.allow_antique_date` to True." +) +_min_datetime_errmsg = ( + "Date exceed range Python can handle. If you are reading data with tunnel, read " + "the value as None by setting options.tunnel.overflow_date_as_none to True, " + "or convert the value into strings with SQL before processing them with Python." +) def to_timestamp(dt, local_tz=None, is_dst=False): @@ -324,7 +354,9 @@ class MillisecondsConverter(object): def _get_tz(cls, tz): if isinstance(tz, six.string_types): if pytz is None and zoneinfo is None: - raise ImportError('Package `pytz` is needed when specifying string-format time zone.') + raise ImportError( + "Package `pytz` is needed when specifying string-format time zone." + ) else: return get_zone_from_name(tz) else: @@ -354,7 +386,9 @@ def _windows_mktime(self, timetuple): return int((dt - epoch).total_seconds()) def _windows_fromtimestamp(self, seconds): - fromtimestamp = datetime.fromtimestamp if self._local_tz else datetime.utcfromtimestamp + fromtimestamp = ( + datetime.fromtimestamp if self._local_tz else datetime.utcfromtimestamp + ) if seconds >= 0: return fromtimestamp(seconds) epoch = fromtimestamp(0) @@ -382,7 +416,7 @@ def __init__(self, local_tz=None, is_dst=False): self._fromtimestamp = self._windows_fromtimestamp self._tz = self._get_tz(self._local_tz) if not self._use_default_tz else None - if hasattr(self._tz, 'localize'): + if hasattr(self._tz, "localize"): self._localize = lambda dt: self._tz.localize(dt, is_dst=is_dst) else: self._localize = lambda dt: dt.replace(tzinfo=self._tz) @@ -394,11 +428,17 @@ def to_milliseconds(self, dt): dt = self._localize(dt) if dt.tzinfo is not None: - mills = int((calendar.timegm( - dt.astimezone(compat.utc).timetuple()) + dt.microsecond / 1000000.0 - ) * 1000) + mills = int( + ( + calendar.timegm(dt.astimezone(compat.utc).timetuple()) + + dt.microsecond / 1000000.0 + ) + * 1000 + ) else: - mills = int((self._mktime(dt.timetuple()) + dt.microsecond / 1000000.0) * 1000) + mills = int( + (self._mktime(dt.timetuple()) + dt.microsecond / 1000000.0) * 1000 + ) if not self._allow_antique and mills < _antique_mills: raise DatetimeOverflowError(_antique_errmsg) @@ -417,9 +457,11 @@ def from_milliseconds(self, milliseconds): if self._use_default_tz: return self._fromtimestamp(seconds).replace(microsecond=microseconds) else: - return datetime.utcfromtimestamp(seconds)\ - .replace(microsecond=microseconds, tzinfo=compat.utc)\ + return ( + datetime.utcfromtimestamp(seconds) + .replace(microsecond=microseconds, tzinfo=compat.utc) .astimezone(self._tz) + ) def to_milliseconds(dt, local_tz=None, is_dst=False, force_py=False): @@ -448,11 +490,11 @@ def to_datetime(milliseconds, local_tz=None, force_py=False): return f.from_milliseconds(milliseconds) -def strptime_with_tz(dt, format='%Y-%m-%d %H:%M:%S'): +def strptime_with_tz(dt, format="%Y-%m-%d %H:%M:%S"): try: return datetime.strptime(dt, format) except ValueError: - naive_date_str, _, offset_str = dt.rpartition(' ') + naive_date_str, _, offset_str = dt.rpartition(" ") naive_dt = datetime.strptime(naive_date_str, format) offset = int(offset_str[-4:-2]) * 60 + int(offset_str[-2:]) if offset_str[0] == "-": @@ -460,7 +502,7 @@ def strptime_with_tz(dt, format='%Y-%m-%d %H:%M:%S'): return naive_dt.replace(tzinfo=FixedOffset(offset)) -def to_binary(text, encoding='utf-8'): +def to_binary(text, encoding="utf-8"): if text is None: return text if isinstance(text, six.text_type): 
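Note on the MillisecondsConverter hunks above: they only reformat existing logic. A datetime is converted to epoch milliseconds either in the process-local time zone or in an explicitly named zone (which needs pytz or the standard zoneinfo module), and dates before 1928-01-01 raise DatetimeOverflowError unless options.allow_antique_date is enabled. A minimal sketch of the module-level helpers defined in this file, using an illustrative zone name and timestamp that are not taken from the patch (these are internal utilities rather than documented public API):

    from datetime import datetime

    from odps.utils import to_datetime, to_milliseconds

    # interpret a naive datetime in an explicit zone and convert to epoch milliseconds
    # (a string zone name requires pytz or zoneinfo to be importable)
    mills = to_milliseconds(datetime(2024, 10, 3, 9, 56, 17), local_tz="Asia/Shanghai")
    # convert back; the result is timezone-aware when local_tz is given explicitly
    dt = to_datetime(mills, local_tz="Asia/Shanghai")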
@@ -471,7 +513,7 @@ def to_binary(text, encoding='utf-8'): return str(text).encode(encoding) if six.PY3 else str(text) -def to_text(binary, encoding='utf-8'): +def to_text(binary, encoding="utf-8"): if binary is None: return binary if isinstance(binary, (six.binary_type, bytearray)): @@ -482,8 +524,12 @@ def to_text(binary, encoding='utf-8'): return str(binary) if six.PY3 else str(binary).decode(encoding) -def to_str(text, encoding='utf-8'): - return to_text(text, encoding=encoding) if six.PY3 else to_binary(text, encoding=encoding) +def to_str(text, encoding="utf-8"): + return ( + to_text(text, encoding=encoding) + if six.PY3 + else to_binary(text, encoding=encoding) + ) def get_zone_from_name(tzname): @@ -495,7 +541,8 @@ def get_zone_name(tz): # fix encoding conversion problem under windows -if sys.platform == 'win32': +if sys.platform == "win32": + def _replace_default_encoding(func): def _fun(s, encoding=None): return func(s, encoding=encoding or options.display.encoding) @@ -516,8 +563,8 @@ def is_lambda(f): def str_to_kv(string, typ=None): d = dict() - for pair in string.split(','): - k, v = pair.split(':', 1) + for pair in string.split(","): + k, v = pair.split(":", 1) if typ: v = typ(v) d[k] = v @@ -529,16 +576,16 @@ def interval_select(val, intervals, targets): def is_namedtuple(obj): - return isinstance(obj, tuple) and hasattr(obj, '_fields') + return isinstance(obj, tuple) and hasattr(obj, "_fields") def str_to_bool(s): if isinstance(s, bool): return s s = s.lower().strip() - if s == 'true': + if s == "true": return True - elif s == 'false': + elif s == "false": return False else: raise ValueError @@ -556,53 +603,59 @@ def load_text_file(path): file_path = get_root_dir() + path if not os.path.exists(file_path): return None - with codecs.open(file_path, encoding='utf-8') as f: + with codecs.open(file_path, encoding="utf-8") as f: inp_file = f.read() f.close() return inp_file def load_file_paths(pattern): - file_path = os.path.normpath(os.path.dirname(sys.modules[__name__].__file__) + pattern) + file_path = os.path.normpath( + os.path.dirname(sys.modules[__name__].__file__) + pattern + ) return glob.glob(file_path) def load_static_file_paths(path): - return load_file_paths('/static/' + path) + return load_file_paths("/static/" + path) def load_text_files(pattern, func=None): - file_path = os.path.normpath(os.path.dirname(sys.modules[__name__].__file__) + pattern) + file_path = os.path.normpath( + os.path.dirname(sys.modules[__name__].__file__) + pattern + ) content_dict = dict() for file_path in glob.glob(file_path): _, fn = os.path.split(file_path) if func and not func(fn): continue - with codecs.open(file_path, encoding='utf-8') as f: + with codecs.open(file_path, encoding="utf-8") as f: content_dict[fn] = f.read() f.close() return content_dict def load_static_text_file(path): - return load_text_file('/static/' + path) + return load_text_file("/static/" + path) def load_internal_static_text_file(path): - return load_text_file('/internal/static/' + path) + return load_text_file("/internal/static/" + path) def load_static_text_files(pattern, func=None): - return load_text_files('/static/' + pattern, func) + return load_text_files("/static/" + pattern, func) def init_progress_bar(val=1, use_console=True): try: from traitlets import TraitError + ipython = True except ImportError: try: from IPython.utils.traitlets import TraitError + ipython = True except ImportError: ipython = False @@ -647,6 +700,7 @@ def inner(*args, **kwargs): return func(*args, **kwargs) else: return func(*args, 
**kwargs) + return inner class ProgressUI(object): @@ -657,17 +711,17 @@ def update(self, value=None): @ui_method def current_progress(self): - if bar and hasattr(bar, '_current_value'): + if bar and hasattr(bar, "_current_value"): return bar._current_value @ui_method def inc(self, value): - if bar and hasattr(bar, '_current_value'): + if bar and hasattr(bar, "_current_value"): current_val = bar._current_value bar.update(current_val + value) @ui_method - def status(self, prefix, suffix='', clear_keys=False): + def status(self, prefix, suffix="", clear_keys=False): if progress_group: if clear_keys: progress_group.clear_keys() @@ -720,7 +774,7 @@ def escape_odps_string(src): def replace_sql_parameters(sql, ns): - param_re = re.compile(r':([a-zA-Z_][a-zA-Z0-9_]*)') + param_re = re.compile(r":([a-zA-Z_][a-zA-Z0-9_]*)") def is_numeric(val): return isinstance(val, (six.integer_types, float)) @@ -735,8 +789,10 @@ def format_numeric(val): return repr(val) def format_sequence(val): - escaped = [format_numeric(v) if is_numeric(v) else format_string(v) for v in val] - return '({0})'.format(', '.join(escaped)) + escaped = [ + format_numeric(v) if is_numeric(v) else format_string(v) for v in val + ] + return "({0})".format(", ".join(escaped)) def replace(matched): name = matched.group(1) @@ -754,7 +810,7 @@ def replace(matched): def is_main_process(): - return 'main' in multiprocessing.current_process().name.lower() + return "main" in multiprocessing.current_process().name.lower() survey_calls = dict() @@ -765,15 +821,15 @@ def survey(func): def wrapped(*args, **kwargs): arg_spec = getargspec(func) - if 'self' in arg_spec.args: + if "self" in arg_spec.args: func_cls = args[0].__class__ else: func_cls = None if func_cls: - func_sig = '.'.join([func_cls.__module__, func_cls.__name__, func.__name__]) + func_sig = ".".join([func_cls.__module__, func_cls.__name__, func.__name__]) else: - func_sig = '.'.join([func.__module__, func.__name__]) + func_sig = ".".join([func.__module__, func.__name__]) add_survey_call(func_sig) return func(*args, **kwargs) @@ -801,7 +857,7 @@ def clear_survey_calls(): def require_package(pack_name): def _decorator(func): try: - __import__(pack_name, fromlist=['']) + __import__(pack_name, fromlist=[""]) return func except ImportError: return None @@ -810,33 +866,38 @@ def _decorator(func): def gen_repr_object(**kwargs): - obj = type('ReprObject', (), {}) - text = kwargs.pop('text', None) + obj = type("ReprObject", (), {}) + text = kwargs.pop("text", None) if six.PY2 and isinstance(text, unicode): - text = text.encode('utf-8') + text = text.encode("utf-8") if text: - setattr(obj, 'text', text) - setattr(obj, '__repr__', lambda self: text) + setattr(obj, "text", text) + setattr(obj, "__repr__", lambda self: text) for k, v in six.iteritems(kwargs): setattr(obj, k, v) - setattr(obj, '_repr_{0}_'.format(k), lambda self: v) - if 'gv' in kwargs: + setattr(obj, "_repr_{0}_".format(k), lambda self: v) + if "gv" in kwargs: try: from graphviz import Source - setattr(obj, '_repr_svg_', lambda self: Source(self._repr_gv_(), encoding='utf-8')._repr_svg_()) + + setattr( + obj, + "_repr_svg_", + lambda self: Source(self._repr_gv_(), encoding="utf-8")._repr_svg_(), + ) except ImportError: pass return obj() def build_pyodps_dir(*args): - default_dir = os.path.join(os.path.expanduser('~'), '.pyodps') - if sys.platform == 'win32' and 'APPDATA' in os.environ: - win_default_dir = os.path.join(os.environ['APPDATA'], 'pyodps') + default_dir = os.path.join(os.path.expanduser("~"), ".pyodps") + if 
sys.platform == "win32" and "APPDATA" in os.environ: + win_default_dir = os.path.join(os.environ["APPDATA"], "pyodps") if os.path.exists(default_dir): shutil.move(default_dir, win_default_dir) default_dir = win_default_dir - home_dir = os.environ.get('PYODPS_DIR') or default_dir + home_dir = os.environ.get("PYODPS_DIR") or default_dir return os.path.join(home_dir, *args) @@ -848,15 +909,16 @@ def object_getattr(obj, attr, default=None): def attach_internal(cls): - cls_path = cls.__module__ + '.' + cls.__name__ + cls_path = cls.__module__ + "." + cls.__name__ try: from .internal.core import MIXIN_TARGETS + mixin_cls = MIXIN_TARGETS[cls_path] for method_name in dir(mixin_cls): - if method_name.startswith('_'): + if method_name.startswith("_"): continue att = getattr(mixin_cls, method_name) - if six.PY2 and type(att).__name__ in ('instancemethod', 'method'): + if six.PY2 and type(att).__name__ in ("instancemethod", "method"): att = att.__func__ setattr(cls, method_name, att) return cls @@ -867,7 +929,7 @@ def attach_internal(cls): def is_main_thread(): if hasattr(threading, "main_thread"): return threading.current_thread() is threading.main_thread() - return threading.current_thread().__class__.__name__ == '_MainThread' + return threading.current_thread().__class__.__name__ == "_MainThread" def write_log(msg): @@ -875,13 +937,13 @@ def write_log(msg): logger.info(msg) -def split_quoted(s, delimiter=',', maxsplit=0): +def split_quoted(s, delimiter=",", maxsplit=0): pattern = r"""((?:[^""" + delimiter + r""""']|"[^"]*"|'[^']*')+)""" return re.split(pattern, s, maxsplit=maxsplit)[1::2] def gen_temp_table(): - return '%s%s' % (TEMP_TABLE_PREFIX, str(uuid.uuid4()).replace('-', '_')) + return "%s%s" % (TEMP_TABLE_PREFIX, str(uuid.uuid4()).replace("-", "_")) def hashable(obj): @@ -898,7 +960,7 @@ def hashable(obj): def thread_local_attribute(thread_local_name, default_value=None): - attr_name = '_local_attr_%d' % random.randint(0, 99999999) + attr_name = "_local_attr_%d" % random.randint(0, 99999999) def _get_thread_local(self): thread_local = getattr(self, thread_local_name, None) @@ -923,20 +985,28 @@ def _setter(self, value): def call_with_retry(func, *args, **kwargs): retry_num = 0 retry_times = kwargs.pop("retry_times", options.retry_times) + retry_timeout = kwargs.pop("retry_timeout", None) delay = kwargs.pop("delay", options.retry_delay) + reset_func = kwargs.pop("reset_func", None) exc_type = kwargs.pop("exc_type", BaseException) + + start_time = monotonic() if retry_timeout is not None else None while True: try: return func(*args, **kwargs) except exc_type: retry_num += 1 time.sleep(delay) - if retry_num > retry_times: + if retry_times is not None and retry_num > retry_times: + raise + if retry_timeout is not None and monotonic() - start_time > retry_timeout: raise + if callable(reset_func): + reset_func() def get_id(n): - if hasattr(n, '_node_id'): + if hasattr(n, "_node_id"): return n._node_id return id(n) @@ -951,11 +1021,17 @@ def strip_if_str(s): def with_wait_argument(func): - func_spec = compat.getfullargspec(func) if compat.getfullargspec else compat.getargspec(func) + func_spec = ( + compat.getfullargspec(func) + if compat.getfullargspec + else compat.getargspec(func) + ) args_set = set(func_spec.args) if hasattr(func_spec, "kwonlyargs"): args_set |= set(func_spec.kwonlyargs or []) - has_varkw = (getattr(func_spec, "varkw", None) or getattr(func_spec, "keywords", None)) is not None + has_varkw = ( + getattr(func_spec, "varkw", None) or getattr(func_spec, "keywords", None) + ) is 
not None try: async_index = func_spec.args.index("async_") @@ -967,7 +1043,8 @@ def wrapped(*args, **kwargs): if async_index is not None and len(args) >= async_index + 1: warnings.warn( "Please use async_ as a keyword argument, like obj.func(async_=True)", - DeprecationWarning, stacklevel=2 + DeprecationWarning, + stacklevel=2, ) add_survey_call(".".join([func.__module__, func.__name__, "async_"])) elif "wait" in kwargs: @@ -992,7 +1069,7 @@ def wrapped(*args, **kwargs): def split_sql_by_semicolon(sql_statement): sql_statement = sql_statement.replace("\r\n", "\n").replace("\r", "\n") - left_brackets = {'}': '{', ']': '[', ')': '('} + left_brackets = {"}": "{", "]": "[", ")": "("} def cut_statement(stmt_start, stmt_end=None): stmt_end = stmt_end or len(sql_statement) @@ -1003,9 +1080,9 @@ def cut_statement(stmt_start, stmt_end=None): continue if comm_start > stmt_end: break - parts.append(sql_statement[left: comm_start]) + parts.append(sql_statement[left:comm_start]) left = comm_end - parts.append(sql_statement[left: stmt_end]) + parts.append(sql_statement[left:stmt_end]) combined_lines = "".join(parts).splitlines() return "\n".join(line.rstrip() for line in combined_lines).strip() @@ -1018,18 +1095,18 @@ def cut_statement(stmt_start, stmt_end=None): bracket_stack = [] while pos < len(sql_statement): ch = sql_statement[pos] - dch = sql_statement[pos: pos + 2] if pos + 1 < len(sql_statement) else None + dch = sql_statement[pos : pos + 2] if pos + 1 < len(sql_statement) else None if quote_sign is None and comment_sign is None: - if ch in ('{', '[', '('): + if ch in ("{", "[", "("): # start of brackets bracket_stack.append(ch) pos += 1 - elif ch in ('}', ']', ')'): + elif ch in ("}", "]", ")"): # end of brackets assert bracket_stack[-1] == left_brackets[ch] bracket_stack.pop() pos += 1 - elif ch in ('"', "'", '`'): + elif ch in ('"', "'", "`"): # start of quote quote_sign = ch pos += 1 @@ -1038,10 +1115,10 @@ def cut_statement(stmt_start, stmt_end=None): comment_sign = dch comment_pos = pos pos += 2 - elif ch == ';' and not bracket_stack: + elif ch == ";" and not bracket_stack: # semicolon without brackets, quotes and comments part_statement = cut_statement(start, pos + 1) - if part_statement and part_statement != ';': + if part_statement and part_statement != ";": statements.append(part_statement) pos += 1 start = pos @@ -1050,7 +1127,7 @@ def cut_statement(stmt_start, stmt_end=None): elif quote_sign is not None and ch == quote_sign: quote_sign = None pos += 1 - elif quote_sign is not None and ch == '\\': + elif quote_sign is not None and ch == "\\": # skip escape char pos += 2 elif comment_sign == "--" and ch == "\n": @@ -1066,12 +1143,15 @@ def cut_statement(stmt_start, stmt_end=None): else: pos += 1 part_statement = cut_statement(start) - if part_statement and part_statement != ';': + if part_statement and part_statement != ";": statements.append(part_statement) return statements -_convert_host_hash = ("6fe9b6c02efc24159c09863c1aadffb5", "9b75728355160c10b5eb75e4bf105a76") +_convert_host_hash = ( + "6fe9b6c02efc24159c09863c1aadffb5", + "9b75728355160c10b5eb75e4bf105a76", +) _default_host_hash = "c7f4116fbf820f99284dcc89f340b372" @@ -1088,7 +1168,10 @@ def get_default_logview_endpoint(default_endpoint, odps_endpoint): hashed_host = md5_hexdigest(to_binary(parsed_host)) hashed_default_host = md5_hexdigest(to_binary(default_host)) - if hashed_default_host == _default_host_hash and hashed_host in _convert_host_hash: + if ( + hashed_default_host == _default_host_hash + and hashed_host in 
_convert_host_hash + ): suffix = r"\1" + string.ascii_letters[1::-1] * 2 + parsed_host[-8:] return re.sub(r"([a-z])[a-z]{3}\.[a-z]{3}$", suffix, default_endpoint) return default_endpoint @@ -1154,7 +1237,7 @@ def show_versions(): # pragma: no cover results["USE_CLIB"] = False try: - from . import internal + from . import internal # noqa: F401 results["HAS_INTERNAL"] = True except ImportError: @@ -1186,3 +1269,17 @@ def show_versions(): # pragma: no cover key_size = 1 + max(len(key) for key in results.keys()) for key, val in results.items(): print(key + " " * (key_size - len(key)) + ": " + str(val)) + + +def get_supported_python_tag(align=None): + if align is None: + align = options.align_supported_python_tag + if align: + if sys.version_info[:2] >= (3, 11): + return "cp311" + elif sys.version_info[:2] >= (3, 6): + return "cp37" + else: + return "cp27" + else: + return "cp" + str(sys.version_info[0]) + str(sys.version_info[1]) diff --git a/odps_scripts/pyodps_pack.py b/odps_scripts/pyodps_pack.py index b48b146b..14397bbb 100644 --- a/odps_scripts/pyodps_pack.py +++ b/odps_scripts/pyodps_pack.py @@ -1,8 +1,10 @@ #!/usr/bin/env python from __future__ import print_function + import argparse import contextlib import errno +import functools import glob import json import logging @@ -15,7 +17,6 @@ import sys import threading import time -import warnings from collections import defaultdict _DEFAULT_PYABI = "cp37-cp37m" @@ -37,6 +38,10 @@ _SEGFAULT_ERR_CODE = 139 +_COLOR_WARNING = "\033[93m" +_COLOR_FAIL = "\033[91m" +_COLOR_ENDC = "\033[0m" + python_abi_env = os.getenv("PYABI") cmd_before_build = os.getenv("BEFORE_BUILD") or "" cmd_after_build = os.getenv("AFTER_BUILD") or "" @@ -49,15 +54,20 @@ if docker_path and os.path.isfile(docker_path): docker_path = os.path.dirname(docker_path) +_is_py2 = sys.version_info[0] == 2 _is_linux = sys.platform.lower().startswith("linux") _is_macos = sys.platform.lower().startswith("darwin") _is_windows = sys.platform.lower().startswith("win") _is_sudo = "SUDO_USER" in os.environ -_vcs_prefixes = [prefix + "+" for prefix in "git hg svn bzr".split()] +_vcs_names = "git hg svn bzr" +_vcs_dirs = ["." + prefix for prefix in _vcs_names.split()] +_vcs_prefixes = [prefix + "+" for prefix in _vcs_names.split()] + +_color_supported = None logger = logging.getLogger(__name__) -if sys.version_info[0] == 3: +if not _is_py2: unicode = str dynlibs_pyproject_toml = """ @@ -131,8 +141,8 @@ if [[ "{no_deps}" == "true" ]]; then PYPI_NO_DEPS_ARG="--no-deps" fi -if [[ "{without_merge}" == "true" ]]; then - WITHOUT_MERGE="true" +if [[ "{no_merge}" == "true" ]]; then + NO_MERGE="true" fi if [[ "{pypi_pre}" == "true" ]]; then PYPI_EXTRA_ARG="$PYPI_EXTRA_ARG --pre" @@ -231,7 +241,8 @@ }} echo "Building user-defined packages..." -for path in `ls 2>/dev/null`; do +IFS=":" read -ra PACKAGE_PATHS <<< "$SRC_PACKAGE_PATHS" +for path in "${{PACKAGE_PATHS[@]}}"; do if [[ -d "$path" ]]; then path="$path/" fi @@ -344,7 +355,7 @@ HAS_PROTOBUF="1" fi -if [[ -n "$WITHOUT_MERGE" ]]; then +if [[ -n "$NO_MERGE" ]]; then # move all wheels into wheelhouse if [[ -n "$NON_DOCKER_MODE" ]]; then mv "$WHEELS_PATH"/*.whl "$WHEELHOUSE_PATH" @@ -394,7 +405,7 @@ # make sure the package is handled as a binary touch "$INSTALL_PATH/{package_site}/.pyodps-force-bin.so" - if [[ "{skip_scan_pkg_resources}" != "true" ]]; then + if [[ "{skip_scan_pkg_resources}" != "true" ]] && [[ -z "$PYPI_NO_DEPS_ARG" ]]; then echo "" echo "Scanning and installing dependency for pkg_resources if needed..." 
if [[ $(egrep --include=\*.py -Rnw "$INSTALL_PATH/{package_site}" -m 1 -e '^\s*(from|import) +pkg_resources' | grep -n 1) ]]; then @@ -450,6 +461,7 @@ def _indent(text, prefix, predicate=None): """ """Copied from textwrap.indent method of Python 3""" if predicate is None: + def predicate(line): return line.strip() @@ -457,7 +469,38 @@ def prefixed_lines(): for line in text.splitlines(True): yield (prefix + line if predicate(line) else line) - return ''.join(prefixed_lines()) + return "".join(prefixed_lines()) + + +def _print_color(s, *args, **kw): + global _color_supported + + color = kw.pop("color", None) + if _color_supported is None: + plat = sys.platform + if plat == "win32": + try: + from pip._vendor.rich._windows import get_windows_console_features + + supported_platform = get_windows_console_features().truecolor + except: + supported_platform = False + else: + supported_platform = plat != "Pocket PC" + # isatty is not always implemented, #6223. + is_a_tty = hasattr(sys.stdout, "isatty") and sys.stdout.isatty() + + _color_supported = supported_platform and is_a_tty + + kw["file"] = sys.stderr + if color and _color_supported: + print(color + s + _COLOR_ENDC, *args, **kw) + else: + print(s, *args, **kw) + + +_print_warning = functools.partial(_print_color, color=_COLOR_WARNING) +_print_fail = functools.partial(_print_color, color=_COLOR_FAIL) def _makedirs(name, mode=0o777, exist_ok=False): @@ -480,8 +523,8 @@ def _makedirs(name, mode=0o777, exist_ok=False): # Defeats race condition when another thread created the path pass cdir = os.curdir - if isinstance(tail, bytes): - cdir = bytes(os.curdir, 'ASCII') + if not _is_py2 and isinstance(tail, bytes): + cdir = bytes(os.curdir, "ASCII") if tail == cdir: # xxx/newdir/. exists if xxx/newdir exists return try: @@ -514,21 +557,62 @@ def _copy_to_workdir(src_path, work_dir): shutil.copytree(src_path, dest_dir) -def _copy_package_paths(package_paths=None, work_dir=None, skip_user_path=True): +def _find_source_vcs_root(package_path): + def is_root(p): + return p == os.path.sep or p == os.path.dirname(p) + + package_path = package_path.rstrip(os.path.sep) + parent_path = package_path + while not is_root(parent_path) and not any( + os.path.isdir(os.path.join(parent_path, d)) for d in _vcs_dirs + ): + parent_path = os.path.dirname(parent_path) + if is_root(parent_path) or parent_path == package_path: + # if no vcs or vcs on package root, use package path directly + parent_path = package_path + rel_install_path = os.path.basename(parent_path) + else: + # work out relative path for real Python package + rel_install_path = os.path.join( + os.path.basename(parent_path), os.path.relpath(package_path, parent_path) + ).replace(os.path.sep, "/") + return parent_path, rel_install_path + + +def _copy_package_paths( + package_paths=None, work_dir=None, skip_user_path=True, find_vcs_root=False +): remained = [] - for package_path in (package_paths or ()): - base_name = os.path.basename(package_path.rstrip("/").rstrip("\\")) - abs_path = os.path.abspath(package_path) + rel_dirs = [] + for package_path in package_paths or (): + if find_vcs_root: + real_root, rel_install_path = _find_source_vcs_root(package_path) + else: + real_root = package_path + rel_install_path = os.path.basename(package_path.rstrip("/")) + rel_dirs.append(rel_install_path) + + base_name = os.path.basename(real_root.rstrip("/").rstrip("\\")) + abs_path = os.path.abspath(real_root) if not skip_user_path or not abs_path.startswith(os.path.expanduser("~")): # not on user path, copy it into 
build path _copy_to_workdir(base_name, work_dir) else: remained.append(abs_path) - return remained + return remained, rel_dirs -def _build_docker_run_command(container_name, docker_image, work_dir, package_paths, docker_args): - docker_executable = "docker" if not docker_path else os.path.join(docker_path, "docker") +def _build_docker_run_command( + container_name, + docker_image, + work_dir, + package_paths, + docker_args, + find_vcs_root=False, +): + docker_executable = ( + "docker" if not docker_path else os.path.join(docker_path, "docker") + ) script_path_mapping = work_dir + "/scripts:/scripts" wheelhouse_path_mapping = work_dir + "/wheelhouse:/wheelhouse" build_path_mapping = work_dir + "/build:/build" @@ -548,18 +632,23 @@ def _build_docker_run_command(container_name, docker_image, work_dir, package_pa _makedirs(os.path.join(work_dir, "build"), exist_ok=True) cmdline.extend(["-v", build_path_mapping]) - remained = _copy_package_paths(package_paths, work_dir) + remained, rel_paths = _copy_package_paths( + package_paths, work_dir, find_vcs_root=find_vcs_root + ) for abs_path in remained: base_name = os.path.basename(abs_path.rstrip("/").rstrip("\\")) cmdline.extend(["-v", "%s:/build/%s" % (abs_path, base_name)]) - cmdline.extend( - [docker_image, "/bin/bash", "/scripts/%s" % _PACK_SCRIPT_FILE_NAME] - ) + + if rel_paths: + cmdline.extend(["-e", "SRC_PACKAGE_PATHS=%s" % ":".join(rel_paths)]) + cmdline.extend([docker_image, "/bin/bash", "/scripts/%s" % _PACK_SCRIPT_FILE_NAME]) return cmdline def _build_docker_rm_command(container_name): - docker_executable = "docker" if not docker_path else os.path.join(docker_path, "docker") + docker_executable = ( + "docker" if not docker_path else os.path.join(docker_path, "docker") + ) return [docker_executable, "rm", "-f", container_name] @@ -578,7 +667,7 @@ def _create_temp_work_dir( **script_kwargs ): if _is_macos and _is_sudo: - logger.warning( + _print_warning( "You are calling pyodps-pack with sudo under MacOS, which is not needed and may cause " "unexpected permission errors. Try calling pyodps-pack without sudo if you encounter " "such problems." 
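The _find_source_vcs_root helper introduced above walks up from a local package path until it finds a directory carrying a VCS marker (.git, .hg, .svn or .bzr); the whole checkout is then copied into the work directory, and the package's location inside it is handed to the build script through the colon-separated SRC_PACKAGE_PATHS variable consumed by the bash template earlier in this patch. A self-contained sketch of the same walk-up idea, written against a hypothetical directory layout rather than the helper itself:

    import os

    VCS_DIRS = (".git", ".hg", ".svn", ".bzr")

    def find_vcs_root(package_path):
        """Return (copy_root, package path relative to that root)."""
        package_path = os.path.abspath(package_path).rstrip(os.path.sep)
        parent = package_path
        while parent != os.path.dirname(parent) and not any(
            os.path.isdir(os.path.join(parent, d)) for d in VCS_DIRS
        ):
            parent = os.path.dirname(parent)
        if parent == os.path.dirname(parent) or parent == package_path:
            # no VCS marker found, or the package itself is the repository root
            return package_path, os.path.basename(package_path)
        rel = os.path.join(
            os.path.basename(parent), os.path.relpath(package_path, parent)
        ).replace(os.path.sep, "/")
        return parent, rel

    # e.g. with /home/me/repo/.git present, find_vcs_root("/home/me/repo/src/mypkg")
    # returns ("/home/me/repo", "repo/src/mypkg")

This is the behaviour switched on by the new --find-vcs-root command line flag added further below, so that a source checkout is packaged from its repository root rather than only from the leaf directory that was passed in.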
@@ -593,7 +682,7 @@ def _create_temp_work_dir( except ImportError: cache_root = os.path.expanduser("~/.cache/pyodps-pack") - tmp_path = "%s/pack-root-%d" % (cache_root, int(time.time())) + tmp_path = "%s%spack-root-%d" % (cache_root, os.path.sep, int(time.time())) try: _makedirs(tmp_path, exist_ok=True) script_path = os.path.join(tmp_path, "scripts") @@ -603,7 +692,9 @@ def _create_temp_work_dir( if requirement_list: req_text = "\n".join(requirement_list) + "\n" _log_indent("Content of requirements.txt:", req_text) - with open(os.path.join(script_path, _REQUIREMENT_FILE_NAME), "wb") as res_file: + with open( + os.path.join(script_path, _REQUIREMENT_FILE_NAME), "wb" + ) as res_file: res_file.write(_to_unix(req_text)) if vcs_list: @@ -615,17 +706,23 @@ def _create_temp_work_dir( if install_requires: install_req_text = "\n".join(install_requires) + "\n" _log_indent("Content of install-requires.txt:", install_req_text) - with open(os.path.join(script_path, _INSTALL_REQ_FILE_NAME), "wb") as install_req_file: + with open( + os.path.join(script_path, _INSTALL_REQ_FILE_NAME), "wb" + ) as install_req_file: install_req_file.write(_to_unix(install_req_text)) if exclude_list: exclude_text = "\n".join(exclude_list) + "\n" _log_indent("Content of excludes.txt:", exclude_text) - with open(os.path.join(script_path, _EXCLUDE_FILE_NAME), "wb") as exclude_file: + with open( + os.path.join(script_path, _EXCLUDE_FILE_NAME), "wb" + ) as exclude_file: exclude_file.write(_to_unix(exclude_text)) if before_script or cmd_before_build: - with open(os.path.join(script_path, _BEFORE_SCRIPT_FILE_NAME), "wb") as before_script_file: + with open( + os.path.join(script_path, _BEFORE_SCRIPT_FILE_NAME), "wb" + ) as before_script_file: if before_script: with open(before_script, "rb") as src_before_file: before_script_file.write(_to_unix(src_before_file.read())) @@ -633,10 +730,16 @@ def _create_temp_work_dir( if cmd_before_build: before_script_file.write(_to_unix(cmd_before_build.encode())) if logger.getEffectiveLevel() <= logging.DEBUG: - with open(os.path.join(script_path, _BEFORE_SCRIPT_FILE_NAME), "r") as before_script_file: - _log_indent("Content of before-script.sh:", before_script_file.read()) + with open( + os.path.join(script_path, _BEFORE_SCRIPT_FILE_NAME), "r" + ) as before_script_file: + _log_indent( + "Content of before-script.sh:", before_script_file.read() + ) - with open(os.path.join(script_path, _DYNLIB_PYPROJECT_TOML_FILE_NAME), "wb") as toml_file: + with open( + os.path.join(script_path, _DYNLIB_PYPROJECT_TOML_FILE_NAME), "wb" + ) as toml_file: toml_file.write(_to_unix(dynlibs_pyproject_toml.encode())) with open(os.path.join(script_path, _PACK_SCRIPT_FILE_NAME), "wb") as pack_file: @@ -647,7 +750,16 @@ def _create_temp_work_dir( yield tmp_path finally: if tmp_path and os.path.exists(tmp_path): - shutil.rmtree(tmp_path) + if _is_windows: + # permission error may occur when using shutil.rmtree in Windows. + os.system("rd /s /q \"" + tmp_path + "\"") + else: + shutil.rmtree(tmp_path) + if os.path.exists(tmp_path): + _print_warning( + "The temp path %s created by pyodps-pack still exists, you may " + "delete it manually later." % tmp_path + ) try: os.rmdir(cache_root) except OSError: @@ -663,10 +775,10 @@ def split_config(config_str): ) proc.wait() if proc.returncode != 0: - warnings.warn( - 'Failed to call `pip config list`, return code is %s. ' + _print_warning( + "Failed to call `pip config list`, return code is %s. " 'Will use default index instead. 
Specify "-i " ' - 'if you want to use another package index.' % proc.returncode + "if you want to use another package index." % proc.returncode ) return {} @@ -748,7 +860,7 @@ def _collect_env_packages(exclude_editable=False, exclude=None, index_url=None): else: specifiers.append("%s==%s" % (desc["name"], desc["version"])) if missing_packs: - warnings.warn( + _print_warning( "Cannot find packages %s in package index. These packages cannot be included." % ",".join(missing_packs) ) @@ -796,7 +908,9 @@ def _get_python_abi_version(python_version=None, mcpy27=None, dwpy27=None): python_abi_version += "m" if dwpy27: if mcpy27: - raise PackException("You should not specify '--dwpy27' and '--mcpy27' at the same time.") + raise PackException( + "You should not specify '--dwpy27' and '--mcpy27' at the same time." + ) python_abi_version = _DWPY27_PYABI elif mcpy27: python_abi_version = _MCPY27_PYABI @@ -808,7 +922,10 @@ def _get_bash_path(): if not _is_windows: return "/bin/bash" - import winreg + try: + import winreg + except ImportError: + import _winreg as winreg key = None try: @@ -891,7 +1008,9 @@ def rewrite_docker_cmd_part(part): def _main(parsed_args): if parsed_args.debug: logging.basicConfig(level=logging.DEBUG) - logger.info("System environment variables: %s", json.dumps(dict(os.environ), indent=2)) + logger.info( + "System environment variables: %s", json.dumps(dict(os.environ), indent=2) + ) if parsed_args.pack_env: if parsed_args.specifiers: @@ -905,7 +1024,11 @@ def _main(parsed_args): _filter_local_package_paths(parsed_args) _collect_install_requires(parsed_args) - if not parsed_args.specifiers and not parsed_args.package_path and not parsed_args.vcs_urls: + if ( + not parsed_args.specifiers + and not parsed_args.package_path + and not parsed_args.vcs_urls + ): raise PackException("ERROR: You must give at least one requirement to install.") file_cfg = _get_default_pypi_config() @@ -923,9 +1046,11 @@ def _first_or_none(list_val): prefer_binary_str = "true" if parsed_args.prefer_binary else "" no_deps_str = "true" if parsed_args.no_deps else "" debug_str = "true" if parsed_args.debug else "" - without_merge_str = "true" if parsed_args.without_merge else "" + no_merge_str = "true" if parsed_args.no_merge else "" use_pep517_str = str(parsed_args.use_pep517).lower() - check_build_dependencies_str = "true" if parsed_args.check_build_dependencies else "" + check_build_dependencies_str = ( + "true" if parsed_args.check_build_dependencies else "" + ) skip_scan_pkg_resources_str = "true" if parsed_args.skip_scan_pkg_resources else "" pre_str = "true" if parsed_args.pre else "" timeout_str = parsed_args.timeout or _first_or_none(file_cfg.get("timeout")) or "" @@ -964,7 +1089,7 @@ def _first_or_none(list_val): check_build_dependencies=check_build_dependencies_str, skip_scan_pkg_resources=skip_scan_pkg_resources_str, no_deps=no_deps_str, - without_merge=without_merge_str, + no_merge=no_merge_str, python_abi_version=python_abi_version, pypi_trusted_hosts=trusted_hosts_str, dynlibs=dynlibs_str, @@ -972,25 +1097,37 @@ def _first_or_none(list_val): ) as work_dir: container_name = "pack-cnt-%d" % int(time.time()) - use_legacy_image = parsed_args.legacy_image or parsed_args.mcpy27 or parsed_args.dwpy27 + use_legacy_image = ( + parsed_args.legacy_image or parsed_args.mcpy27 or parsed_args.dwpy27 + ) default_image = _get_default_image(use_legacy_image, parsed_args.arch) docker_image = docker_image_env or default_image minikube_mount_proc = None - if pack_in_cluster or parsed_args.without_docker: - 
_copy_package_paths(parsed_args.package_path, work_dir, skip_user_path=False) + if pack_in_cluster or parsed_args.no_docker: + _, rel_dirs = _copy_package_paths( + parsed_args.package_path, + work_dir, + skip_user_path=False, + find_vcs_root=parsed_args.find_vcs_root, + ) pyversion, pyabi = python_abi_version.split("-", 1) pyversion = pyversion[2:] - build_cmd = [_get_bash_path(), os.path.join(work_dir, "scripts", _PACK_SCRIPT_FILE_NAME)] + build_cmd = [ + _get_bash_path(), + os.path.join(work_dir, "scripts", _PACK_SCRIPT_FILE_NAME), + ] build_env = { - "PACK_ROOT": work_dir, + "PACK_ROOT": str(work_dir), "PYPLATFORM": default_image.replace("quay.io/pypa/", ""), "PYVERSION": pyversion, "PYABI": pyabi, "TARGET_ARCH": _get_arch(parsed_args.arch), } - if parsed_args.without_docker: + if rel_dirs: + build_env["SRC_PACKAGE_PATHS"] = ":".join(rel_dirs) + if parsed_args.no_docker: build_env["NON_DOCKER_MODE"] = "true" build_env["PYEXECUTABLE"] = _get_local_pack_executable(work_dir) else: @@ -998,8 +1135,9 @@ def _first_or_none(list_val): build_env = os.environ.copy() build_env.update(temp_env) build_env["PACK_IN_CLUSTER"] = "true" + build_cwd = os.getcwd() logger.debug("Command: %r", build_cmd) - logger.debug("Environment variables: %r", build_cmd) + logger.debug("Environment variables: %r", build_env) else: build_cmd = _build_docker_run_command( container_name, @@ -1007,19 +1145,23 @@ def _first_or_none(list_val): work_dir, parsed_args.package_path, parsed_args.docker_args, + find_vcs_root=parsed_args.find_vcs_root, ) build_cmd, minikube_mount_proc = _rewrite_minikube_command(build_cmd) build_env = None + build_cwd = None logger.debug("Docker command: %r", build_cmd) try: - proc = subprocess.Popen(build_cmd, env=build_env) + proc = subprocess.Popen(build_cmd, env=build_env, cwd=build_cwd) except OSError as ex: if ex.errno != errno.ENOENT: raise - logger.error("Failed to execute command %r, the error message is %s.", build_cmd, ex) - if pack_in_cluster or parsed_args.without_docker: + logger.error( + "Failed to execute command %r, the error message is %s.", build_cmd, ex + ) + if pack_in_cluster or parsed_args.no_docker: if _is_windows: raise PackException( "Cannot locate git bash. Please install Git for Windows or " @@ -1031,7 +1173,7 @@ def _first_or_none(list_val): else: raise PackException( "Cannot locate docker. Please install it, reopen your terminal and " - "retry. Or you may try `--without-docker` instead. If you've already " + "retry. Or you may try `--no-docker` instead. If you've already " "installed Docker, you may specify the path of its executable via " "DOCKER_PATH environment." ) @@ -1040,7 +1182,7 @@ def _first_or_none(list_val): proc.wait() except KeyboardInterrupt: cancelled = True - if not parsed_args.without_docker and not pack_in_cluster: + if not parsed_args.no_docker and not pack_in_cluster: docker_rm_cmd = _build_docker_rm_command(container_name) logger.debug("Docker rm command: %r", docker_rm_cmd) subprocess.Popen(docker_rm_cmd, stdout=subprocess.PIPE) @@ -1050,20 +1192,22 @@ def _first_or_none(list_val): minikube_mount_proc.terminate() if proc.returncode != 0: - cancelled = cancelled or os.path.exists(os.path.join(work_dir, "scripts", ".cancelled")) + cancelled = cancelled or os.path.exists( + os.path.join(work_dir, "scripts", ".cancelled") + ) if cancelled: - print("Cancelled by user.") + _print_warning("Cancelled by user.") else: - if parsed_args.without_docker: - print( + if parsed_args.no_docker: + _print_fail( "Errors occurred when creating your package. 
This is often caused " "by mismatching Python version, platform or architecture when " "encountering binary packages. Please check outputs for details. " "You may try building your packages inside Docker by removing " - "--without-docker option, which often resolves the issue." + "--no-docker option, which often resolves the issue." ) else: - print( + _print_fail( "Errors occurred when creating your package. Please check outputs " "for details. You may add a `--debug` option to obtain more " "information. Please provide all outputs with `--debug` specified " @@ -1071,18 +1215,18 @@ def _first_or_none(list_val): ) if proc.returncode == _SEGFAULT_ERR_CODE and use_legacy_image: - print( + _print_fail( "Image manylinux1 might crash silently under some Docker environments. " "You may try under a native Linux environment. Details can be seen at " "https://mail.python.org/pipermail/wheel-builders/2016-December/000239.html." ) elif _is_linux and "SUDO_USER" not in os.environ: - print( + _print_fail( "You need to run pyodps-pack with sudo to make sure docker is " "executed properly." ) else: - if parsed_args.without_merge: + if parsed_args.no_merge: src_path = os.path.join(work_dir, "wheelhouse", "*.whl") for wheel_name in glob.glob(src_path): shutil.move(wheel_name, os.path.basename(wheel_name)) @@ -1092,11 +1236,11 @@ def _first_or_none(list_val): if _is_linux and "SUDO_UID" in os.environ and "SUDO_GID" in os.environ: own_desc = "%s:%s" % (os.environ["SUDO_UID"], os.environ["SUDO_GID"]) - target_path = "*.whl" if parsed_args.without_merge else parsed_args.output + target_path = "*.whl" if parsed_args.no_merge else parsed_args.output chown_proc = subprocess.Popen(["chown", own_desc, target_path]) chown_proc.wait() - if parsed_args.without_merge: + if parsed_args.no_merge: print("Result wheels stored at current dir") else: print("Result package stored as %s" % parsed_args.output) @@ -1106,145 +1250,227 @@ def _first_or_none(list_val): def main(): parser = argparse.ArgumentParser() parser.add_argument( - "specifiers", metavar="REQ", nargs="*", + "specifiers", + metavar="REQ", + nargs="*", help="a requirement item compatible with pip command", ) parser.add_argument( - "--requirement", "-r", action="append", default=[], - metavar="PATH", help="Path of requirements.txt including file name", + "--requirement", + "-r", + action="append", + default=[], + metavar="PATH", + help="Path of requirements.txt including file name", ) parser.add_argument( - "--install-requires", action="append", default=[], help="Requirement for install time" + "--install-requires", + action="append", + default=[], + help="Requirement for install time", ) parser.add_argument( - "--install-requires-file", action="append", default=[], + "--install-requires-file", + action="append", + default=[], help="Requirement file for install time", ) + parser.add_argument("--run-before", help="Prepare script before package build.") parser.add_argument( - "--run-before", help="Prepare script before package build." - ) - parser.add_argument( - "--no-deps", action="store_true", default=False, + "--no-deps", + action="store_true", + default=False, help="Don't put package dependencies into archives", ) parser.add_argument( - "--pre", action="store_true", default=False, + "--pre", + action="store_true", + default=False, help="Include pre-release and development versions. 
" - "By default, pyodps-pack only finds stable versions.", + "By default, pyodps-pack only finds stable versions.", ) parser.add_argument( - "--proxy", metavar="proxy", + "--proxy", + metavar="proxy", help="Specify a proxy in the form scheme://[user:passwd@]proxy.server:port.", ) parser.add_argument( - "--retries", metavar="retries", + "--retries", + metavar="retries", help="Maximum number of retries each connection should attempt (default 5 times).", ) parser.add_argument( - "--timeout", metavar="sec", help="Set the socket timeout (default 15 seconds).", + "--timeout", + metavar="sec", + help="Set the socket timeout (default 15 seconds).", ) parser.add_argument( - "--exclude", "-X", action="append", default=[], - metavar="DEPEND", help="Requirements to exclude from the package", + "--exclude", + "-X", + action="append", + default=[], + metavar="DEPEND", + help="Requirements to exclude from the package", ) parser.add_argument( - "--index-url", "-i", default="", + "--index-url", + "-i", + default="", help="Base URL of PyPI package. If absent, will use " - "`global.index-url` in `pip config list` command by default.", + "`global.index-url` in `pip config list` command by default.", ) parser.add_argument( - "--extra-index-url", metavar="url", action="append", default=[], + "--extra-index-url", + metavar="url", + action="append", + default=[], help="Extra URLs of package indexes to use in addition to --index-url. " - "Should follow the same rules as --index-url.", + "Should follow the same rules as --index-url.", ) parser.add_argument( - "--trusted-host", metavar="host", action="append", default=[], + "--trusted-host", + metavar="host", + action="append", + default=[], help="Mark this host or host:port pair as trusted, " - "even though it does not have valid or any HTTPS.", + "even though it does not have valid or any HTTPS.", ) parser.add_argument( - "--legacy-image", "-l", action="store_true", default=False, + "--legacy-image", + "-l", + action="store_true", + default=False, help="Use legacy image to make packages", ) parser.add_argument( - "--mcpy27", action="store_true", default=False, + "--mcpy27", + action="store_true", + default=False, help="Build package for Python 2.7 on MaxCompute. " - "If enabled, will assume `legacy-image` to be true.", + "If enabled, will assume `legacy-image` to be true.", ) parser.add_argument( - "--dwpy27", action="store_true", default=False, + "--dwpy27", + action="store_true", + default=False, help="Build package for Python 2.7 on DataWorks. " - "If enabled, will assume `legacy-image` to be true.", + "If enabled, will assume `legacy-image` to be true.", ) parser.add_argument( - "--prefer-binary", action="store_true", default=False, + "--prefer-binary", + action="store_true", + default=False, help="Prefer older binary packages over newer source packages", ) parser.add_argument( - "--output", "-o", default="packages.tar.gz", help="Target archive file name to store" + "--output", + "-o", + default="packages.tar.gz", + help="Target archive file name to store", ) parser.add_argument( - "--dynlib", action="append", default=[], + "--dynlib", + action="append", + default=[], help="Dynamic library to include. Can be an absolute path to a .so library " - "or library name with or without 'lib' prefix." 
+ "or library name with or without 'lib' prefix.", ) parser.add_argument( "--pack-env", action="store_true", default=False, help="Pack full environment" ) parser.add_argument( - "--exclude-editable", action="store_true", default=False, + "--exclude-editable", + action="store_true", + default=False, help="Exclude editable packages when packing", ) parser.add_argument( - "--use-pep517", action="store_true", default=None, + "--use-pep517", + action="store_true", + default=None, help="Use PEP 517 for building source distributions (use --no-use-pep517 to force legacy behaviour).", ) parser.add_argument( - "--no-use-pep517", action="store_false", dest="use_pep517", default=None, help=argparse.SUPPRESS, + "--no-use-pep517", + action="store_false", + dest="use_pep517", + default=None, + help=argparse.SUPPRESS, ) parser.add_argument( - "--check-build-dependencies", action="store_true", default=None, + "--check-build-dependencies", + action="store_true", + default=None, help="Check the build dependencies when PEP517 is used.", ) parser.add_argument( - "--arch", default="x86_64", + "--arch", + default="x86_64", help="Architecture of target package, x86_64 by default. Currently only x86_64 " - "and aarch64 supported. Do not use this argument if you are not running " - "your code in a proprietary cloud." + "and aarch64 supported. Do not use this argument if you are not running " + "your code in a proprietary cloud.", ) parser.add_argument( "--python-version", help="Version of Python your environment is on, for instance 3.6. " - "You may also use 36 instead. Do not use this argument if you " - "are not running your code in a proprietary cloud." + "You may also use 36 instead. Do not use this argument if you " + "are not running your code in a proprietary cloud.", ) + parser.add_argument("--docker-args", help="Extra arguments for Docker.") parser.add_argument( - "--docker-args", help="Extra arguments for Docker." + "--no-docker", + action="store_true", + default=False, + help="Create packages without Docker. May cause errors if incompatible " + "binaries involved.", ) parser.add_argument( - "--without-docker", action="store_true", default=False, - help="Create packages without Docker. May cause errors if incompatible " - "binaries involved.", + "--without-docker", action="store_true", default=False, help=argparse.SUPPRESS ) parser.add_argument( - "--without-merge", action="store_true", default=False, + "--no-merge", + action="store_true", + default=False, help="Create or download wheels without merging them.", ) parser.add_argument( - "--skip-scan-pkg-resources", action="store_true", default=False, + "--without-merge", action="store_true", default=False, help=argparse.SUPPRESS + ) + parser.add_argument( + "--skip-scan-pkg-resources", + action="store_true", + default=False, help="Skip scanning for usage of pkg-resources package.", ) parser.add_argument( - "--debug", action="store_true", default=False, + "--find-vcs-root", + action="store_true", + default=False, + help="Find VCS root when building local source code.", + ) + parser.add_argument( + "--debug", + action="store_true", + default=False, help="Dump debug messages for diagnose purpose", ) args = parser.parse_args() + if args.without_docker: + _print_warning( + "DEPRECATION: --without-docker is deprecated, use --no-docker instead." + ) + args.no_docker = True + if args.without_merge: + _print_warning( + "DEPRECATION: --without-merge is deprecated, use --no-merge instead." 
+ ) + args.no_merge = True try: sys.exit(_main(args) or 0) except PackException as ex: - print(ex.args[0], file=sys.stderr) + _print_fail(ex.args[0]) sys.exit(1) diff --git a/odps_scripts/pyou.py b/odps_scripts/pyou.py index 56eebb39..c6ecd850 100644 --- a/odps_scripts/pyou.py +++ b/odps_scripts/pyou.py @@ -1,8 +1,9 @@ #!/usr/bin/env python - +import argparse import os import sys +from odps import ODPS from odps.udf.tools import runners @@ -17,35 +18,86 @@ def _chr_if_necessary(s): def main(): sys.path.insert(0, os.getcwd()) - from odps import udf - # Arguments parsing - import argparse - parser = argparse.ArgumentParser(description='ODPS Python UDF tools') - parser.add_argument('-D', metavar='delim', type=str, default=',', - help='Line delimiter that separates lines into columns, ' - 'default is ","') - parser.add_argument('-N', metavar='null', type=str, default='NULL', - help='NULL indicator') - parser.add_argument('-I', metavar='stdin', type=str, default='sys.stdin', - help='standard input, sys.stdin as default') - parser.add_argument('clz', metavar='your_script.class_name', type=str, help='The full import path of your UDF class') + parser = argparse.ArgumentParser(description="ODPS Python UDF tools") + parser.add_argument( + "-D", + metavar="delim", + type=str, + default=",", + help="Line delimiter that separates lines into columns, default is \",\"", + ) + parser.add_argument( + "-N", metavar="null", type=str, default="NULL", help="NULL indicator" + ) + parser.add_argument( + "-I", + metavar="stdin", + type=str, + default="sys.stdin", + help="standard input, sys.stdin as default", + ) + parser.add_argument( + "-t", + "--table", + metavar="table", + type=str, + help="table name, can also specify partitions and columns like table.p(p1=1,p2=2).c(c1,c2)", + ) + parser.add_argument("--project", metavar="project", type=str, help="project name") + parser.add_argument( + "--access-id", metavar="access_id", type=str, help="access id of ODPS" + ) + parser.add_argument( + "--secret-access-key", + metavar="secret_access_key", + type=str, + help="access key of ODPS", + ) + parser.add_argument( + "--endpoint", metavar="endpoint", type=str, help="endpoint of ODPS" + ) + parser.add_argument( + "--record-limit", + metavar="record_limit", + type=int, + default=None, + help="limitation of records", + ) + parser.add_argument( + "clz", + metavar="your_script.class_name", + type=str, + help="The full import path of your UDF class", + ) args = parser.parse_args() + delim = _chr_if_necessary(args.D) null_indicator = _chr_if_necessary(args.N) + table_desc = args.table + access_id = args.access_id + secret_access_key = args.secret_access_key + project = args.project + endpoint = args.endpoint + record_limit = args.record_limit + # Import user class - pkg, name = args.clz.rsplit('.', 1) + pkg, name = args.clz.rsplit(".", 1) usermod = __import__(pkg, globals(), locals(), [name]) clz = getattr(usermod, name) # get stdin - pkg, name = args.I.rsplit('.', 1) + pkg, name = args.I.rsplit(".", 1) usermod = __import__(pkg, globals(), locals(), [name]) stdin = getattr(usermod, name) - udf_runner = runners.get_default_runner(clz, delim, null_indicator, stdin) + if table_desc and access_id and secret_access_key: + odps_entry = ODPS(access_id, secret_access_key, project, endpoint) + udf_runner = runners.get_table_runner(clz, odps_entry, table_desc, record_limit) + else: + udf_runner = runners.get_csv_runner(clz, delim, null_indicator, stdin) udf_runner.run() -if __name__ == '__main__': +if __name__ == "__main__": main() 
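
Note on the reworked `pyou` entry point above: when `--table` and the ODPS credentials are supplied, the script builds an `ODPS` entry and hands the UDF class to `runners.get_table_runner`; otherwise it keeps the original CSV/stdin path via `runners.get_csv_runner`. Below is a minimal programmatic sketch mirroring that logic; the UDF class `MyLower`, its module `my_script`, and all credential and table values are hypothetical placeholders, while the runner functions and their argument order are taken from the diff itself.

    from odps import ODPS
    from odps.udf.tools import runners

    from my_script import MyLower  # hypothetical UDF class, i.e. the `clz` argument parsed by pyou

    # build the ODPS entry the same way main() does when --table and credentials are present
    odps_entry = ODPS("<access-id>", "<secret-access-key>", "<project>", "<endpoint>")

    # table descriptor format as documented in the --table help text: table.p(p1=1,p2=2).c(c1,c2)
    runner = runners.get_table_runner(MyLower, odps_entry, "my_table.c(col1)", 100)
    runner.run()

    # without --table, main() instead builds the CSV runner on standard input:
    # runner = runners.get_csv_runner(MyLower, ",", "NULL", sys.stdin)

On the command line this corresponds to an invocation along the lines of `pyou -t "my_table.c(col1)" --project <project> --access-id <id> --secret-access-key <key> --endpoint <endpoint> --record-limit 100 my_script.MyLower`, with every concrete value here being a placeholder rather than a documented default.
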
diff --git a/pyproject.toml b/pyproject.toml index b9e4bf3a..f44458a1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,3 +31,35 @@ exclude_lines = [ "return NotImplemented", "if TYPE_CHECKING:", ] + +[tool.black] +line-length = 88 +skip-string-normalization = true # make sure py27 unicode strings not modified +target-version = ['py34'] # make sure no commas added after kwargs +extend-exclude = ''' +^/benchmarks/.* +| ^/bin/.* +| ^/cupid/.* +| ^/docs/.* +| ^/misc/.* +| ^/odps/mars_extension/.* +| ^/odps/df/.* +| ^/odps/lib/.* +| ^/odps/ml/.* +| ^/odps/static/.* +| ^/odps_scripts/.* +''' + +[tool.isort] +profile = "black" +py_version = 36 +skip_glob = [ + "cupid/*", + "examples/*", + "misc/*", + "odps/df/*", + "odps/lib/*", + "odps/mars_extension/*", + "odps/ml/*", + "odps_scripts/*", +] diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 00000000..1ab953e3 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,69 @@ +[flake8] +max-line-length = 88 +select = + E9, + E101, + E111, + E117, + E127, + E201, + E202, + E223, + E224, + E225, + E231, + E242, + E251, + E273, + E274, + E275, + E301, + E302, + E303, + E304, + E305, + E401, + E703, + E901, + E999, + F7, + F63, + F82, + F401, + F811, + F821, + F822, + F823, + F841, + W191, + W291, + W292, + W293, + W391, + W601, + W602, + W603, + W604, + W605 +exclude = + __init__.py + __pycache__ + .git/ + benchmarks/ + build/ + bin/ + cupid/ + dist/ + docs/ + env/ + examples/ + misc/ + odps/compat.py + odps/df/* + odps/lib/* + odps/mars_extension/* + odps/ml/* + odps/udf/* + odps_scripts/* + *.pyi + setup.py \ No newline at end of file diff --git a/setup.py b/setup.py index 9f659fc8..31455144 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ import shutil import sys -from setuptools import setup, find_packages, Extension +from setuptools import Extension, find_packages, setup from setuptools.command.install import install try: @@ -43,24 +43,26 @@ # 10.9 system or above, overriding distuitls behaviour which is to target # the version that python was built for. 
This may be overridden by setting # MACOSX_DEPLOYMENT_TARGET before calling setup.py -if sys.platform == 'darwin': - if 'MACOSX_DEPLOYMENT_TARGET' not in os.environ: +if sys.platform == "darwin": + if "MACOSX_DEPLOYMENT_TARGET" not in os.environ: current_system = Version(platform.mac_ver()[0]) - python_target = Version(get_config_var('MACOSX_DEPLOYMENT_TARGET')) - if python_target < Version('10.9') and current_system >= Version('10.9'): - os.environ['MACOSX_DEPLOYMENT_TARGET'] = '10.9' + python_target = Version(get_config_var("MACOSX_DEPLOYMENT_TARGET")) + if python_target < Version("10.9") and current_system >= Version("10.9"): + os.environ["MACOSX_DEPLOYMENT_TARGET"] = "10.9" repo_root = os.path.dirname(os.path.abspath(__file__)) try: execfile except NameError: + def execfile(fname, globs, locs=None): locs = locs or globs exec(compile(open(fname).read(), fname, "exec"), globs, locs) + version_ns = {} -execfile(os.path.join(repo_root, 'odps', '_version.py'), version_ns) +execfile(os.path.join(repo_root, "odps", "_version.py"), version_ns) extra_install_cmds = [] @@ -111,132 +113,141 @@ def run(self): install.run(self) [self.run_command(cmd) for cmd in extra_install_cmds] + version = sys.version_info PY2 = version[0] == 2 PY3 = version[0] == 3 -PYPY = platform.python_implementation().lower() == 'pypy' +PYPY = platform.python_implementation().lower() == "pypy" if PY2 and version[:2] < (2, 7): - raise Exception('PyODPS supports Python 2.7+ (including Python 3+).') + raise Exception("PyODPS supports Python 2.7+ (including Python 3+).") try: import distribute - raise Exception("PyODPS cannot be installed when 'distribute' is installed. " - "Please uninstall it before installing PyODPS.") + + raise Exception( + "PyODPS cannot be installed when 'distribute' is installed. " + "Please uninstall it before installing PyODPS." + ) except ImportError: pass try: import pip + for pk in pip.get_installed_distributions(): - if pk.key == 'odps': - raise Exception('Package `odps` collides with PyODPS. Please uninstall it before installing PyODPS.') + if pk.key == "odps": + raise Exception( + "Package `odps` collides with PyODPS. Please uninstall it before installing PyODPS." 
+ ) except (ImportError, AttributeError): pass try: from jupyter_core.paths import jupyter_data_dir + has_jupyter = True except ImportError: has_jupyter = False try: from jupyterlab import __version__ + has_jupyterlab = True except ImportError: has_jupyterlab = False -if len(sys.argv) > 1 and sys.argv[1] == 'clean': +if len(sys.argv) > 1 and sys.argv[1] == "clean": build_cmd = sys.argv[1] else: build_cmd = None requirements = [] -with open('requirements.txt') as f: +with open("requirements.txt") as f: requirements.extend(f.read().splitlines()) full_requirements = [ - 'jupyter>=1.0.0', - 'ipython>=4.0.0', - 'numpy>=1.6.0', - 'pandas>=0.17.0', - 'matplotlib>=1.4', - 'graphviz>=0.4', - 'greenlet>=0.4.10', - 'ipython<6.0.0; python_version < "3"', - 'cython>=0.20; sys_platform != "win32"', + "jupyter>=1.0.0", + "ipython>=4.0.0", + "numpy>=1.6.0", + "pandas>=0.17.0", + "matplotlib>=1.4", + "graphviz>=0.4", + "greenlet>=0.4.10", + "ipython<6.0.0; python_version < \"3\"", + "cython>=0.20; sys_platform != \"win32\"", ] mars_requirements = [ - 'pymars>=0.5.4', - 'protobuf>=3.6,<4.0', + "pymars>=0.5.4", + "protobuf>=3.6,<4.0", ] long_description = None -if os.path.exists('README.rst'): - with open('README.rst') as f: +if os.path.exists("README.rst"): + with open("README.rst") as f: long_description = f.read() setup_options = dict( - name='pyodps', - version=version_ns['__version__'], - description='ODPS Python SDK and data analysis framework', + name="pyodps", + version=version_ns["__version__"], + description="ODPS Python SDK and data analysis framework", long_description=long_description, - author='Wu Wei', - author_email='weiwu@cacheme.net', - maintainer='Wenjun Si', - maintainer_email='wenjun.swj@alibaba-inc.com', - url='http://github.com/aliyun/aliyun-odps-python-sdk', - license='Apache License 2.0', + author="Wu Wei", + author_email="weiwu@cacheme.net", + maintainer="Wenjun Si", + maintainer_email="wenjun.swj@alibaba-inc.com", + url="http://github.com/aliyun/aliyun-odps-python-sdk", + license="Apache License 2.0", classifiers=[ - 'Operating System :: OS Independent', - 'Programming Language :: Python', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3.11', - 'Programming Language :: Python :: Implementation :: CPython', - 'Programming Language :: Python :: Implementation :: PyPy', - 'Topic :: Software Development :: Libraries', + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 2", + "Programming Language :: Python :: 2.7", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.4", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", + "Topic :: Software Development :: Libraries", ], - cmdclass={'install': CustomInstall}, - 
packages=find_packages(exclude=('*.tests.*', '*.tests')), + cmdclass={"install": CustomInstall}, + packages=find_packages(exclude=("*.tests.*", "*.tests")), include_package_data=True, install_requires=requirements, include_dirs=[], - extras_require={'full': full_requirements, 'mars': mars_requirements}, + extras_require={"full": full_requirements, "mars": mars_requirements}, entry_points={ - 'sqlalchemy.dialects': [ - 'odps = odps.sqlalchemy_odps:ODPSDialect', - 'maxcompute = odps.sqlalchemy_odps:ODPSDialect', + "sqlalchemy.dialects": [ + "odps = odps.sqlalchemy_odps:ODPSDialect", + "maxcompute = odps.sqlalchemy_odps:ODPSDialect", ], - 'superset.db_engine_specs': [ - 'odps = odps.superset_odps:ODPSEngineSpec', + "superset.db_engine_specs": [ + "odps = odps.superset_odps:ODPSEngineSpec", ], - 'console_scripts': [ - 'pyou = odps_scripts.pyou:main', - 'pyodps-pack = odps_scripts.pyodps_pack:main', + "console_scripts": [ + "pyou = odps_scripts.pyou:main", + "pyodps-pack = odps_scripts.pyodps_pack:main", ], }, ) -if build_cmd != 'clean' and not PYPY: # skip cython in pypy +if build_cmd != "clean" and not PYPY: # skip cython in pypy try: + import cython from Cython.Build import cythonize from Cython.Distutils import build_ext - import cython # detect if cython works - if sys.platform == 'win32': - cython.inline('return a + b', a=1, b=1) + if sys.platform == "win32": + cython.inline("return a + b", a=1, b=1) cythonize_kw = dict(language_level=sys.version_info[0]) - extension_kw = dict(language='c++', include_dirs=[]) + extension_kw = dict(language="c++", include_dirs=[]) if "MSC" in sys.version: extra_compile_args = ["/Ot", "/I" + os.path.join(repo_root, "misc")] extension_kw["extra_compile_args"] = extra_compile_args @@ -252,37 +263,47 @@ def run(self): cythonize_kw["compiler_directives"] = {"linetrace": True} extensions = [ - Extension('odps.src.types_c', ['odps/src/types_c.pyx'], **extension_kw), - Extension('odps.src.crc32c_c', ['odps/src/crc32c_c.pyx'], **extension_kw), - Extension('odps.src.utils_c', ['odps/src/utils_c.pyx'], **extension_kw), - Extension('odps.tunnel.pb.encoder_c', ['odps/tunnel/pb/encoder_c.pyx'], **extension_kw), - Extension('odps.tunnel.pb.decoder_c', ['odps/tunnel/pb/decoder_c.pyx'], **extension_kw), - Extension('odps.tunnel.io.writer_c', ['odps/tunnel/io/writer_c.pyx'], **extension_kw), - Extension('odps.tunnel.io.reader_c', ['odps/tunnel/io/reader_c.pyx'], **extension_kw), - Extension('odps.tunnel.checksum_c', ['odps/tunnel/checksum_c.pyx'], **extension_kw), - Extension('odps.tunnel.hasher_c', ['odps/tunnel/hasher_c.pyx'], **extension_kw), + Extension("odps.src.types_c", ["odps/src/types_c.pyx"], **extension_kw), + Extension("odps.src.crc32c_c", ["odps/src/crc32c_c.pyx"], **extension_kw), + Extension("odps.src.utils_c", ["odps/src/utils_c.pyx"], **extension_kw), + Extension( + "odps.tunnel.pb.encoder_c", + ["odps/tunnel/pb/encoder_c.pyx"], + **extension_kw + ), + Extension( + "odps.tunnel.pb.decoder_c", + ["odps/tunnel/pb/decoder_c.pyx"], + **extension_kw + ), + Extension( + "odps.tunnel.io.writer_c", + ["odps/tunnel/io/writer_c.pyx"], + **extension_kw + ), + Extension( + "odps.tunnel.io.reader_c", + ["odps/tunnel/io/reader_c.pyx"], + **extension_kw + ), + Extension( + "odps.tunnel.checksum_c", ["odps/tunnel/checksum_c.pyx"], **extension_kw + ), + Extension( + "odps.tunnel.hasher_c", ["odps/tunnel/hasher_c.pyx"], **extension_kw + ), ] - try: - import numpy as np - np_extension_kw = extension_kw.copy() - np_extension_kw['include_dirs'].append(np.get_include()) - 
extensions.extend([ - Extension('odps.tunnel.pdio.pdreader_c', ['odps/tunnel/pdio/pdreader_c.pyx'], **np_extension_kw), - Extension('odps.tunnel.pdio.pdwriter_c', ['odps/tunnel/pdio/pdwriter_c.pyx'], **np_extension_kw), - Extension('odps.tunnel.pdio.block_decoder_c', ['odps/tunnel/pdio/block_decoder_c.pyx'], **np_extension_kw), - Extension('odps.tunnel.pdio.block_encoder_c', ['odps/tunnel/pdio/block_encoder_c.pyx'], **np_extension_kw), - ]) - setup_options['include_dirs'].append(np.get_include()) - except ImportError: - pass - setup_options['cmdclass'].update({'build_ext': build_ext}) + setup_options["cmdclass"].update({"build_ext": build_ext}) force_recompile = bool(int(os.getenv("CYTHON_FORCE_RECOMPILE", "0"))) - setup_options['ext_modules'] = cythonize(extensions, force=force_recompile, **cythonize_kw) + setup_options["ext_modules"] = cythonize( + extensions, force=force_recompile, **cythonize_kw + ) except: pass -if build_cmd != 'clean' and has_jupyter: +if build_cmd != "clean" and has_jupyter: + class InstallJS(Command): description = "install JavaScript extensions" user_options = [] @@ -294,8 +315,8 @@ def finalize_options(self): pass def run(self): - src_dir = os.path.join(repo_root, 'odps', 'static', 'ui', 'target') - dest_dir = os.path.join(jupyter_data_dir(), 'nbextensions', 'pyodps') + src_dir = os.path.join(repo_root, "odps", "static", "ui", "target") + dest_dir = os.path.join(jupyter_data_dir(), "nbextensions", "pyodps") if os.path.exists(dest_dir): shutil.rmtree(dest_dir) if not os.path.exists(dest_dir): @@ -306,14 +327,11 @@ def run(self): from notebook.nbextensions import enable_nbextension except ImportError: return - enable_nbextension('notebook', 'pyodps/main') - + enable_nbextension("notebook", "pyodps/main") class BuildJS(Command): description = "build JavaScript files" - user_options = [ - ('registry=', None, 'npm registry') - ] + user_options = [("registry=", None, "npm registry")] def initialize_options(self): self.registry = None @@ -322,80 +340,52 @@ def finalize_options(self): pass def run(self): - if not which('npm'): - raise Exception('You need to install npm before building the scripts.') + if not which("npm"): + raise Exception("You need to install npm before building the scripts.") cwd = os.getcwd() - os.chdir(os.path.join(os.path.abspath(os.getcwd()), 'odps', 'static', 'ui')) - cmd = 'npm install' - if getattr(self, 'registry', None): - cmd += ' --registry=' + self.registry - print('executing ' + cmd) + os.chdir(os.path.join(os.path.abspath(os.getcwd()), "odps", "static", "ui")) + cmd = "npm install" + if getattr(self, "registry", None): + cmd += " --registry=" + self.registry + print("executing " + cmd) ret = os.system(cmd) ret >>= 8 if ret != 0: - print(cmd + ' exited with error: %d' % ret) + print(cmd + " exited with error: %d" % ret) - print('executing grunt') - ret = os.system('npm run grunt') + print("executing grunt") + ret = os.system("npm run grunt") ret >>= 8 if ret != 0: - print('grunt exited with error: %d' % ret) + print("grunt exited with error: %d" % ret) os.chdir(cwd) - - setup_options['cmdclass'].update({'install_js': InstallJS, 'build_js': BuildJS}) - extra_install_cmds.append('install_js') - -if build_cmd != 'clean' and has_jupyterlab: - class InstallJupyterLabExtension(Command): - description = "install Jupyterlab Extension" - user_options = [ - ('registry=', 'r', 'npm registry') - ] - - def initialize_options(self): - self.registry = 'https://registry.npm.taobao.org' - - def finalize_options(self): - pass - - def run(self): - 
lab_location = os.path.join(os.path.abspath(os.getcwd()), 'odps', 'lab_extension') - if not os.path.exists(lab_location): - return - os.chdir(lab_location) - print("\033[1;34m" + "Install pyodps-lab-extension" + "\033[0;0m") - os.system('npm install --registry=' + self.registry) - os.system('pip install .') - print("\033[0;32m" + "pyodps-lab-extension install success" + "\033[0;0m") - - - setup_options['cmdclass'].update({'install_jlab': InstallJupyterLabExtension}) - extra_install_cmds.append('install_jlab') + setup_options["cmdclass"].update({"install_js": InstallJS, "build_js": BuildJS}) + extra_install_cmds.append("install_js") setup(**setup_options) -if build_cmd == 'clean': - for root, dirs, files in os.walk(os.path.normpath('odps/')): +if build_cmd == "clean": + for root, dirs, files in os.walk(os.path.normpath("odps/")): pyx_files = set() c_file_pairs = [] - if '__pycache__' in dirs: - full_path = os.path.join(root, '__pycache__') + if "__pycache__" in dirs: + full_path = os.path.join(root, "__pycache__") print("removing '%s'" % full_path) shutil.rmtree(full_path) for f in files: fn, ext = os.path.splitext(f) # delete compiled binaries - if ext.lower() in ('.pyd', '.so', '.pyc'): + if ext.lower() in (".pyd", ".so", ".pyc"): full_path = os.path.join(root, f) print("removing '%s'" % full_path) os.unlink(full_path) - elif ext.lower() == '.pyx': + elif ext.lower() == ".pyx": pyx_files.add(fn) - elif ext.lower() in ('.c', '.cpp', '.cc'): + elif ext.lower() in (".c", ".cpp", ".cc"): c_file_pairs.append((fn, f)) # remove cython-generated files