@@ -156,7 +156,7 @@ Here are the companies that have officially adopted DataHub. Please feel free to
- [DataHub Blog](https://blog.datahubproject.io/)
- [DataHub YouTube Channel](https://www.youtube.com/channel/UC3qFQC5IiwR5fvWEqi_tJ5w)
-- [Optum: Data Mesh via DataHub](https://optum.github.io/blog/2022/03/23/data-mesh-via-datahub/)
+- [Optum: Data Mesh via DataHub](https://opensource.optum.com/blog/2022/03/23/data-mesh-via-datahub)
- [Saxo Bank: Enabling Data Discovery in Data Mesh](https://medium.com/datahub-project/enabling-data-discovery-in-a-data-mesh-the-saxo-journey-451b06969c8f)
- [Bringing The Power Of The DataHub Real-Time Metadata Graph To Everyone At Acryl Data](https://www.dataengineeringpodcast.com/acryl-data-datahub-metadata-graph-episode-230/)
- [DataHub: Popular Metadata Architectures Explained](https://engineering.linkedin.com/blog/2020/datahub-popular-metadata-architectures-explained)
diff --git a/docs-website/build.gradle b/docs-website/build.gradle
index 370ae3eec91761..a213ec1ae8194d 100644
--- a/docs-website/build.gradle
+++ b/docs-website/build.gradle
@@ -89,7 +89,7 @@ task fastReload(type: YarnTask) {
args = ['run', 'generate-rsync']
}
-task yarnLint(type: YarnTask, dependsOn: [yarnInstall]) {
+task yarnLint(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate]) {
inputs.files(projectMdFiles)
args = ['run', 'lint-check']
outputs.dir("dist")
diff --git a/docs-website/markdown-link-check-config.json b/docs-website/markdown-link-check-config.json
new file mode 100644
index 00000000000000..26e040edde6f79
--- /dev/null
+++ b/docs-website/markdown-link-check-config.json
@@ -0,0 +1,50 @@
+{
+ "ignorePatterns": [
+ {
+ "pattern": "^http://demo\\.datahubproject\\.io"
+ },
+ {
+ "pattern": "^http://localhost"
+ },
+ {
+ "pattern": "^http://www.famfamfam.com"
+ },
+ {
+ "pattern": "^http://www.linkedin.com"
+ },
+ {
+ "pattern": "\\.md$"
+ },
+ {
+ "pattern":"\\.json$"
+ },
+ {
+ "pattern":"\\.txt$"
+ },
+ {
+ "pattern": "\\.java$"
+ },
+ {
+ "pattern": "\\.md#.*$"
+ },
+ {
+ "pattern": "^https://oauth2.googleapis.com/token"
+ },
+ {
+ "pattern": "^https://login.microsoftonline.com/common/oauth2/na$"
+ },
+ {
+ "pattern": "#v(\\d+)-(\\d+)-(\\d+)"
+ },
+ {
+ "pattern": "^https://github.com/mohdsiddique$"
+ },
+ {
+ "pattern": "^https://github.com/2x$"
+ },
+ {
+ "pattern": "^https://github.com/datahub-project/datahub/assets/15873986/2f47d033-6c2b-483a-951d-e6d6b807f0d0%22%3E$"
+ }
+ ],
+ "aliveStatusCodes": [200, 206, 0, 999, 400, 401, 403]
+}
\ No newline at end of file
diff --git a/docs-website/package.json b/docs-website/package.json
index 400ef4143c786a..1722f921696927 100644
--- a/docs-website/package.json
+++ b/docs-website/package.json
@@ -17,7 +17,7 @@
"generate": "rm -rf genDocs genStatic && mkdir genDocs genStatic && yarn _generate-docs && mv docs/* genDocs/ && rmdir docs",
"generate-rsync": "mkdir -p genDocs genStatic && yarn _generate-docs && rsync -v --checksum -r -h -i --delete docs/ genDocs && rm -rf docs",
"lint": "prettier -w generateDocsDir.ts sidebars.js src/pages/index.js",
- "lint-check": "prettier -l generateDocsDir.ts sidebars.js src/pages/index.js",
+ "lint-check": "prettier -l generateDocsDir.ts sidebars.js src/pages/index.js && find ./genDocs -name \\*.md -not -path \"./genDocs/python-sdk/models.md\" -print0 | xargs -0 -n1 markdown-link-check -p -q -c markdown-link-check-config.json",
"lint-fix": "prettier --write generateDocsDir.ts sidebars.js src/pages/index.js"
},
"dependencies": {
@@ -37,6 +37,7 @@
"docusaurus-graphql-plugin": "0.5.0",
"docusaurus-plugin-sass": "^0.2.1",
"dotenv": "^16.0.1",
+ "markdown-link-check": "^3.11.2",
"markprompt": "^0.1.7",
"react": "^18.2.0",
"react-dom": "18.2.0",
diff --git a/docs-website/yarn.lock b/docs-website/yarn.lock
index 0613fe71ef78ee..5698029bff70a8 100644
--- a/docs-website/yarn.lock
+++ b/docs-website/yarn.lock
@@ -3414,6 +3414,11 @@ async-validator@^4.1.0:
resolved "https://registry.yarnpkg.com/async-validator/-/async-validator-4.2.5.tgz#c96ea3332a521699d0afaaceed510a54656c6339"
integrity sha512-7HhHjtERjqlNbZtqNqy2rckN/SpOOlmDliet+lP7k+eKZEjPk3DgyeU9lIXLdeLz0uBbbVp+9Qdow9wJWgwwfg==
+async@^3.2.4:
+ version "3.2.4"
+ resolved "https://registry.yarnpkg.com/async/-/async-3.2.4.tgz#2d22e00f8cddeb5fde5dd33522b56d1cf569a81c"
+ integrity sha512-iAB+JbDEGXhyIUavoDl9WP/Jj106Kz9DEn1DPgYw5ruDn0e3Wgi3sKFm55sASdGBNOQB8F59d9qQ7deqrHA8wQ==
+
asynckit@^0.4.0:
version "0.4.0"
resolved "https://registry.yarnpkg.com/asynckit/-/asynckit-0.4.0.tgz#c79ed97f7f34cb8f2ba1bc9790bcc366474b4b79"
@@ -3765,6 +3770,11 @@ chalk@^4.0.0, chalk@^4.1.0, chalk@^4.1.2:
ansi-styles "^4.1.0"
supports-color "^7.1.0"
+chalk@^5.2.0:
+ version "5.3.0"
+ resolved "https://registry.yarnpkg.com/chalk/-/chalk-5.3.0.tgz#67c20a7ebef70e7f3970a01f90fa210cb6860385"
+ integrity sha512-dLitG79d+GV1Nb/VYcCDFivJeK1hiukt9QjRNVOsUtTy1rR1YJsmpGGTZ3qJos+uw7WmWF4wUwBd9jxjocFC2w==
+
character-entities-legacy@^1.0.0:
version "1.1.4"
resolved "https://registry.yarnpkg.com/character-entities-legacy/-/character-entities-legacy-1.1.4.tgz#94bc1845dce70a5bb9d2ecc748725661293d8fc1"
@@ -3797,7 +3807,7 @@ cheerio-select@^2.1.0:
domhandler "^5.0.3"
domutils "^3.0.1"
-cheerio@^1.0.0-rc.12:
+cheerio@^1.0.0-rc.10, cheerio@^1.0.0-rc.12:
version "1.0.0-rc.12"
resolved "https://registry.yarnpkg.com/cheerio/-/cheerio-1.0.0-rc.12.tgz#788bf7466506b1c6bf5fae51d24a2c4d62e47683"
integrity sha512-VqR8m68vM46BNnuZ5NtnGBKIE/DfN0cRIzg9n40EIq9NOv90ayxLBXA8fXC5gquFRGJSTRqBq25Jt2ECLR431Q==
@@ -3984,6 +3994,11 @@ comma-separated-tokens@^2.0.0:
resolved "https://registry.yarnpkg.com/comma-separated-tokens/-/comma-separated-tokens-2.0.3.tgz#4e89c9458acb61bc8fef19f4529973b2392839ee"
integrity sha512-Fu4hJdvzeylCfQPp9SGWidpzrMs7tTrlu6Vb8XGaRGck8QSNZJJp538Wrb60Lax4fPwR64ViY468OIUTbRlGZg==
+commander@^10.0.1:
+ version "10.0.1"
+ resolved "https://registry.yarnpkg.com/commander/-/commander-10.0.1.tgz#881ee46b4f77d1c1dccc5823433aa39b022cbe06"
+ integrity sha512-y4Mg2tXshplEbSGzx7amzPwKKOCGuoSRP/CjEdwwk0FOGlUbq6lKuoyDZTNZkmxHdJtp54hdfY/JUrdL7Xfdug==
+
commander@^2.20.0:
version "2.20.3"
resolved "https://registry.yarnpkg.com/commander/-/commander-2.20.3.tgz#fd485e84c03eb4881c20722ba48035e8531aeb33"
@@ -4385,6 +4400,13 @@ debug@4, debug@^4.0.0, debug@^4.1.0, debug@^4.1.1:
dependencies:
ms "2.1.2"
+debug@^3.2.6:
+ version "3.2.7"
+ resolved "https://registry.yarnpkg.com/debug/-/debug-3.2.7.tgz#72580b7e9145fb39b6676f9c5e5fb100b934179a"
+ integrity sha512-CFjzYYAi4ThfiQvizrFQevTTXHtnCqWfe7x1AhgEscTz6ZbLbfoLRLPugTQyBth6f8ZERVUSyWHFD/7Wu4t1XQ==
+ dependencies:
+ ms "^2.1.1"
+
decode-named-character-reference@^1.0.0:
version "1.0.2"
resolved "https://registry.yarnpkg.com/decode-named-character-reference/-/decode-named-character-reference-1.0.2.tgz#daabac9690874c394c81e4162a0304b35d824f0e"
@@ -5551,6 +5573,13 @@ html-entities@^2.3.2:
resolved "https://registry.yarnpkg.com/html-entities/-/html-entities-2.4.0.tgz#edd0cee70402584c8c76cc2c0556db09d1f45061"
integrity sha512-igBTJcNNNhvZFRtm8uA6xMY6xYleeDwn3PeBCkDz7tHttv4F2hsDI2aPgNERWzvRcNYHNT3ymRaQzllmXj4YsQ==
+html-link-extractor@^1.0.5:
+ version "1.0.5"
+ resolved "https://registry.yarnpkg.com/html-link-extractor/-/html-link-extractor-1.0.5.tgz#a4be345cb13b8c3352d82b28c8b124bb7bf5dd6f"
+ integrity sha512-ADd49pudM157uWHwHQPUSX4ssMsvR/yHIswOR5CUfBdK9g9ZYGMhVSE6KZVHJ6kCkR0gH4htsfzU6zECDNVwyw==
+ dependencies:
+ cheerio "^1.0.0-rc.10"
+
html-minifier-terser@^6.0.2, html-minifier-terser@^6.1.0:
version "6.1.0"
resolved "https://registry.yarnpkg.com/html-minifier-terser/-/html-minifier-terser-6.1.0.tgz#bfc818934cc07918f6b3669f5774ecdfd48f32ab"
@@ -5673,6 +5702,13 @@ iconv-lite@0.4.24:
dependencies:
safer-buffer ">= 2.1.2 < 3"
+iconv-lite@^0.6.3:
+ version "0.6.3"
+ resolved "https://registry.yarnpkg.com/iconv-lite/-/iconv-lite-0.6.3.tgz#a52f80bf38da1952eb5c681790719871a1a72501"
+ integrity sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==
+ dependencies:
+ safer-buffer ">= 2.1.2 < 3.0.0"
+
icss-utils@^5.0.0, icss-utils@^5.1.0:
version "5.1.0"
resolved "https://registry.yarnpkg.com/icss-utils/-/icss-utils-5.1.0.tgz#c6be6858abd013d768e98366ae47e25d5887b1ae"
@@ -5795,6 +5831,11 @@ ipaddr.js@^2.0.1:
resolved "https://registry.yarnpkg.com/ipaddr.js/-/ipaddr.js-2.1.0.tgz#2119bc447ff8c257753b196fc5f1ce08a4cdf39f"
integrity sha512-LlbxQ7xKzfBusov6UMi4MFpEg0m+mAm9xyNGEduwXMEDuf4WfzB/RZwMVYEd7IKGvh4IUkEXYxtAVu9T3OelJQ==
+is-absolute-url@^4.0.1:
+ version "4.0.1"
+ resolved "https://registry.yarnpkg.com/is-absolute-url/-/is-absolute-url-4.0.1.tgz#16e4d487d4fded05cfe0685e53ec86804a5e94dc"
+ integrity sha512-/51/TKE88Lmm7Gc4/8btclNXWS+g50wXhYJq8HWIBAGUBnoAdRu1aXeh364t/O7wXDAcTJDP8PNuNKWUDWie+A==
+
is-alphabetical@1.0.4, is-alphabetical@^1.0.0:
version "1.0.4"
resolved "https://registry.yarnpkg.com/is-alphabetical/-/is-alphabetical-1.0.4.tgz#9e7d6b94916be22153745d184c298cbf986a686d"
@@ -5963,6 +6004,13 @@ is-regexp@^1.0.0:
resolved "https://registry.yarnpkg.com/is-regexp/-/is-regexp-1.0.0.tgz#fd2d883545c46bac5a633e7b9a09e87fa2cb5069"
integrity sha512-7zjFAPO4/gwyQAAgRRmqeEeyIICSdmCqa3tsVHMdBzaXXRiqopZL4Cyghg/XulGWrtABTpbnYYzzIRffLkP4oA==
+is-relative-url@^4.0.0:
+ version "4.0.0"
+ resolved "https://registry.yarnpkg.com/is-relative-url/-/is-relative-url-4.0.0.tgz#4d8371999ff6033b76e4d9972fb5bf496fddfa97"
+ integrity sha512-PkzoL1qKAYXNFct5IKdKRH/iBQou/oCC85QhXj6WKtUQBliZ4Yfd3Zk27RHu9KQG8r6zgvAA2AQKC9p+rqTszg==
+ dependencies:
+ is-absolute-url "^4.0.1"
+
is-root@^2.1.0:
version "2.1.0"
resolved "https://registry.yarnpkg.com/is-root/-/is-root-2.1.0.tgz#809e18129cf1129644302a4f8544035d51984a9c"
@@ -6010,6 +6058,13 @@ isarray@~1.0.0:
resolved "https://registry.yarnpkg.com/isarray/-/isarray-1.0.0.tgz#bb935d48582cba168c06834957a54a3e07124f11"
integrity sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==
+isemail@^3.2.0:
+ version "3.2.0"
+ resolved "https://registry.yarnpkg.com/isemail/-/isemail-3.2.0.tgz#59310a021931a9fb06bbb51e155ce0b3f236832c"
+ integrity sha512-zKqkK+O+dGqevc93KNsbZ/TqTUFd46MwWjYOoMrjIMZ51eU7DtQG3Wmd9SQQT7i7RVnuTPEiYEWHU3MSbxC1Tg==
+ dependencies:
+ punycode "2.x.x"
+
isexe@^2.0.0:
version "2.0.0"
resolved "https://registry.yarnpkg.com/isexe/-/isexe-2.0.0.tgz#e8fbf374dc556ff8947a10dcb0572d633f2cfa10"
@@ -6205,6 +6260,16 @@ lines-and-columns@^1.1.6:
resolved "https://registry.yarnpkg.com/lines-and-columns/-/lines-and-columns-1.2.4.tgz#eca284f75d2965079309dc0ad9255abb2ebc1632"
integrity sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==
+link-check@^5.2.0:
+ version "5.2.0"
+ resolved "https://registry.yarnpkg.com/link-check/-/link-check-5.2.0.tgz#595a339d305900bed8c1302f4342a29c366bf478"
+ integrity sha512-xRbhYLaGDw7eRDTibTAcl6fXtmUQ13vkezQiTqshHHdGueQeumgxxmQMIOmJYsh2p8BF08t8thhDQ++EAOOq3w==
+ dependencies:
+ is-relative-url "^4.0.0"
+ isemail "^3.2.0"
+ ms "^2.1.3"
+ needle "^3.1.0"
+
loader-runner@^4.2.0:
version "4.3.0"
resolved "https://registry.yarnpkg.com/loader-runner/-/loader-runner-4.3.0.tgz#c1b4a163b99f614830353b16755e7149ac2314e1"
@@ -6366,6 +6431,28 @@ markdown-escapes@^1.0.0:
resolved "https://registry.yarnpkg.com/markdown-escapes/-/markdown-escapes-1.0.4.tgz#c95415ef451499d7602b91095f3c8e8975f78535"
integrity sha512-8z4efJYk43E0upd0NbVXwgSTQs6cT3T06etieCMEg7dRbzCbxUCK/GHlX8mhHRDcp+OLlHkPKsvqQTCvsRl2cg==
+markdown-link-check@^3.11.2:
+ version "3.11.2"
+ resolved "https://registry.yarnpkg.com/markdown-link-check/-/markdown-link-check-3.11.2.tgz#303a8a03d4a34c42ef3158e0b245bced26b5d904"
+ integrity sha512-zave+vI4AMeLp0FlUllAwGbNytSKsS3R2Zgtf3ufVT892Z/L6Ro9osZwE9PNA7s0IkJ4onnuHqatpsaCiAShJw==
+ dependencies:
+ async "^3.2.4"
+ chalk "^5.2.0"
+ commander "^10.0.1"
+ link-check "^5.2.0"
+ lodash "^4.17.21"
+ markdown-link-extractor "^3.1.0"
+ needle "^3.2.0"
+ progress "^2.0.3"
+
+markdown-link-extractor@^3.1.0:
+ version "3.1.0"
+ resolved "https://registry.yarnpkg.com/markdown-link-extractor/-/markdown-link-extractor-3.1.0.tgz#0d5a703630d791a9e2017449e1a9b294f2d2b676"
+ integrity sha512-r0NEbP1dsM+IqB62Ru9TXLP/HDaTdBNIeylYXumuBi6Xv4ufjE1/g3TnslYL8VNqNcGAGbMptQFHrrdfoZ/Sug==
+ dependencies:
+ html-link-extractor "^1.0.5"
+ marked "^4.1.0"
+
markdown-table@^3.0.0:
version "3.0.3"
resolved "https://registry.yarnpkg.com/markdown-table/-/markdown-table-3.0.3.tgz#e6331d30e493127e031dd385488b5bd326e4a6bd"
@@ -6376,6 +6463,11 @@ marked@^2.0.3:
resolved "https://registry.yarnpkg.com/marked/-/marked-2.1.3.tgz#bd017cef6431724fd4b27e0657f5ceb14bff3753"
integrity sha512-/Q+7MGzaETqifOMWYEA7HVMaZb4XbcRfaOzcSsHZEith83KGlvaSG33u0SKu89Mj5h+T8V2hM+8O45Qc5XTgwA==
+marked@^4.1.0:
+ version "4.3.0"
+ resolved "https://registry.yarnpkg.com/marked/-/marked-4.3.0.tgz#796362821b019f734054582038b116481b456cf3"
+ integrity sha512-PRsaiG84bK+AMvxziE/lCFss8juXjNaWzVbN5tXAm4XjeaS9NAHhop+PjQxz2A9h8Q4M/xGmzP8vqNwy6JeK0A==
+
markprompt@^0.1.7:
version "0.1.7"
resolved "https://registry.yarnpkg.com/markprompt/-/markprompt-0.1.7.tgz#fa049e11109d93372c45c38b3ca40bd5fdf751ea"
@@ -6978,7 +7070,7 @@ ms@2.1.2:
resolved "https://registry.yarnpkg.com/ms/-/ms-2.1.2.tgz#d09d1f357b443f493382a8eb3ccd183872ae6009"
integrity sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==
-ms@2.1.3:
+ms@2.1.3, ms@^2.1.1, ms@^2.1.3:
version "2.1.3"
resolved "https://registry.yarnpkg.com/ms/-/ms-2.1.3.tgz#574c8138ce1d2b5861f0b44579dbadd60c6615b2"
integrity sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==
@@ -7001,6 +7093,15 @@ napi-build-utils@^1.0.1:
resolved "https://registry.yarnpkg.com/napi-build-utils/-/napi-build-utils-1.0.2.tgz#b1fddc0b2c46e380a0b7a76f984dd47c41a13806"
integrity sha512-ONmRUqK7zj7DWX0D9ADe03wbwOBZxNAfF20PlGfCWQcD3+/MakShIHrMqx9YwPTfxDdF1zLeL+RGZiR9kGMLdg==
+needle@^3.1.0, needle@^3.2.0:
+ version "3.2.0"
+ resolved "https://registry.yarnpkg.com/needle/-/needle-3.2.0.tgz#07d240ebcabfd65c76c03afae7f6defe6469df44"
+ integrity sha512-oUvzXnyLiVyVGoianLijF9O/RecZUf7TkBfimjGrLM4eQhXyeJwM6GeAWccwfQ9aa4gMCZKqhAOuLaMIcQxajQ==
+ dependencies:
+ debug "^3.2.6"
+ iconv-lite "^0.6.3"
+ sax "^1.2.4"
+
negotiator@0.6.3:
version "0.6.3"
resolved "https://registry.yarnpkg.com/negotiator/-/negotiator-0.6.3.tgz#58e323a72fedc0d6f9cd4d31fe49f51479590ccd"
@@ -7753,6 +7854,11 @@ process-nextick-args@~2.0.0:
resolved "https://registry.yarnpkg.com/process-nextick-args/-/process-nextick-args-2.0.1.tgz#7820d9b16120cc55ca9ae7792680ae7dba6d7fe2"
integrity sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==
+progress@^2.0.3:
+ version "2.0.3"
+ resolved "https://registry.yarnpkg.com/progress/-/progress-2.0.3.tgz#7e8cf8d8f5b8f239c1bc68beb4eb78567d572ef8"
+ integrity sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==
+
promise@^7.1.1:
version "7.3.1"
resolved "https://registry.yarnpkg.com/promise/-/promise-7.3.1.tgz#064b72602b18f90f29192b8b1bc418ffd1ebd3bf"
@@ -7805,16 +7911,16 @@ pump@^3.0.0:
end-of-stream "^1.1.0"
once "^1.3.1"
+punycode@2.x.x, punycode@^2.1.0:
+ version "2.3.0"
+ resolved "https://registry.yarnpkg.com/punycode/-/punycode-2.3.0.tgz#f67fa67c94da8f4d0cfff981aee4118064199b8f"
+ integrity sha512-rRV+zQD8tVFys26lAGR9WUuS4iUAngJScM+ZRSKtvl5tKeZ2t5bvdNFdNHBW9FWR4guGHlgmsZ1G7BSm2wTbuA==
+
punycode@^1.3.2:
version "1.4.1"
resolved "https://registry.yarnpkg.com/punycode/-/punycode-1.4.1.tgz#c0d5a63b2718800ad8e1eb0fa5269c84dd41845e"
integrity sha512-jmYNElW7yvO7TV33CjSmvSiE2yco3bV2czu/OzDKdMNVZQWfxCblURLhf+47syQRBntjfLdd/H0egrzIG+oaFQ==
-punycode@^2.1.0:
- version "2.3.0"
- resolved "https://registry.yarnpkg.com/punycode/-/punycode-2.3.0.tgz#f67fa67c94da8f4d0cfff981aee4118064199b8f"
- integrity sha512-rRV+zQD8tVFys26lAGR9WUuS4iUAngJScM+ZRSKtvl5tKeZ2t5bvdNFdNHBW9FWR4guGHlgmsZ1G7BSm2wTbuA==
-
pupa@^2.1.1:
version "2.1.1"
resolved "https://registry.yarnpkg.com/pupa/-/pupa-2.1.1.tgz#f5e8fd4afc2c5d97828faa523549ed8744a20d62"
@@ -8789,7 +8895,7 @@ safe-buffer@5.2.1, safe-buffer@>=5.1.0, safe-buffer@^5.0.1, safe-buffer@^5.1.0,
resolved "https://registry.yarnpkg.com/safe-buffer/-/safe-buffer-5.2.1.tgz#1eaf9fa9bdb1fdd4ec75f58f9cdb4e6b7827eec6"
integrity sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==
-"safer-buffer@>= 2.1.2 < 3":
+"safer-buffer@>= 2.1.2 < 3", "safer-buffer@>= 2.1.2 < 3.0.0":
version "2.1.2"
resolved "https://registry.yarnpkg.com/safer-buffer/-/safer-buffer-2.1.2.tgz#44fa161b0187b9549dd84bb91802f9bd8385cd6a"
integrity sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==
diff --git a/docs/advanced/no-code-modeling.md b/docs/advanced/no-code-modeling.md
index d76b776d3dddb2..172e63f821eabd 100644
--- a/docs/advanced/no-code-modeling.md
+++ b/docs/advanced/no-code-modeling.md
@@ -100,10 +100,9 @@ Currently, there are various models in GMS:
1. [Urn](https://github.com/datahub-project/datahub/blob/master/li-utils/src/main/pegasus/com/linkedin/common/DatasetUrn.pdl) - Structs composing primary keys
2. [Root] [Snapshots](https://github.com/datahub-project/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot/Snapshot.pdl) - Container of aspects
3. [Aspects](https://github.com/datahub-project/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/metadata/aspect/DashboardAspect.pdl) - Optional container of fields
-4. [Values](https://github.com/datahub-project/datahub/blob/master/gms/api/src/main/pegasus/com/linkedin/dataset/Dataset.pdl), [Keys](https://github.com/datahub-project/datahub/blob/master/gms/api/src/main/pegasus/com/linkedin/dataset/DatasetKey.pdl) - Model returned by GMS [Rest.li](http://rest.li) API (public facing)
-5. [Entities](https://github.com/datahub-project/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/DatasetEntity.pdl) - Records with fields derived from the URN. Used only in graph / relationships
-6. [Relationships](https://github.com/datahub-project/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/Relationship.pdl) - Edges between 2 entities with optional edge properties
-7. [Search Documents](https://github.com/datahub-project/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/metadata/search/ChartDocument.pdl) - Flat documents for indexing within Elastic index
+4. [Keys](https://github.com/datahub-project/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DatasetKey.pdl) - Model returned by GMS [Rest.li](http://rest.li) API (public facing)
+5. [Relationships](https://github.com/datahub-project/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/common/EntityRelationship.pdl) - Edges between 2 entities with optional edge properties
+6. Search Documents - Flat documents for indexing within Elastic index
- And corresponding index [mappings.json](https://github.com/datahub-project/datahub/blob/master/gms/impl/src/main/resources/index/chart/mappings.json), [settings.json](https://github.com/datahub-project/datahub/blob/master/gms/impl/src/main/resources/index/chart/settings.json)
Various components of GMS depend on / make assumptions about these model types:
diff --git a/docs/api/graphql/how-to-set-up-graphql.md b/docs/api/graphql/how-to-set-up-graphql.md
index 584bf34ad3f92d..2be2f935b12b10 100644
--- a/docs/api/graphql/how-to-set-up-graphql.md
+++ b/docs/api/graphql/how-to-set-up-graphql.md
@@ -68,7 +68,7 @@ In the request body, select the `GraphQL` option and enter your GraphQL query in
-Please refer to [Querying with GraphQL](https://learning.postman.com/docs/sending-requests/graphql/graphql/) in the Postman documentation for more information.
+Please refer to [Querying with GraphQL](https://learning.postman.com/docs/sending-requests/graphql/graphql-overview/) in the Postman documentation for more information.
### Authentication + Authorization
diff --git a/docs/architecture/architecture.md b/docs/architecture/architecture.md
index 6a9c1860d71b09..20f18f09d949be 100644
--- a/docs/architecture/architecture.md
+++ b/docs/architecture/architecture.md
@@ -17,7 +17,7 @@ The figures below describe the high-level architecture of DataHub.
-
+
diff --git a/docs/authentication/guides/add-users.md b/docs/authentication/guides/add-users.md
index f5dfc832010831..d380cacd6665e4 100644
--- a/docs/authentication/guides/add-users.md
+++ b/docs/authentication/guides/add-users.md
@@ -19,13 +19,13 @@ To do so, navigate to the **Users & Groups** section inside of Settings page. He
do not have the correct privileges to invite users, this button will be disabled.
-
+
To invite new users, simply share the link with others inside your organization.
-
+
When a new user visits the link, they will be directed to a sign up screen where they can create their DataHub account.
@@ -37,13 +37,13 @@ and click **Reset user password** inside the menu dropdown on the right hand sid
`Manage User Credentials` [Platform Privilege](../../authorization/access-policies-guide.md) in order to reset passwords.
-
+
To reset the password, simply share the password reset link with the user who needs to change their password. Password reset links expire after 24 hours.
-
+
# Configuring Single Sign-On with OpenID Connect
diff --git a/docs/authentication/guides/sso/configure-oidc-react.md b/docs/authentication/guides/sso/configure-oidc-react.md
index d27792ce3967b1..512d6adbf916fc 100644
--- a/docs/authentication/guides/sso/configure-oidc-react.md
+++ b/docs/authentication/guides/sso/configure-oidc-react.md
@@ -26,7 +26,7 @@ please see [this guide](../jaas.md) to mount a custom user.props file for a JAAS
To configure OIDC in React, you will most often need to register yourself as a client with your identity provider (Google, Okta, etc). Each provider may
have their own instructions. Provided below are links to examples for Okta, Google, Azure AD, & Keycloak.
-- [Registering an App in Okta](https://developer.okta.com/docs/guides/add-an-external-idp/apple/register-app-in-okta/)
+- [Registering an App in Okta](https://developer.okta.com/docs/guides/add-an-external-idp/openidconnect/main/)
- [OpenID Connect in Google Identity](https://developers.google.com/identity/protocols/oauth2/openid-connect)
- [OpenID Connect authentication with Azure Active Directory](https://docs.microsoft.com/en-us/azure/active-directory/fundamentals/auth-oidc)
- [Keycloak - Securing Applications and Services Guide](https://www.keycloak.org/docs/latest/securing_apps/)
diff --git a/docs/cli.md b/docs/cli.md
index eb8bb406b01074..267f289d9f54a6 100644
--- a/docs/cli.md
+++ b/docs/cli.md
@@ -547,7 +547,7 @@ Old Entities Migrated = {'urn:li:dataset:(urn:li:dataPlatform:hive,logging_event
### Using docker
[![Docker Hub](https://img.shields.io/docker/pulls/acryldata/datahub-ingestion?style=plastic)](https://hub.docker.com/r/acryldata/datahub-ingestion)
-[![datahub-ingestion docker](https://github.com/acryldata/datahub/actions/workflows/docker-ingestion.yml/badge.svg)](https://github.com/acryldata/datahub/actions/workflows/docker-ingestion.yml)
+[![datahub-ingestion docker](https://github.com/acryldata/datahub/workflows/datahub-ingestion%20docker/badge.svg)](https://github.com/acryldata/datahub/actions/workflows/docker-ingestion.yml)
If you don't want to install locally, you can alternatively run metadata ingestion within a Docker container.
We have prebuilt images available on [Docker hub](https://hub.docker.com/r/acryldata/datahub-ingestion). All plugins will be installed and enabled automatically.
diff --git a/docs/domains.md b/docs/domains.md
index c846a753417c59..1b2ebc9d47f397 100644
--- a/docs/domains.md
+++ b/docs/domains.md
@@ -22,20 +22,20 @@ You can create this privileges by creating a new [Metadata Policy](./authorizati
To create a Domain, first navigate to the **Domains** tab in the top-right menu of DataHub.
-
+
Once you're on the Domains page, you'll see a list of all the Domains that have been created on DataHub. Additionally, you can
view the number of entities inside each Domain.
-
+
To create a new Domain, click '+ New Domain'.
-
+
Inside the form, you can choose a name for your Domain. Most often, this will align with your business units or groups, for example
@@ -48,7 +48,7 @@ for the Domain. This option is useful if you intend to refer to Domains by a com
key to be human-readable. Proceed with caution: once you select a custom id, it cannot be easily changed.
-
+
By default, you don't need to worry about this. DataHub will auto-generate a unique Domain id for you.
@@ -64,7 +64,7 @@ To assign an asset to a Domain, simply navigate to the asset's profile page. At
see a 'Domain' section. Click 'Set Domain', and then search for the Domain you'd like to add to. When you're done, click 'Add'.
-
+
To remove an asset from a Domain, click the 'x' icon on the Domain tag.
@@ -149,27 +149,27 @@ source:
Once you've created a Domain, you can use the search bar to find it.
-
+
Clicking on the search result will take you to the Domain's profile, where you
can edit its description, add / remove owners, and view the assets inside the Domain.
-
+
Once you've added assets to a Domain, you can filter search results to limit to those Assets
within a particular Domain using the left-side search filters.
-
+
On the homepage, you'll also find a list of the most popular Domains in your organization.
-
+
## Additional Resources
@@ -242,7 +242,6 @@ DataHub supports Tags, Glossary Terms, & Domains as distinct types of Metadata t
- **Tags**: Informal, loosely controlled labels that serve as a tool for search & discovery. Assets may have multiple tags. No formal, central management.
- **Glossary Terms**: A controlled vocabulary, with optional hierarchy. Terms are typically used to standardize types of leaf-level attributes (i.e. schema fields) for governance. E.g. (EMAIL_PLAINTEXT)
- **Domains**: A set of top-level categories. Usually aligned to business units / disciplines to which the assets are most relevant. Central or distributed management. Single Domain assignment per data asset.
-
*Need more help? Join the conversation in [Slack](http://slack.datahubproject.io)!*
### Related Features
diff --git a/docs/how/add-new-aspect.md b/docs/how/add-new-aspect.md
index 6ea7256ed75cc0..d1fe567018903b 100644
--- a/docs/how/add-new-aspect.md
+++ b/docs/how/add-new-aspect.md
@@ -1,20 +1,20 @@
# How to add a new metadata aspect?
Adding a new metadata [aspect](../what/aspect.md) is one of the most common ways to extend an existing [entity](../what/entity.md).
-We'll use the [CorpUserEditableInfo](https://github.com/datahub-project/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl) as an example here.
+We'll use the CorpUserEditableInfo as an example here.
1. Add the aspect model to the corresponding namespace (e.g. [`com.linkedin.identity`](https://github.com/datahub-project/datahub/tree/master/metadata-models/src/main/pegasus/com/linkedin/identity))
-2. Extend the entity's aspect union to include the new aspect (e.g. [`CorpUserAspect`](https://github.com/datahub-project/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/metadata/aspect/CorpUserAspect.pdl))
+2. Extend the entity's aspect union to include the new aspect.
3. Rebuild the rest.li [IDL & snapshot](https://linkedin.github.io/rest.li/modeling/compatibility_check) by running the following command from the project root
```
./gradlew :metadata-service:restli-servlet-impl:build -Prest.model.compatibility=ignore
```
-4. To surface the new aspect at the top-level [resource endpoint](https://linkedin.github.io/rest.li/user_guide/restli_server#writing-resources), extend the resource data model (e.g. [`CorpUser`](https://github.com/datahub-project/datahub/blob/master/gms/api/src/main/pegasus/com/linkedin/identity/CorpUser.pdl)) with an optional field (e.g. [`editableInfo`](https://github.com/datahub-project/datahub/blob/master/gms/api/src/main/pegasus/com/linkedin/identity/CorpUser.pdl#L21)). You'll also need to extend the `toValue` & `toSnapshot` methods of the top-level resource (e.g. [`CorpUsers`](https://github.com/datahub-project/datahub/blob/master/gms/impl/src/main/java/com/linkedin/metadata/resources/identity/CorpUsers.java)) to convert between the snapshot & value models.
+4. To surface the new aspect at the top-level [resource endpoint](https://linkedin.github.io/rest.li/user_guide/restli_server#writing-resources), extend the resource data model with an optional field. You'll also need to extend the `toValue` & `toSnapshot` methods of the top-level resource (e.g. [`CorpUsers`](https://github.com/datahub-project/datahub/blob/master/gms/impl/src/main/java/com/linkedin/metadata/resources/identity/CorpUsers.java)) to convert between the snapshot & value models.
-5. (Optional) If there's need to update the aspect via API (instead of/in addition to MCE), add a [sub-resource](https://linkedin.github.io/rest.li/user_guide/restli_server#sub-resources) endpoint for the new aspect (e.g. [`CorpUsersEditableInfoResource`](https://github.com/datahub-project/datahub/blob/master/gms/impl/src/main/java/com/linkedin/metadata/resources/identity/CorpUsersEditableInfoResource.java)). The sub-resource endpiont also allows you to retrieve previous versions of the aspect as well as additional metadata such as the audit stamp.
+5. (Optional) If there's a need to update the aspect via API (instead of/in addition to MCE), add a [sub-resource](https://linkedin.github.io/rest.li/user_guide/restli_server#sub-resources) endpoint for the new aspect (e.g. `CorpUsersEditableInfoResource`). The sub-resource endpoint also allows you to retrieve previous versions of the aspect as well as additional metadata such as the audit stamp.
-6. After rebuilding & restarting [gms](https://github.com/datahub-project/datahub/tree/master/gms), [mce-consumer-job](https://github.com/datahub-project/datahub/tree/master/metadata-jobs/mce-consumer-job) & [mae-consumer-job](https://github.com/datahub-project/datahub/tree/master/metadata-jobs/mae-consumer-job),
+6. After rebuilding & restarting gms, [mce-consumer-job](https://github.com/datahub-project/datahub/tree/master/metadata-jobs/mce-consumer-job) & [mae-consumer-job](https://github.com/datahub-project/datahub/tree/master/metadata-jobs/mae-consumer-job),
you should be able to start emitting [MCE](../what/mxe.md) with the new aspect and have it automatically ingested & stored in DB.
diff --git a/docs/modeling/extending-the-metadata-model.md b/docs/modeling/extending-the-metadata-model.md
index 98f70f6d933e40..be2d7d795de701 100644
--- a/docs/modeling/extending-the-metadata-model.md
+++ b/docs/modeling/extending-the-metadata-model.md
@@ -24,7 +24,7 @@ We will refer to the two options as the **open-source fork** and **custom reposi
## This Guide
This guide will outline what the experience of adding a new Entity should look like through a real example of adding the
-Dashboard Entity. If you want to extend an existing Entity, you can skip directly to [Step 3](#step_3).
+Dashboard Entity. If you want to extend an existing Entity, you can skip directly to [Step 3](#step-3-define-custom-aspects-or-attach-existing-aspects-to-your-entity).
At a high level, an entity is made up of:
@@ -82,14 +82,14 @@ Because they are aspects, keys need to be annotated with an @Aspect annotation,
can be a part of.
The key can also be annotated with the two index annotations: @Relationship and @Searchable. This instructs DataHub
-infra to use the fields in the key to create relationships and index fields for search. See [Step 3](#step_3) for more details on
+infra to use the fields in the key to create relationships and index fields for search. See [Step 3](#step-3-define-custom-aspects-or-attach-existing-aspects-to-your-entity) for more details on
the annotation model.
**Constraints**: Note that each field in a Key Aspect MUST be of String or Enum type.
### Step 2: Create the new entity with its key aspect
-Define the entity within an `entity-registry.yml` file. Depending on your approach, the location of this file may vary. More on that in steps [4](#step_4) and [5](#step_5).
+Define the entity within an `entity-registry.yml` file. Depending on your approach, the location of this file may vary. More on that in steps [4](#step-4-choose-a-place-to-store-your-model-extension) and [5](#step-5-attaching-your-non-key-aspects-to-the-entity).
Example:
```yaml
@@ -212,11 +212,11 @@ After you create your Aspect, you need to attach to all the entities that it app
**Constraints**: Note that all aspects MUST be of type Record.
-### Step 4: Choose a place to store your model extension
+### Step 4: Choose a place to store your model extension
At the beginning of this document, we walked you through a flow-chart that should help you decide whether you need to maintain a fork of the open source DataHub repo for your model extensions, or whether you can just use a model extension repository that can stay independent of the DataHub repo. Depending on what path you took, the place you store your aspect model files (the .pdl files) and the entity-registry files (the yaml file called `entity-registry.yaml` or `entity-registry.yml`) will vary.
-- Open source Fork: Aspect files go under [`metadata-models`](../../metadata-models) module in the main repo, entity registry goes into [`metadata-models/src/main/resources/entity-registry.yml`](../../metadata-models/src/main/resources/entity-registry.yml). Read on for more details in [Step 5](#step_5).
+- Open source Fork: Aspect files go under [`metadata-models`](../../metadata-models) module in the main repo, entity registry goes into [`metadata-models/src/main/resources/entity-registry.yml`](../../metadata-models/src/main/resources/entity-registry.yml). Read on for more details in [Step 5](#step-5-attaching-your-non-key-aspects-to-the-entity).
- Custom repository: Read the [metadata-models-custom](../../metadata-models-custom/README.md) documentation to learn how to store and version your aspect models and registry.
### Step 5: Attaching your non-key Aspect(s) to the Entity
diff --git a/docs/modeling/metadata-model.md b/docs/modeling/metadata-model.md
index 037c9c7108a6e5..a8958985a0a724 100644
--- a/docs/modeling/metadata-model.md
+++ b/docs/modeling/metadata-model.md
@@ -433,7 +433,7 @@ aggregation query against a timeseries aspect.
The *@TimeseriesField* and the *@TimeseriesFieldCollection* are two new annotations that can be attached to a field of
a *Timeseries aspect* that allows it to be part of an aggregatable query. The kinds of aggregations allowed on these
annotated fields depends on the type of the field, as well as the kind of aggregation, as
-described [here](#Performing-an-aggregation-on-a-Timeseries-aspect).
+described [here](#performing-an-aggregation-on-a-timeseries-aspect).
* `@TimeseriesField = {}` - this annotation can be used with any type of non-collection type field of the aspect such as
primitive types and records (see the fields *stat*, *strStat* and *strArray* fields
@@ -515,7 +515,7 @@ my_emitter = DatahubRestEmitter("http://localhost:8080")
my_emitter.emit(mcpw)
```
-###### Performing an aggregation on a Timeseries aspect.
+###### Performing an aggregation on a Timeseries aspect
Aggreations on timeseries aspects can be performed by the GMS REST API for `/analytics?action=getTimeseriesStats` which
accepts the following params.
diff --git a/docs/tags.md b/docs/tags.md
index 945b514dc7b473..cb08c9fafea490 100644
--- a/docs/tags.md
+++ b/docs/tags.md
@@ -27,25 +27,25 @@ You can create these privileges by creating a new [Metadata Policy](./authorizat
To add a tag at the dataset or container level, simply navigate to the page for that entity and click on the **Add Tag** button.
-
+
Type in the name of the tag you want to add. You can add a new tag, or add a tag that already exists (the autocomplete will pull up the tag if it already exists).
-
+
Click on the "Add" button and you'll see the tag has been added!
-
+
If you would like to add a tag at the schema level, hover over the "Tags" column for a schema until the "Add Tag" button shows up, and then follow the same flow as above.
-
+
### Removing a Tag
@@ -57,7 +57,7 @@ To remove a tag, simply click on the "X" button in the tag. Then click "Yes" whe
You can search for a tag in the search bar, and even filter entities by the presence of a specific tag.
-
+
## Additional Resources
diff --git a/docs/townhall-history.md b/docs/townhall-history.md
index e235a70c5d7b95..d92905af0cd72c 100644
--- a/docs/townhall-history.md
+++ b/docs/townhall-history.md
@@ -328,7 +328,7 @@ November Town Hall (in December!)
* Welcome - 5 mins
* Latest React App Demo! ([video](https://www.youtube.com/watch?v=RQBEJhcen5E)) by John Joyce and Gabe Lyons - 5 mins
-* Use-Case: DataHub at Geotab ([slides](https://docs.google.com/presentation/d/1qcgO3BW5NauuG0HnPqrxGcujsK-rJ1-EuU-7cbexkqE/edit?usp=sharing),[video](https://www.youtube.com/watch?v=boyjT2OrlU4)) by [John Yoon](https://www.linkedin.com/in/yhjyoon/) - 15 mins
+* Use-Case: DataHub at Geotab ([video](https://www.youtube.com/watch?v=boyjT2OrlU4)) by [John Yoon](https://www.linkedin.com/in/yhjyoon/) - 15 mins
* Tech Deep Dive: Tour of new pull-based Python Ingestion scripts ([slides](https://docs.google.com/presentation/d/15Xay596WDIhzkc5c8DEv6M-Bv1N4hP8quup1tkws6ms/edit#slide=id.gb478361595_0_10),[video](https://www.youtube.com/watch?v=u0IUQvG-_xI)) by [Harshal Sheth](https://www.linkedin.com/in/hsheth2/) - 15 mins
* General Q&A from sign up sheet, slack, and participants - 15 mins
* Closing remarks - 5 mins
diff --git a/docs/what/gms.md b/docs/what/gms.md
index 9e1cea1b9540e8..a39450d28ae83e 100644
--- a/docs/what/gms.md
+++ b/docs/what/gms.md
@@ -2,6 +2,4 @@
Metadata for [entities](entity.md) [onboarded](../modeling/metadata-model.md) to [GMA](gma.md) is served through microservices known as Generalized Metadata Service (GMS). GMS typically provides a [Rest.li](http://rest.li) API and must access the metadata using [GMA DAOs](../architecture/metadata-serving.md).
-While a GMS is completely free to define its public APIs, we do provide a list of [resource base classes](https://github.com/datahub-project/datahub-gma/tree/master/restli-resources/src/main/java/com/linkedin/metadata/restli) to leverage for common patterns.
-
-GMA is designed to support a distributed fleet of GMS, each serving a subset of the [GMA graph](graph.md). However, for simplicity we include a single centralized GMS ([datahub-gms](../../gms)) that serves all entities.
+GMA is designed to support a distributed fleet of GMS, each serving a subset of the [GMA graph](graph.md). However, for simplicity we include a single centralized GMS that serves all entities.
diff --git a/docs/what/mxe.md b/docs/what/mxe.md
index 8af96360858a33..25294e04ea3d92 100644
--- a/docs/what/mxe.md
+++ b/docs/what/mxe.md
@@ -266,7 +266,7 @@ A Metadata Change Event represents a request to change multiple aspects for the
It leverages a deprecated concept of `Snapshot`, which is a strongly-typed list of aspects for the same
entity.
-A MCE is a "proposal" for a set of metadata changes, as opposed to [MAE](#metadata-audit-event), which is conveying a committed change.
+A MCE is a "proposal" for a set of metadata changes, as opposed to [MAE](#metadata-audit-event-mae), which is conveying a committed change.
Consequently, only successfully accepted and processed MCEs will lead to the emission of a corresponding MAE / MCLs.
### Emission
diff --git a/docs/what/relationship.md b/docs/what/relationship.md
index dcfe093a1b1245..d5348dc04b3c01 100644
--- a/docs/what/relationship.md
+++ b/docs/what/relationship.md
@@ -102,9 +102,6 @@ For one, the actual direction doesn’t really impact the execution of graph que
That being said, generally there’s a more "natural way" to specify the direction of a relationship, which closely relate to how the metadata is stored. For example, the membership information for an LDAP group is generally stored as a list in group’s metadata. As a result, it’s more natural to model a `HasMember` relationship that points from a group to a member, instead of a `IsMemberOf` relationship pointing from member to group.
-Since all relationships are explicitly declared, it’s fairly easy for a user to discover what relationships are available and their directionality by inspecting
-the [relationships directory](../../metadata-models/src/main/pegasus/com/linkedin/metadata/relationship). It’s also possible to provide a UI for the catalog of entities and relationships for analysts who are interested in building complex graph queries to gain insights into the metadata.
-
## High Cardinality Relationships
See [this doc](../advanced/high-cardinality.md) for suggestions on how to best model relationships with high cardinality.
diff --git a/docs/what/search-document.md b/docs/what/search-document.md
index 81359a55d0caec..bd27656e512c3a 100644
--- a/docs/what/search-document.md
+++ b/docs/what/search-document.md
@@ -13,7 +13,6 @@ As a result, one may be tempted to add as many attributes as needed. This is acc
Below shows an example schema for the `User` search document. Note that:
1. Each search document is required to have a type-specific `urn` field, generally maps to an entity in the [graph](graph.md).
2. Similar to `Entity`, each document has an optional `removed` field for "soft deletion".
-This is captured in [BaseDocument](../../metadata-models/src/main/pegasus/com/linkedin/metadata/search/BaseDocument.pdl), which is expected to be included by all documents.
3. Similar to `Entity`, all remaining fields are made `optional` to support partial updates.
4. `management` shows an example of a string array field.
5. `ownedDataset` shows an example on how a field can be derived from metadata [aspects](aspect.md) associated with other types of entity (in this case, `Dataset`).
diff --git a/metadata-ingestion/docs/dev_guides/add_stateful_ingestion_to_source.md b/metadata-ingestion/docs/dev_guides/add_stateful_ingestion_to_source.md
index 6a1204fb0f2b35..9e39d24fb85782 100644
--- a/metadata-ingestion/docs/dev_guides/add_stateful_ingestion_to_source.md
+++ b/metadata-ingestion/docs/dev_guides/add_stateful_ingestion_to_source.md
@@ -60,16 +60,14 @@ class StaleEntityCheckpointStateBase(CheckpointStateBase, ABC, Generic[Derived])
```
Examples:
-1. [KafkaCheckpointState](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/source/state/kafka_state.py#L11).
-2. [DbtCheckpointState](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/source/state/dbt_state.py#L16)
-3. [BaseSQLAlchemyCheckpointState](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/source/state/sql_common_state.py#L17)
+* [BaseSQLAlchemyCheckpointState](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/source/state/sql_common_state.py#L17)
### 2. Modifying the SourceConfig
The source's config must inherit from `StatefulIngestionConfigBase`, and should declare a field named `stateful_ingestion` of type `Optional[StatefulStaleMetadataRemovalConfig]`.
Examples:
-1. The `KafkaSourceConfig`
+- The `KafkaSourceConfig`
```python
from typing import List, Optional
import pydantic
@@ -84,9 +82,6 @@ class KafkaSourceConfig(StatefulIngestionConfigBase):
stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
```
-2. The [DBTStatefulIngestionConfig](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/source/dbt.py#L131)
- and the [DBTConfig](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/source/dbt.py#L317).
-
### 3. Modifying the SourceReport
The report class of the source should inherit from `StaleEntityRemovalSourceReport` whose definition is shown below.
```python
@@ -102,7 +97,7 @@ class StaleEntityRemovalSourceReport(StatefulIngestionReport):
```
Examples:
-1. The `KafkaSourceReport`
+* The `KafkaSourceReport`
```python
from dataclasses import dataclass
from datahub.ingestion.source.state.stale_entity_removal_handler import StaleEntityRemovalSourceReport
@@ -110,7 +105,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import StaleEnt
class KafkaSourceReport(StaleEntityRemovalSourceReport):
#
Date: Thu, 14 Sep 2023 11:40:38 +0530
Subject: [PATCH 18/65] docs(managed datahub): release notes 0.2.11 (#8830)
---
docs-website/sidebars.js | 1 +
.../managed-datahub/release-notes/v_0_2_11.md | 73 +++++++++++++++++++
2 files changed, 74 insertions(+)
create mode 100644 docs/managed-datahub/release-notes/v_0_2_11.md
diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js
index fcf82b786a1b95..12691e9f8268a5 100644
--- a/docs-website/sidebars.js
+++ b/docs-website/sidebars.js
@@ -597,6 +597,7 @@ module.exports = {
},
{
"Managed DataHub Release History": [
+ "docs/managed-datahub/release-notes/v_0_2_11",
"docs/managed-datahub/release-notes/v_0_2_10",
"docs/managed-datahub/release-notes/v_0_2_9",
"docs/managed-datahub/release-notes/v_0_2_8",
diff --git a/docs/managed-datahub/release-notes/v_0_2_11.md b/docs/managed-datahub/release-notes/v_0_2_11.md
new file mode 100644
index 00000000000000..1f420908487127
--- /dev/null
+++ b/docs/managed-datahub/release-notes/v_0_2_11.md
@@ -0,0 +1,73 @@
+# v0.2.11
+---
+
+Release Availability Date
+---
+14-Sep-2023
+
+Recommended CLI/SDK
+---
+- `v0.11.0` with release notes at https://github.com/acryldata/datahub/releases/tag/v0.10.5.5
+- [Deprecation] In LDAP ingestor, the manager_pagination_enabled changed to general pagination_enabled
+
+If you are using an older CLI/SDK version, please upgrade it. This applies to all CLI/SDK usages, whether you are using it through your terminal, GitHub Actions, Airflow, the Python SDK, the Java SDK, etc. This is a strong recommendation to upgrade, as we keep pushing fixes to the CLI and it helps us support you better.
+
+Special Notes
+---
+- The deployment process for this release will involve downtime, during which the system will be in read-only mode. A rough estimate is 1 hour for every 2.3 million entities (including soft-deleted entities).
+
+
+## Release Changelog
+---
+- Since `v0.2.10` these changes from OSS DataHub https://github.com/datahub-project/datahub/compare/2b0952195b7895df0a2bf92b28e71aac18217781...75252a3d9f6a576904be5a0790d644b9ae2df6ac have been pulled in.
+- Misc fixes & features
+ - Proposals
+ - Group names shown correctly for proposal Inbox
+ - Metadata tests
+ - Deprecate/Un-deprecate actions available in Metadata tests
+ - Last Observed (in underlying sql) available as a filter in metadata tests
+ - [Breaking change] Renamed `__lastUpdated` -> `__created` as a filter to correctly represent what it was. This was not surfaced in the UI. But if you were using it then this needs to be renamed. Acryl Customer Success team will keep an eye out to pro-actively find and bring this up if you are affected by this.
+ - Robustness improvements to metadata test runs
+ - Copy urn for metadata tests to allow for easier filtering for iteration over metadata test results via our APIs.
+ - A lot more fixes to subscriptions, notifications and Observability (Beta).
+ - Some performance improvements to lineage queries
+
+## Some notable features in this SaaS release
+- We now enable you to create and delete pinned announcements on your DataHub homepage! If you have the “Manage Home Page Posts” platform privilege you’ll see a new section in settings called “Home Page Posts” where you can create and delete text posts and link posts that your users see on the home page.
+- Improvements to search experience
+
+
+
+- The CLI now supports recursive deletes
+- New subscriptions feature will be widely rolled out this release
+
+
+
+- We will be enabling these features selectively. If you are interested in trying it and providing feedback, please reach out to your Acryl Customer Success representative.
+ - Acryl Observe Freshness Assertions available in private beta as shared [here](../observe/freshness-assertions.md).
From e75900b9a9e1a4febe584765e59caee3ecb1af14 Mon Sep 17 00:00:00 2001
From: Andrew Sikowitz
Date: Thu, 14 Sep 2023 12:25:41 -0700
Subject: [PATCH 19/65] build(ingest): Remove constraint on jsonschema for
Python >= 3.8 (#8842)
---
metadata-ingestion/setup.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index 7a5fd355803cb4..3067ccd71f92f2 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -58,8 +58,8 @@ def get_long_description():
"requests_file",
"jsonref",
# jsonschema drops python 3.7 support in v4.18.0
- "jsonschema<=4.17.3 ; python_version < '3.8'",
- "jsonschema>=4.18.0 ; python_version >= '3.8'",
+ "jsonschema<=4.17.3; python_version < '3.8'",
+ "jsonschema; python_version >= '3.8'",
"ruamel.yaml",
}
From 319342197689fe8475bfdc05e2f2dcd65a784cdc Mon Sep 17 00:00:00 2001
From: Aseem Bansal
Date: Fri, 15 Sep 2023 17:58:30 +0530
Subject: [PATCH 20/65] fix(build): clean task cleanup generated src (#8844)
---
metadata-events/mxe-avro-1.7/build.gradle | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/metadata-events/mxe-avro-1.7/build.gradle b/metadata-events/mxe-avro-1.7/build.gradle
index e30406644913c6..8c0a26d22dc7d2 100644
--- a/metadata-events/mxe-avro-1.7/build.gradle
+++ b/metadata-events/mxe-avro-1.7/build.gradle
@@ -43,4 +43,8 @@ jar {
dependsOn classes
from sourceSets.main.output
exclude('com/linkedin/events/**')
+}
+
+clean {
+ delete 'src'
}
\ No newline at end of file
From ec714fc1e57a36a3418edae87d5f255e25941b41 Mon Sep 17 00:00:00 2001
From: Aseem Bansal
Date: Fri, 15 Sep 2023 20:04:11 +0530
Subject: [PATCH 21/65] feat(ci): disable ingestion smoke build (#8845)
---
.github/workflows/docker-ingestion-smoke.yml | 2 --
1 file changed, 2 deletions(-)
diff --git a/.github/workflows/docker-ingestion-smoke.yml b/.github/workflows/docker-ingestion-smoke.yml
index 9e74f3a459378a..8d52c237928577 100644
--- a/.github/workflows/docker-ingestion-smoke.yml
+++ b/.github/workflows/docker-ingestion-smoke.yml
@@ -3,8 +3,6 @@ on:
release:
types: [published]
push:
- branches:
- - master
paths:
- "docker/datahub-ingestion-base/**"
- "smoke-test/**"
From 0f7744784d663b377f1743db188d8632b9f6a86c Mon Sep 17 00:00:00 2001
From: Hyejin Yoon <0327jane@gmail.com>
Date: Sat, 16 Sep 2023 03:55:10 +0900
Subject: [PATCH 22/65] fix: fix quickstart page (#8784)
---
docs/quickstart.md | 336 +++++++++++++++++++++++++--------------------
1 file changed, 184 insertions(+), 152 deletions(-)
diff --git a/docs/quickstart.md b/docs/quickstart.md
index cd91dc8d1ac84a..29b22b54dc87a3 100644
--- a/docs/quickstart.md
+++ b/docs/quickstart.md
@@ -1,219 +1,218 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
# DataHub Quickstart Guide
+:::tip Managed DataHub
+
This guide provides instructions on deploying the open source DataHub locally.
-If you're interested in a managed version, [Acryl Data](https://www.acryldata.io/product) provides a fully managed, premium version of DataHub.
+If you're interested in a managed version, [Acryl Data](https://www.acryldata.io/product) provides a fully managed, premium version of DataHub.
+**[Get Started with Managed DataHub](./managed-datahub/welcome-acryl.md)**
-
-Get Started with Managed DataHub
-
+:::
-## Deploying DataHub
+## Prerequisites
-To deploy a new instance of DataHub, perform the following steps.
+- Install **Docker** and **Docker Compose** v2 for your platform.
-1. Install Docker and Docker Compose v2 for your platform.
+ | Platform | Application |
+ | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
+ | Windows | [Docker Desktop](https://www.docker.com/products/docker-desktop/) |
+ | Mac | [Docker Desktop](https://www.docker.com/products/docker-desktop/) |
+ | Linux | [Docker for Linux](https://docs.docker.com/desktop/install/linux-install/) and [Docker Compose](https://docs.docker.com/compose/install/linux/) |
-- On Windows or Mac, install [Docker Desktop](https://www.docker.com/products/docker-desktop/).
-- On Linux, install [Docker for Linux](https://docs.docker.com/desktop/install/linux-install/) and [Docker Compose](https://docs.docker.com/compose/install/linux/).
+- **Launch the Docker engine** from command line or the desktop app.
+- Ensure you have **Python 3.7+** installed & configured. (Check using `python3 --version`).
-:::note
+:::note Docker Resource Allocation
-Make sure to allocate enough hardware resources for Docker engine.
+Make sure to allocate enough hardware resources for Docker engine.
Tested & confirmed config: 2 CPUs, 8GB RAM, 2GB Swap area, and 10GB disk space.
:::
-2. Launch the Docker Engine from command line or the desktop app.
-
-3. Install the DataHub CLI
-
- a. Ensure you have Python 3.7+ installed & configured. (Check using `python3 --version`).
-
- b. Run the following commands in your terminal
+## Install the DataHub CLI
- ```sh
- python3 -m pip install --upgrade pip wheel setuptools
- python3 -m pip install --upgrade acryl-datahub
- datahub version
- ```
+
+
- If you're using poetry, run the following command.
-
- ```sh
- poetry add acryl-datahub
- datahub version
- ```
+```bash
+python3 -m pip install --upgrade pip wheel setuptools
+python3 -m pip install --upgrade acryl-datahub
+datahub version
+```
-:::note
+:::note Command Not Found
-If you see "command not found", try running cli commands with the prefix 'python3 -m' instead like `python3 -m datahub version`
+If you see `command not found`, try running cli commands like `python3 -m datahub version`.
Note that DataHub CLI does not support Python 2.x.
:::
-4. To deploy a DataHub instance locally, run the following CLI command from your terminal
-
- ```
- datahub docker quickstart
- ```
-
- This will deploy a DataHub instance using [docker-compose](https://docs.docker.com/compose/).
- If you are curious, the `docker-compose.yaml` file is downloaded to your home directory under the `.datahub/quickstart` directory.
-
- If things go well, you should see messages like the ones below:
-
- ```
- Fetching docker-compose file https://raw.githubusercontent.com/datahub-project/datahub/master/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml from GitHub
- Pulling docker images...
- Finished pulling docker images!
-
- [+] Running 11/11
- ⠿ Container zookeeper Running 0.0s
- ⠿ Container elasticsearch Running 0.0s
- ⠿ Container broker Running 0.0s
- ⠿ Container schema-registry Running 0.0s
- ⠿ Container elasticsearch-setup Started 0.7s
- ⠿ Container kafka-setup Started 0.7s
- ⠿ Container mysql Running 0.0s
- ⠿ Container datahub-gms Running 0.0s
- ⠿ Container mysql-setup Started 0.7s
- ⠿ Container datahub-datahub-actions-1 Running 0.0s
- ⠿ Container datahub-frontend-react Running 0.0s
- .......
- ✔ DataHub is now running
- Ingest some demo data using `datahub docker ingest-sample-data`,
- or head to http://localhost:9002 (username: datahub, password: datahub) to play around with the frontend.
- Need support? Get in touch on Slack: https://slack.datahubproject.io/
- ```
-
- Upon completion of this step, you should be able to navigate to the DataHub UI
- at [http://localhost:9002](http://localhost:9002) in your browser. You can sign in using `datahub` as both the
- username and password.
-
-:::note
-
-On Mac computers with Apple Silicon (M1, M2 etc.), you might see an error like `no matching manifest for linux/arm64/v8 in the manifest list entries`, this typically means that the datahub cli was not able to detect that you are running it on Apple Silicon. To resolve this issue, override the default architecture detection by issuing `datahub docker quickstart --arch m1`
+
+
-:::
+```bash
+poetry add acryl-datahub
+poetry shell
+datahub version
+```
-5. To ingest the sample metadata, run the following CLI command from your terminal
+
+
- ```bash
- datahub docker ingest-sample-data
- ```
+## Start DataHub
-:::note
+Run the following CLI command from your terminal.
-If you've enabled [Metadata Service Authentication](authentication/introducing-metadata-service-authentication.md), you'll need to provide a Personal Access Token
-using the `--token ` parameter in the command.
+```bash
+datahub docker quickstart
+```
-:::
+This will deploy a DataHub instance using [docker-compose](https://docs.docker.com/compose/).
+If you are curious, the `docker-compose.yaml` file is downloaded to your home directory under the `.datahub/quickstart` directory.
+
+If things go well, you should see messages like the ones below:
+
+```shell-session
+Fetching docker-compose file https://raw.githubusercontent.com/datahub-project/datahub/master/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml from GitHub
+Pulling docker images...
+Finished pulling docker images!
+
+[+] Running 11/11
+⠿ Container zookeeper Running 0.0s
+⠿ Container elasticsearch Running 0.0s
+⠿ Container broker Running 0.0s
+⠿ Container schema-registry Running 0.0s
+⠿ Container elasticsearch-setup Started 0.7s
+⠿ Container kafka-setup Started 0.7s
+⠿ Container mysql Running 0.0s
+⠿ Container datahub-gms Running 0.0s
+⠿ Container mysql-setup Started 0.7s
+⠿ Container datahub-datahub-actions-1 Running 0.0s
+⠿ Container datahub-frontend-react Running 0.0s
+.......
+✔ DataHub is now running
+Ingest some demo data using `datahub docker ingest-sample-data`,
+or head to http://localhost:9002 (username: datahub, password: datahub) to play around with the frontend.
+Need support? Get in touch on Slack: https://slack.datahubproject.io/
+```
-That's it! Now feel free to play around with DataHub!
+:::note Mac M1/M2
-## Troubleshooting Issues
+On Mac computers with Apple Silicon (M1, M2 etc.), you might see an error like `no matching manifest for linux/arm64/v8 in the manifest list entries`.
+This typically means that the datahub cli was not able to detect that you are running it on Apple Silicon.
+To resolve this issue, override the default architecture detection by issuing `datahub docker quickstart --arch m1`
-Please refer to [Quickstart Debugging Guide](./troubleshooting/quickstart.md).
+:::
-## Next Steps
+### Sign In
-### Ingest Metadata
+Upon completion of this step, you should be able to navigate to the DataHub UI at [http://localhost:9002](http://localhost:9002) in your browser.
+You can sign in using the default credentials below.
-To start pushing your company's metadata into DataHub, take a look at [UI-based Ingestion Guide](./ui-ingestion.md), or to run ingestion using the cli, look at the [Metadata Ingestion Guide](../metadata-ingestion/README.md).
+```json
+username: datahub
+password: datahub
+```
-### Invite Users
+To change the default credentials, please refer to [Change the default user datahub in quickstart](authentication/changing-default-credentials.md#quickstart).
-To add users to your deployment to share with your team check out our [Adding Users to DataHub](authentication/guides/add-users.md)
+### Ingest Sample Data
-### Enable Authentication
+To ingest the sample metadata, run the following CLI command from your terminal
-To enable SSO, check out [Configuring OIDC Authentication](authentication/guides/sso/configure-oidc-react.md) or [Configuring JaaS Authentication](authentication/guides/jaas.md).
+```bash
+datahub docker ingest-sample-data
+```
-To enable backend Authentication, check out [authentication in DataHub's backend](authentication/introducing-metadata-service-authentication.md#configuring-metadata-service-authentication).
+:::note Token Authentication
-### Change the Default `datahub` User Credentials
+If you've enabled [Metadata Service Authentication](authentication/introducing-metadata-service-authentication.md), you'll need to provide a Personal Access Token
+using the `--token <token>` parameter in the command.
-:::note
-Please note that deleting the `Data Hub` user in the UI **WILL NOT** disable the default user. You will still be able to log in using the default 'datahub:datahub' credentials. To safely delete the default credentials, please follow the guide provided below.
:::
-Please refer to [Change the default user datahub in quickstart](authentication/changing-default-credentials.md#quickstart).
-
-### Move to Production
+That's it! Now feel free to play around with DataHub!
-We recommend deploying DataHub to production using Kubernetes. We provide helpful [Helm Charts](https://artifacthub.io/packages/helm/datahub/datahub) to help you quickly get up and running. Check out [Deploying DataHub to Kubernetes](./deploy/kubernetes.md) for a step-by-step walkthrough.
+---
-The `quickstart` method of running DataHub is intended for local development and a quick way to experience the features that DataHub has to offer. It is not
-intended for a production environment. This recommendation is based on the following points.
+## Common Operations
-#### Default Credentials
+### Stop DataHub
-`quickstart` uses docker-compose configuration which includes default credentials for both DataHub, and it's underlying
-prerequisite data stores, such as MySQL. Additionally, other components are unauthenticated out of the box. This is a
-design choice to make development easier and is not best practice for a production environment.
-
-#### Exposed Ports
+To stop DataHub's quickstart, you can issue the following command.
-DataHub's services, and it's backend data stores use the docker default behavior of binding to all interface addresses.
-This makes it useful for development but is not recommended in a production environment.
+```bash
+datahub docker quickstart --stop
+```
-#### Performance & Management
+### Reset DataHub
-* `quickstart` is limited by the resources available on a single host, there is no ability to scale horizontally.
-* Rollout of new versions requires downtime.
-* The configuration is largely pre-determined and not easily managed.
-* `quickstart`, by default, follows the most recent builds forcing updates to the latest released and unreleased builds.
+To cleanse DataHub of all of its state (e.g. before ingesting your own), you can use the CLI `nuke` command.
-## Other Common Operations
+```bash
+datahub docker nuke
+```
-### Stopping DataHub
+### Upgrade DataHub
-To stop DataHub's quickstart, you can issue the following command.
+If you have been testing DataHub locally and a new version has been released that you want to try, just issue the quickstart command again. It will pull down newer images and restart your instance without losing any data.
-```
-datahub docker quickstart --stop
+```bash
+datahub docker quickstart
```
-### Resetting DataHub (a.k.a factory reset)
+### Customize installation
-To cleanse DataHub of all of its state (e.g. before ingesting your own), you can use the CLI `nuke` command.
+If you would like to customize the DataHub installation further, please download the [docker-compose.yaml](https://raw.githubusercontent.com/datahub-project/datahub/master/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml) used by the cli tool, modify it as necessary and deploy DataHub by passing the downloaded docker-compose file:
-```
-datahub docker nuke
+```bash
+datahub docker quickstart --quickstart-compose-file
```
-### Backing up your DataHub Quickstart (experimental)
+### Back up DataHub
-The quickstart image is not recommended for use as a production instance. See [Moving to production](#move-to-production) for recommendations on setting up your production cluster. However, in case you want to take a backup of your current quickstart state (e.g. you have a demo to your company coming up and you want to create a copy of the quickstart data so you can restore it at a future date), you can supply the `--backup` flag to quickstart.
+The quickstart image is not recommended for use as a production instance.
+However, in case you want to take a backup of your current quickstart state (e.g. you have a demo to your company coming up and you want to create a copy of the quickstart data so you can restore it at a future date), you can supply the `--backup` flag to quickstart.
-```
+
+
+
+```bash
datahub docker quickstart --backup
```
-will take a backup of your MySQL image and write it by default to your `~/.datahub/quickstart/` directory as the file `backup.sql`. You can customize this by passing a `--backup-file` argument.
-e.g.
+This will take a backup of your MySQL image and write it by default to your `~/.datahub/quickstart/` directory as the file `backup.sql`.
+
+
+
+```bash
+datahub docker quickstart --backup --backup-file
```
-datahub docker quickstart --backup --backup-file /home/my_user/datahub_backups/quickstart_backup_2002_22_01.sql
-```
-:::note
+You can customize the backup file path by passing a `--backup-file` argument.
+
+
+
+
+:::caution
Note that the Quickstart backup does not include any timeseries data (dataset statistics, profiles, etc.), so you will lose that information if you delete all your indexes and restore from this backup.
:::
-### Restoring your DataHub Quickstart (experimental)
+### Restore DataHub
As you might imagine, these backups are restore-able. The following section describes a few different options you have to restore your backup.
-#### Restoring a backup (primary + index) [most common]
+
+
To restore a previous backup, run the following command:
-```
+```bash
datahub docker quickstart --restore
```
@@ -221,38 +220,71 @@ This command will pick up the `backup.sql` file located under `~/.datahub/quicks
To supply a specific backup file, use the `--restore-file` option.
-```
+```bash
datahub docker quickstart --restore --restore-file /home/my_user/datahub_backups/quickstart_backup_2002_22_01.sql
```
-#### Restoring only the index [to deal with index out of sync / corruption issues]
+
+
Another situation that can come up is the index can get corrupt, or be missing some update. In order to re-bootstrap the index from the primary store, you can run this command to sync the index with the primary store.
-```
+```bash
datahub docker quickstart --restore-indices
```
-#### Restoring a backup (primary but NO index) [rarely used]
+
+
+
Sometimes, you might want to just restore the state of your primary database (MySQL), but not re-index the data. To do this, you have to explicitly disable the restore-indices capability.
-```
+```bash
datahub docker quickstart --restore --no-restore-indices
```
-### Upgrading your local DataHub
+
+
-If you have been testing DataHub locally, a new version of DataHub got released and you want to try the new version then you can just issue the quickstart command again. It will pull down newer images and restart your instance without losing any data.
+---
-```
-datahub docker quickstart
-```
+## Next Steps
-### Customization
+- [Quickstart Debugging Guide](./troubleshooting/quickstart.md)
+- [Ingest metadata through the UI](./ui-ingestion.md)
+- [Ingest metadata through the CLI](../metadata-ingestion/README.md)
+- [Add Users to DataHub](authentication/guides/add-users.md)
+- [Configure OIDC Authentication](authentication/guides/sso/configure-oidc-react.md)
+- [Configure JaaS Authentication](authentication/guides/jaas.md)
+- [Configure authentication in DataHub's backend](authentication/introducing-metadata-service-authentication.md#configuring-metadata-service-authentication).
+- [Change the default user datahub in quickstart](authentication/changing-default-credentials.md#quickstart)
-If you would like to customize the DataHub installation further, please download the [docker-compose.yaml](https://raw.githubusercontent.com/datahub-project/datahub/master/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml) used by the cli tool, modify it as necessary and deploy DataHub by passing the downloaded docker-compose file:
+### Move To Production
-```
-datahub docker quickstart --quickstart-compose-file
-```
+:::caution
+
+Quickstart is not intended for a production environment. We recommend deploying DataHub to production using Kubernetes.
+We provide helpful [Helm Charts](https://artifacthub.io/packages/helm/datahub/datahub) to help you quickly get up and running.
+Check out [Deploying DataHub to Kubernetes](./deploy/kubernetes.md) for a step-by-step walkthrough.
+
+:::
+
+The `quickstart` method of running DataHub is intended for local development and a quick way to experience the features that DataHub has to offer.
+It is not intended for a production environment. This recommendation is based on the following points.
+
+#### Default Credentials
+
+`quickstart` uses a docker-compose configuration which includes default credentials for both DataHub and its underlying
+prerequisite data stores, such as MySQL. Additionally, other components are unauthenticated out of the box. This is a
+design choice to make development easier and is not best practice for a production environment.
+
+#### Exposed Ports
+
+DataHub's services and its backend data stores use the Docker default behavior of binding to all interface addresses.
+This makes it useful for development but is not recommended in a production environment.
+
+#### Performance & Management
+
+`quickstart` is limited by the resources available on a single host; there is no ability to scale horizontally.
+Rollout of new versions often requires downtime and the configuration is largely pre-determined and not easily managed.
+Lastly, by default, `quickstart` follows the most recent builds, forcing updates to the latest released and unreleased builds.
From cdb9f5ba620956346479bdbf68920dbdd3f6e0cc Mon Sep 17 00:00:00 2001
From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com>
Date: Sat, 16 Sep 2023 00:25:39 +0530
Subject: [PATCH 23/65] feat(bigquery): add better timers around every API call
(#8626)
---
.../ingestion/source/bigquery_v2/bigquery.py | 241 ++------
.../source/bigquery_v2/bigquery_audit.py | 43 --
.../bigquery_v2/bigquery_audit_log_api.py | 139 +++++
.../source/bigquery_v2/bigquery_config.py | 86 ++-
.../source/bigquery_v2/bigquery_report.py | 53 +-
.../source/bigquery_v2/bigquery_schema.py | 530 ++++++-----------
.../ingestion/source/bigquery_v2/common.py | 34 --
.../ingestion/source/bigquery_v2/lineage.py | 545 +++++++++---------
.../ingestion/source/bigquery_v2/queries.py | 426 ++++++++++++++
.../ingestion/source/bigquery_v2/usage.py | 240 ++------
.../ingestion/source/redshift/lineage.py | 4 +-
.../source/snowflake/snowflake_v2.py | 21 +-
.../src/datahub/utilities/perf_timer.py | 69 ++-
.../integration/bigquery_v2/test_bigquery.py | 14 +-
.../tests/unit/test_bigquery_lineage.py | 11 +-
.../tests/unit/test_bigquery_source.py | 141 +++--
.../unit/test_bigqueryv2_usage_source.py | 11 +-
.../tests/unit/utilities/test_perf_timer.py | 46 ++
18 files changed, 1450 insertions(+), 1204 deletions(-)
create mode 100644 metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py
create mode 100644 metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py
create mode 100644 metadata-ingestion/tests/unit/utilities/test_perf_timer.py
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
index 1107a54a1896bf..ae49a4ba17c114 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -4,7 +4,7 @@
import re
import traceback
from collections import defaultdict
-from datetime import datetime, timedelta, timezone
+from datetime import datetime, timedelta
from typing import Dict, Iterable, List, Optional, Set, Type, Union, cast
from google.cloud import bigquery
@@ -44,21 +44,17 @@
from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report
from datahub.ingestion.source.bigquery_v2.bigquery_schema import (
BigqueryColumn,
- BigQueryDataDictionary,
BigqueryDataset,
BigqueryProject,
+ BigQuerySchemaApi,
BigqueryTable,
BigqueryView,
)
from datahub.ingestion.source.bigquery_v2.common import (
BQ_EXTERNAL_DATASET_URL_TEMPLATE,
BQ_EXTERNAL_TABLE_URL_TEMPLATE,
- get_bigquery_client,
-)
-from datahub.ingestion.source.bigquery_v2.lineage import (
- BigqueryLineageExtractor,
- make_lineage_edges_from_parsing_result,
)
+from datahub.ingestion.source.bigquery_v2.lineage import BigqueryLineageExtractor
from datahub.ingestion.source.bigquery_v2.profiler import BigqueryProfiler
from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor
from datahub.ingestion.source.common.subtypes import (
@@ -83,7 +79,6 @@
StatefulIngestionSourceBase,
)
from datahub.ingestion.source_report.ingestion_stage import (
- LINEAGE_EXTRACTION,
METADATA_EXTRACTION,
PROFILING,
)
@@ -94,7 +89,6 @@
)
from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
DatasetProperties,
- UpstreamLineage,
ViewProperties,
)
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
@@ -113,11 +107,9 @@
)
from datahub.metadata.schema_classes import (
DataPlatformInstanceClass,
- DatasetLineageTypeClass,
GlobalTagsClass,
TagAssociationClass,
)
-from datahub.specific.dataset import DatasetPatchBuilder
from datahub.utilities.file_backed_collections import FileBackedDict
from datahub.utilities.hive_schema_to_avro import (
HiveColumnToAvroConverter,
@@ -126,7 +118,7 @@
from datahub.utilities.mapping import Constants
from datahub.utilities.perf_timer import PerfTimer
from datahub.utilities.registries.domain_registry import DomainRegistry
-from datahub.utilities.sqlglot_lineage import SchemaResolver, sqlglot_lineage
+from datahub.utilities.sqlglot_lineage import SchemaResolver
logger: logging.Logger = logging.getLogger(__name__)
@@ -228,11 +220,15 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config):
set_dataset_urn_to_lower(self.config.convert_urns_to_lowercase)
- self.redundant_lineage_run_skip_handler: Optional[
+ self.bigquery_data_dictionary = BigQuerySchemaApi(
+ self.report.schema_api_perf, self.config.get_bigquery_client()
+ )
+
+ redundant_lineage_run_skip_handler: Optional[
RedundantLineageRunSkipHandler
] = None
if self.config.enable_stateful_lineage_ingestion:
- self.redundant_lineage_run_skip_handler = RedundantLineageRunSkipHandler(
+ redundant_lineage_run_skip_handler = RedundantLineageRunSkipHandler(
source=self,
config=self.config,
pipeline_name=self.ctx.pipeline_name,
@@ -241,7 +237,10 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config):
# For database, schema, tables, views, etc
self.lineage_extractor = BigqueryLineageExtractor(
- config, self.report, self.redundant_lineage_run_skip_handler
+ config,
+ self.report,
+ dataset_urn_builder=self.gen_dataset_urn_from_ref,
+ redundant_run_skip_handler=redundant_lineage_run_skip_handler,
)
redundant_usage_run_skip_handler: Optional[RedundantUsageRunSkipHandler] = None
@@ -289,6 +288,7 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config):
self.sql_parser_schema_resolver = SchemaResolver(
platform=self.platform, env=self.config.env
)
+
self.add_config_to_report()
atexit.register(cleanup, config)
@@ -314,18 +314,20 @@ def metadata_read_capability_test(
for project_id in project_ids:
try:
logger.info((f"Metadata read capability test for project {project_id}"))
- client: bigquery.Client = get_bigquery_client(config)
+ client: bigquery.Client = config.get_bigquery_client()
assert client
- result = BigQueryDataDictionary.get_datasets_for_project_id(
- client, project_id, 10
+ bigquery_data_dictionary = BigQuerySchemaApi(
+ BigQueryV2Report().schema_api_perf, client
+ )
+ result = bigquery_data_dictionary.get_datasets_for_project_id(
+ project_id, 10
)
if len(result) == 0:
return CapabilityReport(
capable=False,
failure_reason=f"Dataset query returned empty dataset. It is either empty or no dataset in project {project_id}",
)
- tables = BigQueryDataDictionary.get_tables_for_dataset(
- conn=client,
+ tables = bigquery_data_dictionary.get_tables_for_dataset(
project_id=project_id,
dataset_name=result[0].name,
tables={},
@@ -351,7 +353,9 @@ def lineage_capability_test(
project_ids: List[str],
report: BigQueryV2Report,
) -> CapabilityReport:
- lineage_extractor = BigqueryLineageExtractor(connection_conf, report)
+ lineage_extractor = BigqueryLineageExtractor(
+ connection_conf, report, lambda ref: ""
+ )
for project_id in project_ids:
try:
logger.info(f"Lineage capability test for project {project_id}")
@@ -397,7 +401,7 @@ def test_connection(config_dict: dict) -> TestConnectionReport:
try:
connection_conf = BigQueryV2Config.parse_obj_allow_extras(config_dict)
- client: bigquery.Client = get_bigquery_client(connection_conf)
+ client: bigquery.Client = connection_conf.get_bigquery_client()
assert client
test_report.basic_connectivity = BigqueryV2Source.connectivity_test(client)
@@ -519,54 +523,30 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
]
def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
- conn: bigquery.Client = get_bigquery_client(self.config)
-
- projects = self._get_projects(conn)
+ projects = self._get_projects()
if not projects:
return
for project_id in projects:
self.report.set_ingestion_stage(project_id.id, METADATA_EXTRACTION)
logger.info(f"Processing project: {project_id.id}")
- yield from self._process_project(conn, project_id)
+ yield from self._process_project(project_id)
if self.config.include_usage_statistics:
yield from self.usage_extractor.get_usage_workunits(
[p.id for p in projects], self.table_refs
)
- if self._should_ingest_lineage():
- for project in projects:
- self.report.set_ingestion_stage(project.id, LINEAGE_EXTRACTION)
- yield from self.generate_lineage(project.id)
-
- if self.redundant_lineage_run_skip_handler:
- # Update the checkpoint state for this run.
- self.redundant_lineage_run_skip_handler.update_state(
- self.config.start_time, self.config.end_time
- )
-
- def _should_ingest_lineage(self) -> bool:
- if not self.config.include_table_lineage:
- return False
-
- if (
- self.redundant_lineage_run_skip_handler
- and self.redundant_lineage_run_skip_handler.should_skip_this_run(
- cur_start_time=self.config.start_time,
- cur_end_time=self.config.end_time,
+ if self.config.include_table_lineage:
+ yield from self.lineage_extractor.get_lineage_workunits(
+ [p.id for p in projects],
+ self.sql_parser_schema_resolver,
+ self.view_refs_by_project,
+ self.view_definitions,
+ self.table_refs,
)
- ):
- # Skip this run
- self.report.report_warning(
- "lineage-extraction",
- "Skip this run as there was already a run for current ingestion window.",
- )
- return False
-
- return True
- def _get_projects(self, conn: bigquery.Client) -> List[BigqueryProject]:
+ def _get_projects(self) -> List[BigqueryProject]:
logger.info("Getting projects")
if self.config.project_ids or self.config.project_id:
project_ids = self.config.project_ids or [self.config.project_id] # type: ignore
@@ -575,15 +555,10 @@ def _get_projects(self, conn: bigquery.Client) -> List[BigqueryProject]:
for project_id in project_ids
]
else:
- return list(self._get_project_list(conn))
-
- def _get_project_list(self, conn: bigquery.Client) -> Iterable[BigqueryProject]:
- try:
- projects = BigQueryDataDictionary.get_projects(conn)
- except Exception as e:
- logger.error(f"Error getting projects. {e}", exc_info=True)
- projects = []
+ return list(self._query_project_list())
+ def _query_project_list(self) -> Iterable[BigqueryProject]:
+ projects = self.bigquery_data_dictionary.get_projects()
if not projects: # Report failure on exception and if empty list is returned
self.report.report_failure(
"metadata-extraction",
@@ -600,7 +575,7 @@ def _get_project_list(self, conn: bigquery.Client) -> Iterable[BigqueryProject]:
self.report.report_dropped(project.id)
def _process_project(
- self, conn: bigquery.Client, bigquery_project: BigqueryProject
+ self, bigquery_project: BigqueryProject
) -> Iterable[MetadataWorkUnit]:
db_tables: Dict[str, List[BigqueryTable]] = {}
db_views: Dict[str, List[BigqueryView]] = {}
@@ -611,7 +586,7 @@ def _process_project(
try:
bigquery_project.datasets = (
- BigQueryDataDictionary.get_datasets_for_project_id(conn, project_id)
+ self.bigquery_data_dictionary.get_datasets_for_project_id(project_id)
)
except Exception as e:
error_message = f"Unable to get datasets for project {project_id}, skipping. The error was: {e}"
@@ -645,7 +620,7 @@ def _process_project(
try:
# db_tables and db_views are populated in the this method
yield from self._process_schema(
- conn, project_id, bigquery_dataset, db_tables, db_views
+ project_id, bigquery_dataset, db_tables, db_views
)
except Exception as e:
@@ -670,73 +645,8 @@ def _process_project(
tables=db_tables,
)
- def generate_lineage(self, project_id: str) -> Iterable[MetadataWorkUnit]:
- logger.info(f"Generate lineage for {project_id}")
- lineage = self.lineage_extractor.calculate_lineage_for_project(
- project_id,
- sql_parser_schema_resolver=self.sql_parser_schema_resolver,
- )
-
- if self.config.lineage_parse_view_ddl:
- for view in self.view_refs_by_project[project_id]:
- view_definition = self.view_definitions[view]
- raw_view_lineage = sqlglot_lineage(
- view_definition,
- schema_resolver=self.sql_parser_schema_resolver,
- default_db=project_id,
- )
- if raw_view_lineage.debug_info.table_error:
- logger.debug(
- f"Failed to parse lineage for view {view}: {raw_view_lineage.debug_info.table_error}"
- )
- self.report.num_view_definitions_failed_parsing += 1
- self.report.view_definitions_parsing_failures.append(
- f"Table-level sql parsing error for view {view}: {raw_view_lineage.debug_info.table_error}"
- )
- continue
- elif raw_view_lineage.debug_info.column_error:
- self.report.num_view_definitions_failed_column_parsing += 1
- self.report.view_definitions_parsing_failures.append(
- f"Column-level sql parsing error for view {view}: {raw_view_lineage.debug_info.column_error}"
- )
- else:
- self.report.num_view_definitions_parsed += 1
-
- # For views, we override the upstreams obtained by parsing audit logs
- # as they may contain indirectly referenced tables.
- ts = datetime.now(timezone.utc)
- lineage[view] = set(
- make_lineage_edges_from_parsing_result(
- raw_view_lineage,
- audit_stamp=ts,
- lineage_type=DatasetLineageTypeClass.VIEW,
- )
- )
-
- for lineage_key in lineage.keys():
- if lineage_key not in self.table_refs:
- continue
-
- table_ref = BigQueryTableRef.from_string_name(lineage_key)
- dataset_urn = self.gen_dataset_urn(
- project_id=table_ref.table_identifier.project_id,
- dataset_name=table_ref.table_identifier.dataset,
- table=table_ref.table_identifier.get_table_display_name(),
- )
-
- lineage_info = self.lineage_extractor.get_lineage_for_table(
- bq_table=table_ref,
- bq_table_urn=dataset_urn,
- platform=self.platform,
- lineage_metadata=lineage,
- )
-
- if lineage_info:
- yield from self.gen_lineage(dataset_urn, lineage_info)
-
def _process_schema(
self,
- conn: bigquery.Client,
project_id: str,
bigquery_dataset: BigqueryDataset,
db_tables: Dict[str, List[BigqueryTable]],
@@ -750,8 +660,7 @@ def _process_schema(
columns = None
if self.config.include_tables or self.config.include_views:
- columns = BigQueryDataDictionary.get_columns_for_dataset(
- conn,
+ columns = self.bigquery_data_dictionary.get_columns_for_dataset(
project_id=project_id,
dataset_name=dataset_name,
column_limit=self.config.column_limit,
@@ -760,7 +669,7 @@ def _process_schema(
if self.config.include_tables:
db_tables[dataset_name] = list(
- self.get_tables_for_dataset(conn, project_id, dataset_name)
+ self.get_tables_for_dataset(project_id, dataset_name)
)
for table in db_tables[dataset_name]:
@@ -773,7 +682,9 @@ def _process_schema(
)
elif self.config.include_table_lineage or self.config.include_usage_statistics:
# Need table_refs to calculate lineage and usage
- for table_item in conn.list_tables(f"{project_id}.{dataset_name}"):
+ for table_item in self.bigquery_data_dictionary.list_tables(
+ dataset_name, project_id
+ ):
identifier = BigqueryTableIdentifier(
project_id=project_id,
dataset=dataset_name,
@@ -793,8 +704,8 @@ def _process_schema(
if self.config.include_views:
db_views[dataset_name] = list(
- BigQueryDataDictionary.get_views_for_dataset(
- conn, project_id, dataset_name, self.config.is_profiling_enabled()
+ self.bigquery_data_dictionary.get_views_for_dataset(
+ project_id, dataset_name, self.config.is_profiling_enabled()
)
)
@@ -1065,39 +976,6 @@ def gen_dataset_workunits(
domain_config=self.config.domain,
)
- def gen_lineage(
- self,
- dataset_urn: str,
- upstream_lineage: Optional[UpstreamLineage] = None,
- ) -> Iterable[MetadataWorkUnit]:
- if upstream_lineage is None:
- return
-
- if upstream_lineage is not None:
- if self.config.incremental_lineage:
- patch_builder: DatasetPatchBuilder = DatasetPatchBuilder(
- urn=dataset_urn
- )
- for upstream in upstream_lineage.upstreams:
- patch_builder.add_upstream_lineage(upstream)
-
- yield from [
- MetadataWorkUnit(
- id=f"upstreamLineage-for-{dataset_urn}",
- mcp_raw=mcp,
- )
- for mcp in patch_builder.build()
- ]
- else:
- if not self.config.extract_column_lineage:
- upstream_lineage.fineGrainedLineages = None
-
- yield from [
- MetadataChangeProposalWrapper(
- entityUrn=dataset_urn, aspect=upstream_lineage
- ).as_workunit()
- ]
-
def gen_tags_aspect_workunit(
self, dataset_urn: str, tags_to_add: List[str]
) -> MetadataWorkUnit:
@@ -1212,7 +1090,6 @@ def get_report(self) -> BigQueryV2Report:
def get_tables_for_dataset(
self,
- conn: bigquery.Client,
project_id: str,
dataset_name: str,
) -> Iterable[BigqueryTable]:
@@ -1231,14 +1108,15 @@ def get_tables_for_dataset(
# We get the list of tables in the dataset to get core table properties and to be able to process the tables in batches
# We collect only the latest shards from sharded tables (tables with _YYYYMMDD suffix) and ignore temporary tables
- table_items = self.get_core_table_details(conn, dataset_name, project_id)
+ table_items = self.get_core_table_details(
+ dataset_name, project_id, self.config.temp_table_dataset_prefix
+ )
items_to_get: Dict[str, TableListItem] = {}
for table_item in table_items.keys():
items_to_get[table_item] = table_items[table_item]
if len(items_to_get) % max_batch_size == 0:
- yield from BigQueryDataDictionary.get_tables_for_dataset(
- conn,
+ yield from self.bigquery_data_dictionary.get_tables_for_dataset(
project_id,
dataset_name,
items_to_get,
@@ -1247,8 +1125,7 @@ def get_tables_for_dataset(
items_to_get.clear()
if items_to_get:
- yield from BigQueryDataDictionary.get_tables_for_dataset(
- conn,
+ yield from self.bigquery_data_dictionary.get_tables_for_dataset(
project_id,
dataset_name,
items_to_get,
@@ -1260,13 +1137,15 @@ def get_tables_for_dataset(
)
def get_core_table_details(
- self, conn: bigquery.Client, dataset_name: str, project_id: str
+ self, dataset_name: str, project_id: str, temp_table_dataset_prefix: str
) -> Dict[str, TableListItem]:
table_items: Dict[str, TableListItem] = {}
# Dict to store sharded table and the last seen max shard id
sharded_tables: Dict[str, TableListItem] = {}
- for table in conn.list_tables(f"{project_id}.{dataset_name}"):
+ for table in self.bigquery_data_dictionary.list_tables(
+ dataset_name, project_id
+ ):
table_identifier = BigqueryTableIdentifier(
project_id=project_id,
dataset=dataset_name,
@@ -1303,9 +1182,7 @@ def get_core_table_details(
if stored_shard < shard:
sharded_tables[table_name] = table
continue
- elif str(table_identifier).startswith(
- self.config.temp_table_dataset_prefix
- ):
+ elif str(table_identifier).startswith(temp_table_dataset_prefix):
logger.debug(f"Dropping temporary table {table_identifier.table}")
self.report.report_dropped(table_identifier.raw_table_name())
continue
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py
index 0f9b37c93feaa0..b0ac77201b415b 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py
@@ -13,48 +13,6 @@
get_first_missing_key_any,
)
-BQ_FILTER_RULE_TEMPLATE = "BQ_FILTER_RULE_TEMPLATE"
-
-BQ_AUDIT_V2 = {
- BQ_FILTER_RULE_TEMPLATE: """
-resource.type=("bigquery_project" OR "bigquery_dataset")
-AND
-timestamp >= "{start_time}"
-AND
-timestamp < "{end_time}"
-AND protoPayload.serviceName="bigquery.googleapis.com"
-AND
-(
- (
- protoPayload.methodName=
- (
- "google.cloud.bigquery.v2.JobService.Query"
- OR
- "google.cloud.bigquery.v2.JobService.InsertJob"
- )
- AND protoPayload.metadata.jobChange.job.jobStatus.jobState="DONE"
- AND NOT protoPayload.metadata.jobChange.job.jobStatus.errorResult:*
- AND protoPayload.metadata.jobChange.job.jobConfig.queryConfig:*
- AND
- (
- (
- protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables:*
- AND NOT protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables =~ "projects/.*/datasets/.*/tables/__TABLES__|__TABLES_SUMMARY__|INFORMATION_SCHEMA.*"
- )
- OR
- (
- protoPayload.metadata.jobChange.job.jobConfig.queryConfig.destinationTable:*
- )
- )
- )
- OR
- protoPayload.metadata.tableDataRead.reason = "JOB"
-)
-""".strip(
- "\t \n"
- ),
-}
-
AuditLogEntry = Any
# BigQueryAuditMetadata is the v2 format in which audit logs are exported to BigQuery
@@ -606,7 +564,6 @@ def from_query_event(
query_event: QueryEvent,
debug_include_full_payloads: bool = False,
) -> "ReadEvent":
-
readEvent = ReadEvent(
actor_email=query_event.actor_email,
timestamp=query_event.timestamp,
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py
new file mode 100644
index 00000000000000..03b12c61ee5c6c
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py
@@ -0,0 +1,139 @@
+import logging
+from datetime import datetime
+from typing import Callable, Iterable, List, Optional
+
+from google.cloud import bigquery
+from google.cloud.logging_v2.client import Client as GCPLoggingClient
+from ratelimiter import RateLimiter
+
+from datahub.ingestion.source.bigquery_v2.bigquery_audit import (
+ AuditLogEntry,
+ BigQueryAuditMetadata,
+)
+from datahub.ingestion.source.bigquery_v2.bigquery_report import (
+ BigQueryAuditLogApiPerfReport,
+)
+from datahub.ingestion.source.bigquery_v2.common import (
+ BQ_DATE_SHARD_FORMAT,
+ BQ_DATETIME_FORMAT,
+)
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+# Api interfaces are separated based on functionality they provide
+# rather than the underlying bigquery client that is used to
+# provide the functionality.
+class BigQueryAuditLogApi:
+ def __init__(
+ self,
+ report: BigQueryAuditLogApiPerfReport,
+ rate_limit: bool,
+ requests_per_min: int,
+ ) -> None:
+ self.report = report
+ self.rate_limit = rate_limit
+ self.requests_per_min = requests_per_min
+
+ def get_exported_bigquery_audit_metadata(
+ self,
+ bigquery_client: bigquery.Client,
+ bigquery_audit_metadata_query_template: Callable[
+ [
+ str, # dataset: str
+ bool, # use_date_sharded_tables: bool
+ Optional[int], # limit: Optional[int] = None
+ ],
+ str,
+ ],
+ bigquery_audit_metadata_datasets: Optional[List[str]],
+ use_date_sharded_audit_log_tables: bool,
+ start_time: datetime,
+ end_time: datetime,
+ limit: Optional[int] = None,
+ ) -> Iterable[BigQueryAuditMetadata]:
+ if bigquery_audit_metadata_datasets is None:
+ return
+
+ audit_start_time = start_time.strftime(BQ_DATETIME_FORMAT)
+ audit_start_date = start_time.strftime(BQ_DATE_SHARD_FORMAT)
+
+ audit_end_time = end_time.strftime(BQ_DATETIME_FORMAT)
+ audit_end_date = end_time.strftime(BQ_DATE_SHARD_FORMAT)
+
+ rate_limiter: Optional[RateLimiter] = None
+ if self.rate_limit:
+ rate_limiter = RateLimiter(max_calls=self.requests_per_min, period=60)
+
+ with self.report.get_exported_log_entries as current_timer:
+ for dataset in bigquery_audit_metadata_datasets:
+ logger.info(
+ f"Start loading log entries from BigQueryAuditMetadata in {dataset}"
+ )
+
+ query = bigquery_audit_metadata_query_template(
+ dataset,
+ use_date_sharded_audit_log_tables,
+ limit,
+ ).format(
+ start_time=audit_start_time,
+ end_time=audit_end_time,
+ start_date=audit_start_date,
+ end_date=audit_end_date,
+ )
+
+ query_job = bigquery_client.query(query)
+ logger.info(
+ f"Finished loading log entries from BigQueryAuditMetadata in {dataset}"
+ )
+
+ for entry in query_job:
+ with current_timer.pause():
+ if rate_limiter:
+ with rate_limiter:
+ yield entry
+ else:
+ yield entry
+
+ def get_bigquery_log_entries_via_gcp_logging(
+ self,
+ client: GCPLoggingClient,
+ filter: str,
+ log_page_size: int,
+ limit: Optional[int] = None,
+ ) -> Iterable[AuditLogEntry]:
+ logger.debug(filter)
+
+ list_entries: Iterable[AuditLogEntry]
+ rate_limiter: Optional[RateLimiter] = None
+ if self.rate_limit:
+ # client.list_entries is a generator, does api calls to GCP Logging when it runs out of entries and needs to fetch more from GCP Logging
+ # to properly ratelimit we multiply the page size by the number of requests per minute
+ rate_limiter = RateLimiter(
+ max_calls=self.requests_per_min * log_page_size,
+ period=60,
+ )
+
+ with self.report.list_log_entries as current_timer:
+ list_entries = client.list_entries(
+ filter_=filter,
+ page_size=log_page_size,
+ max_results=limit,
+ )
+
+ for i, entry in enumerate(list_entries):
+ if i % 1000 == 0:
+ logger.info(
+ f"Loaded {i} log entries from GCP Log for {client.project}"
+ )
+
+ with current_timer.pause():
+ if rate_limiter:
+ with rate_limiter:
+ yield entry
+ else:
+ yield entry
+
+ logger.info(
+ f"Finished loading log entries from GCP Log for {client.project}"
+ )
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py
index 0f2082c5e53bf2..3b06a4699c5660 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py
@@ -4,9 +4,11 @@
from typing import Any, Dict, List, Optional
import pydantic
-from pydantic import Field, PositiveInt, PrivateAttr, root_validator
+from google.cloud import bigquery
+from google.cloud.logging_v2.client import Client as GCPLoggingClient
+from pydantic import Field, PositiveInt, PrivateAttr, root_validator, validator
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, ConfigModel
from datahub.configuration.validate_field_removal import pydantic_removed_field
from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
from datahub.ingestion.source.state.stateful_ingestion_base import (
@@ -35,7 +37,52 @@ class BigQueryUsageConfig(BaseUsageConfig):
)
+class BigQueryConnectionConfig(ConfigModel):
+ credential: Optional[BigQueryCredential] = Field(
+ default=None, description="BigQuery credential informations"
+ )
+
+ _credentials_path: Optional[str] = PrivateAttr(None)
+
+ extra_client_options: Dict[str, Any] = Field(
+ default={},
+ description="Additional options to pass to google.cloud.logging_v2.client.Client.",
+ )
+
+ project_on_behalf: Optional[str] = Field(
+ default=None,
+ description="[Advanced] The BigQuery project in which queries are executed. Will be passed when creating a job. If not passed, falls back to the project associated with the service account.",
+ )
+
+ def __init__(self, **data: Any):
+ super().__init__(**data)
+
+ if self.credential:
+ self._credentials_path = self.credential.create_credential_temp_file()
+ logger.debug(
+ f"Creating temporary credential file at {self._credentials_path}"
+ )
+ os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._credentials_path
+
+ def get_bigquery_client(config) -> bigquery.Client:
+ client_options = config.extra_client_options
+ return bigquery.Client(config.project_on_behalf, **client_options)
+
+ def make_gcp_logging_client(
+ self, project_id: Optional[str] = None
+ ) -> GCPLoggingClient:
+ # See https://github.com/googleapis/google-cloud-python/issues/2674 for
+ # why we disable gRPC here.
+ client_options = self.extra_client_options.copy()
+ client_options["_use_grpc"] = False
+ if project_id is not None:
+ return GCPLoggingClient(**client_options, project=project_id)
+ else:
+ return GCPLoggingClient(**client_options)
+
+
class BigQueryV2Config(
+ BigQueryConnectionConfig,
BigQueryBaseConfig,
SQLCommonConfig,
StatefulUsageConfigMixin,
@@ -122,11 +169,6 @@ class BigQueryV2Config(
),
)
- project_on_behalf: Optional[str] = Field(
- default=None,
- description="[Advanced] The BigQuery project in which queries are executed. Will be passed when creating a job. If not passed, falls back to the project associated with the service account.",
- )
-
storage_project_id: None = Field(default=None, hidden_from_docs=True)
lineage_use_sql_parser: bool = Field(
@@ -180,14 +222,8 @@ def validate_column_lineage(cls, v: bool, values: Dict[str, Any]) -> bool:
default=1000,
description="The number of log item will be queried per page for lineage collection",
)
- credential: Optional[BigQueryCredential] = Field(
- description="BigQuery credential informations"
- )
+
# extra_client_options, include_table_lineage and max_query_duration are relevant only when computing the lineage.
- extra_client_options: Dict[str, Any] = Field(
- default={},
- description="Additional options to pass to google.cloud.logging_v2.client.Client.",
- )
include_table_lineage: Optional[bool] = Field(
default=True,
description="Option to enable/disable lineage generation. Is enabled by default.",
@@ -209,7 +245,6 @@ def validate_column_lineage(cls, v: bool, values: Dict[str, Any]) -> bool:
default=False,
description="Whether to read date sharded tables or time partitioned tables when extracting usage from exported audit logs.",
)
- _credentials_path: Optional[str] = PrivateAttr(None)
_cache_path: Optional[str] = PrivateAttr(None)
@@ -230,16 +265,6 @@ def validate_column_lineage(cls, v: bool, values: Dict[str, Any]) -> bool:
description="Maximum number of entries for the in-memory caches of FileBacked data structures.",
)
- def __init__(self, **data: Any):
- super().__init__(**data)
-
- if self.credential:
- self._credentials_path = self.credential.create_credential_temp_file()
- logger.debug(
- f"Creating temporary credential file at {self._credentials_path}"
- )
- os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._credentials_path
-
@root_validator(pre=False)
def profile_default_settings(cls, values: Dict) -> Dict:
# Extra default SQLAlchemy option for better connection pooling and threading.
@@ -248,6 +273,17 @@ def profile_default_settings(cls, values: Dict) -> Dict:
return values
+ @validator("bigquery_audit_metadata_datasets")
+ def validate_bigquery_audit_metadata_datasets(
+ cls, v: Optional[List[str]], values: Dict
+ ) -> Optional[List[str]]:
+ if values.get("use_exported_bigquery_audit_metadata"):
+ assert (
+ v and len(v) > 0
+ ), "`bigquery_audit_metadata_datasets` should be set if using `use_exported_bigquery_audit_metadata: True`."
+
+ return v
+
@root_validator(pre=False)
def backward_compatibility_configs_set(cls, values: Dict) -> Dict:
project_id = values.get("project_id")
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py
index b2251fbb8ab1f2..2d6882caa38ef7 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py
@@ -1,5 +1,4 @@
import collections
-import dataclasses
import logging
from dataclasses import dataclass, field
from datetime import datetime
@@ -11,11 +10,26 @@
from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
from datahub.utilities.lossy_collections import LossyDict, LossyList
+from datahub.utilities.perf_timer import PerfTimer
from datahub.utilities.stats_collections import TopKDict, int_top_k_dict
logger: logging.Logger = logging.getLogger(__name__)
+class BigQuerySchemaApiPerfReport:
+ list_projects = PerfTimer()
+ list_datasets = PerfTimer()
+ get_columns_for_dataset = PerfTimer()
+ get_tables_for_dataset = PerfTimer()
+ list_tables = PerfTimer()
+ get_views_for_dataset = PerfTimer()
+
+
+class BigQueryAuditLogApiPerfReport:
+ get_exported_log_entries = PerfTimer()
+ list_log_entries = PerfTimer()
+
+
@dataclass
class BigQueryV2Report(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowReport):
num_total_lineage_entries: TopKDict[str, int] = field(default_factory=TopKDict)
@@ -31,8 +45,12 @@ class BigQueryV2Report(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowR
num_skipped_lineage_entries_other: TopKDict[str, int] = field(
default_factory=int_top_k_dict
)
- num_total_log_entries: TopKDict[str, int] = field(default_factory=int_top_k_dict)
- num_parsed_log_entries: TopKDict[str, int] = field(default_factory=int_top_k_dict)
+ num_lineage_total_log_entries: TopKDict[str, int] = field(
+ default_factory=int_top_k_dict
+ )
+ num_lineage_parsed_log_entries: TopKDict[str, int] = field(
+ default_factory=int_top_k_dict
+ )
num_lineage_log_parse_failures: TopKDict[str, int] = field(
default_factory=int_top_k_dict
)
@@ -42,7 +60,14 @@ class BigQueryV2Report(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowR
lineage_mem_size: Dict[str, str] = field(default_factory=TopKDict)
lineage_extraction_sec: Dict[str, float] = field(default_factory=TopKDict)
usage_extraction_sec: Dict[str, float] = field(default_factory=TopKDict)
+ num_usage_total_log_entries: TopKDict[str, int] = field(
+ default_factory=int_top_k_dict
+ )
+ num_usage_parsed_log_entries: TopKDict[str, int] = field(
+ default_factory=int_top_k_dict
+ )
usage_error_count: Dict[str, int] = field(default_factory=int_top_k_dict)
+
num_usage_resources_dropped: int = 0
num_usage_operations_dropped: int = 0
operation_dropped: LossyList[str] = field(default_factory=LossyList)
@@ -53,10 +78,10 @@ class BigQueryV2Report(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowR
use_date_sharded_audit_log_tables: Optional[bool] = None
log_page_size: Optional[pydantic.PositiveInt] = None
use_exported_bigquery_audit_metadata: Optional[bool] = None
- log_entry_start_time: Optional[str] = None
- log_entry_end_time: Optional[str] = None
- audit_start_time: Optional[str] = None
- audit_end_time: Optional[str] = None
+ log_entry_start_time: Optional[datetime] = None
+ log_entry_end_time: Optional[datetime] = None
+ audit_start_time: Optional[datetime] = None
+ audit_end_time: Optional[datetime] = None
upstream_lineage: LossyDict = field(default_factory=LossyDict)
partition_info: Dict[str, str] = field(default_factory=TopKDict)
profile_table_selection_criteria: Dict[str, str] = field(default_factory=TopKDict)
@@ -89,13 +114,17 @@ class BigQueryV2Report(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowR
num_view_definitions_failed_column_parsing: int = 0
view_definitions_parsing_failures: LossyList[str] = field(default_factory=LossyList)
- read_reasons_stat: Counter[str] = dataclasses.field(
- default_factory=collections.Counter
+ read_reasons_stat: Counter[str] = field(default_factory=collections.Counter)
+ operation_types_stat: Counter[str] = field(default_factory=collections.Counter)
+
+ usage_state_size: Optional[str] = None
+
+ schema_api_perf: BigQuerySchemaApiPerfReport = field(
+ default_factory=BigQuerySchemaApiPerfReport
)
- operation_types_stat: Counter[str] = dataclasses.field(
- default_factory=collections.Counter
+ audit_log_api_perf: BigQueryAuditLogApiPerfReport = field(
+ default_factory=BigQueryAuditLogApiPerfReport
)
- usage_state_size: Optional[str] = None
lineage_start_time: Optional[datetime] = None
lineage_end_time: Optional[datetime] = None
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py
index 47a04c545231b9..7edc8656360bb8 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py
@@ -13,22 +13,19 @@
)
from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier
-from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report
+from datahub.ingestion.source.bigquery_v2.bigquery_report import (
+ BigQuerySchemaApiPerfReport,
+ BigQueryV2Report,
+)
+from datahub.ingestion.source.bigquery_v2.queries import (
+ BigqueryQuery,
+ BigqueryTableType,
+)
from datahub.ingestion.source.sql.sql_generic import BaseColumn, BaseTable, BaseView
logger: logging.Logger = logging.getLogger(__name__)
-class BigqueryTableType:
- # See https://cloud.google.com/bigquery/docs/information-schema-tables#schema
- BASE_TABLE = "BASE TABLE"
- EXTERNAL = "EXTERNAL"
- VIEW = "VIEW"
- MATERIALIZED_VIEW = "MATERIALIZED VIEW"
- CLONE = "CLONE"
- SNAPSHOT = "SNAPSHOT"
-
-
@dataclass
class BigqueryColumn(BaseColumn):
field_path: str
@@ -129,253 +126,43 @@ class BigqueryProject:
datasets: List[BigqueryDataset] = field(default_factory=list)
-class BigqueryQuery:
- show_datasets: str = (
- "select schema_name from `{project_id}`.INFORMATION_SCHEMA.SCHEMATA"
- )
-
- datasets_for_project_id: str = """
-select
- s.CATALOG_NAME as catalog_name,
- s.schema_name as table_schema,
- s.location as location,
- s.CREATION_TIME as created,
- s.LAST_MODIFIED_TIME as last_altered,
- o.OPTION_VALUE as comment
-from
- `{project_id}`.INFORMATION_SCHEMA.SCHEMATA as s
- left join `{project_id}`.INFORMATION_SCHEMA.SCHEMATA_OPTIONS as o on o.schema_name = s.schema_name
- and o.option_name = "description"
-order by
- s.schema_name
-"""
-
- # https://cloud.google.com/bigquery/docs/information-schema-table-storage?hl=en
- # Note for max_partition_id -
- # should we instead pick the partition with latest LAST_MODIFIED_TIME ?
- # for range partitioning max may not be latest partition
- tables_for_dataset = f"""
-SELECT
- t.table_catalog as table_catalog,
- t.table_schema as table_schema,
- t.table_name as table_name,
- t.table_type as table_type,
- t.creation_time as created,
- ts.last_modified_time as last_altered,
- tos.OPTION_VALUE as comment,
- is_insertable_into,
- ddl,
- row_count,
- size_bytes as bytes,
- num_partitions,
- max_partition_id,
- active_billable_bytes,
- long_term_billable_bytes,
- REGEXP_EXTRACT(t.table_name, r".*_(\\d+)$") as table_suffix,
- REGEXP_REPLACE(t.table_name, r"_(\\d+)$", "") as table_base
-
-FROM
- `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t
- join `{{project_id}}`.`{{dataset_name}}`.__TABLES__ as ts on ts.table_id = t.TABLE_NAME
- left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema
- and t.TABLE_NAME = tos.TABLE_NAME
- and tos.OPTION_NAME = "description"
- left join (
- select
- table_name,
- sum(case when partition_id not in ('__NULL__', '__UNPARTITIONED__', '__STREAMING_UNPARTITIONED__') then 1 else 0 END) as num_partitions,
- max(case when partition_id not in ('__NULL__', '__UNPARTITIONED__', '__STREAMING_UNPARTITIONED__') then partition_id else NULL END) as max_partition_id,
- sum(total_rows) as total_rows,
- sum(case when storage_tier = 'LONG_TERM' then total_billable_bytes else 0 end) as long_term_billable_bytes,
- sum(case when storage_tier = 'ACTIVE' then total_billable_bytes else 0 end) as active_billable_bytes,
- from
- `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.PARTITIONS
- group by
- table_name) as p on
- t.table_name = p.table_name
-WHERE
- table_type in ('{BigqueryTableType.BASE_TABLE}', '{BigqueryTableType.EXTERNAL}')
-{{table_filter}}
-order by
- table_schema ASC,
- table_base ASC,
- table_suffix DESC
-"""
-
- tables_for_dataset_without_partition_data = f"""
-SELECT
- t.table_catalog as table_catalog,
- t.table_schema as table_schema,
- t.table_name as table_name,
- t.table_type as table_type,
- t.creation_time as created,
- tos.OPTION_VALUE as comment,
- is_insertable_into,
- ddl,
- REGEXP_EXTRACT(t.table_name, r".*_(\\d+)$") as table_suffix,
- REGEXP_REPLACE(t.table_name, r"_(\\d+)$", "") as table_base
-
-FROM
- `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t
- left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema
- and t.TABLE_NAME = tos.TABLE_NAME
- and tos.OPTION_NAME = "description"
-WHERE
- table_type in ('{BigqueryTableType.BASE_TABLE}', '{BigqueryTableType.EXTERNAL}')
-{{table_filter}}
-order by
- table_schema ASC,
- table_base ASC,
- table_suffix DESC
-"""
-
- views_for_dataset: str = f"""
-SELECT
- t.table_catalog as table_catalog,
- t.table_schema as table_schema,
- t.table_name as table_name,
- t.table_type as table_type,
- t.creation_time as created,
- ts.last_modified_time as last_altered,
- tos.OPTION_VALUE as comment,
- is_insertable_into,
- ddl as view_definition,
- row_count,
- size_bytes
-FROM
- `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t
- join `{{project_id}}`.`{{dataset_name}}`.__TABLES__ as ts on ts.table_id = t.TABLE_NAME
- left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema
- and t.TABLE_NAME = tos.TABLE_NAME
- and tos.OPTION_NAME = "description"
-WHERE
- table_type in ('{BigqueryTableType.VIEW}', '{BigqueryTableType.MATERIALIZED_VIEW}')
-order by
- table_schema ASC,
- table_name ASC
-"""
-
- views_for_dataset_without_data_read: str = f"""
-SELECT
- t.table_catalog as table_catalog,
- t.table_schema as table_schema,
- t.table_name as table_name,
- t.table_type as table_type,
- t.creation_time as created,
- tos.OPTION_VALUE as comment,
- is_insertable_into,
- ddl as view_definition
-FROM
- `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t
- left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema
- and t.TABLE_NAME = tos.TABLE_NAME
- and tos.OPTION_NAME = "description"
-WHERE
- table_type in ('{BigqueryTableType.VIEW}', '{BigqueryTableType.MATERIALIZED_VIEW}')
-order by
- table_schema ASC,
- table_name ASC
-"""
-
- columns_for_dataset: str = """
-select
- c.table_catalog as table_catalog,
- c.table_schema as table_schema,
- c.table_name as table_name,
- c.column_name as column_name,
- c.ordinal_position as ordinal_position,
- cfp.field_path as field_path,
- c.is_nullable as is_nullable,
- CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type,
- description as comment,
- c.is_hidden as is_hidden,
- c.is_partitioning_column as is_partitioning_column,
- c.clustering_ordinal_position as clustering_ordinal_position,
-from
- `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMNS c
- join `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS as cfp on cfp.table_name = c.table_name
- and cfp.column_name = c.column_name
-ORDER BY
- table_catalog, table_schema, table_name, ordinal_position ASC, data_type DESC"""
-
- optimized_columns_for_dataset: str = """
-select * from
-(select
- c.table_catalog as table_catalog,
- c.table_schema as table_schema,
- c.table_name as table_name,
- c.column_name as column_name,
- c.ordinal_position as ordinal_position,
- cfp.field_path as field_path,
- c.is_nullable as is_nullable,
- CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type,
- description as comment,
- c.is_hidden as is_hidden,
- c.is_partitioning_column as is_partitioning_column,
- c.clustering_ordinal_position as clustering_ordinal_position,
- -- We count the columns to be able limit it later
- row_number() over (partition by c.table_catalog, c.table_schema, c.table_name order by c.ordinal_position asc, c.data_type DESC) as column_num,
- -- Getting the maximum shard for each table
- row_number() over (partition by c.table_catalog, c.table_schema, ifnull(REGEXP_EXTRACT(c.table_name, r'(.*)_\\d{{8}}$'), c.table_name), cfp.field_path order by c.table_catalog, c.table_schema asc, c.table_name desc) as shard_num
-from
- `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMNS c
- join `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS as cfp on cfp.table_name = c.table_name
- and cfp.column_name = c.column_name
- )
--- We filter column limit + 1 to make sure we warn about the limit being reached but not reading too much data
-where column_num <= {column_limit} and shard_num = 1
-ORDER BY
- table_catalog, table_schema, table_name, ordinal_position, column_num ASC, data_type DESC"""
-
- columns_for_table: str = """
-select
- c.table_catalog as table_catalog,
- c.table_schema as table_schema,
- c.table_name as table_name,
- c.column_name as column_name,
- c.ordinal_position as ordinal_position,
- cfp.field_path as field_path,
- c.is_nullable as is_nullable,
- CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type,
- c.is_hidden as is_hidden,
- c.is_partitioning_column as is_partitioning_column,
- c.clustering_ordinal_position as clustering_ordinal_position,
- description as comment
-from
- `{table_identifier.project_id}`.`{table_identifier.dataset}`.INFORMATION_SCHEMA.COLUMNS as c
- join `{table_identifier.project_id}`.`{table_identifier.dataset}`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS as cfp on cfp.table_name = c.table_name
- and cfp.column_name = c.column_name
-where
- c.table_name = '{table_identifier.table}'
-ORDER BY
- table_catalog, table_schema, table_name, ordinal_position ASC, data_type DESC"""
-
-
-class BigQueryDataDictionary:
- @staticmethod
- def get_query_result(conn: bigquery.Client, query: str) -> RowIterator:
+class BigQuerySchemaApi:
+ def __init__(
+ self, report: BigQuerySchemaApiPerfReport, client: bigquery.Client
+ ) -> None:
+ self.bq_client = client
+ self.report = report
+
+ def get_query_result(self, query: str) -> RowIterator:
logger.debug(f"Query : {query}")
- resp = conn.query(query)
+ resp = self.bq_client.query(query)
return resp.result()
- @staticmethod
- def get_projects(conn: bigquery.Client) -> List[BigqueryProject]:
- projects = conn.list_projects()
+ def get_projects(self) -> List[BigqueryProject]:
+ with self.report.list_projects:
+ try:
+ projects = self.bq_client.list_projects()
- return [
- BigqueryProject(id=p.project_id, name=p.friendly_name) for p in projects
- ]
+ return [
+ BigqueryProject(id=p.project_id, name=p.friendly_name)
+ for p in projects
+ ]
+ except Exception as e:
+ logger.error(f"Error getting projects. {e}", exc_info=True)
+ return []
- @staticmethod
def get_datasets_for_project_id(
- conn: bigquery.Client, project_id: str, maxResults: Optional[int] = None
+ self, project_id: str, maxResults: Optional[int] = None
) -> List[BigqueryDataset]:
- datasets = conn.list_datasets(project_id, max_results=maxResults)
- return [BigqueryDataset(name=d.dataset_id, labels=d.labels) for d in datasets]
+ with self.report.list_datasets:
+ datasets = self.bq_client.list_datasets(project_id, max_results=maxResults)
+ return [
+ BigqueryDataset(name=d.dataset_id, labels=d.labels) for d in datasets
+ ]
- @staticmethod
+ # This is not used anywhere
def get_datasets_for_project_id_with_information_schema(
- conn: bigquery.Client, project_id: str
+ self, project_id: str
) -> List[BigqueryDataset]:
"""
This method is not used as of now, due to below limitation.
@@ -383,8 +170,7 @@ def get_datasets_for_project_id_with_information_schema(
We'll need Region wise separate queries to fetch all datasets
https://cloud.google.com/bigquery/docs/information-schema-datasets-schemata
"""
- schemas = BigQueryDataDictionary.get_query_result(
- conn,
+ schemas = self.get_query_result(
BigqueryQuery.datasets_for_project_id.format(project_id=project_id),
)
return [
@@ -398,56 +184,67 @@ def get_datasets_for_project_id_with_information_schema(
for s in schemas
]
- @staticmethod
+ def list_tables(
+ self, dataset_name: str, project_id: str
+ ) -> Iterator[TableListItem]:
+ with self.report.list_tables as current_timer:
+ for table in self.bq_client.list_tables(f"{project_id}.{dataset_name}"):
+ with current_timer.pause():
+ yield table
+
def get_tables_for_dataset(
- conn: bigquery.Client,
+ self,
project_id: str,
dataset_name: str,
tables: Dict[str, TableListItem],
with_data_read_permission: bool = False,
report: Optional[BigQueryV2Report] = None,
) -> Iterator[BigqueryTable]:
- filter: str = ", ".join(f"'{table}'" for table in tables.keys())
-
- if with_data_read_permission:
- # Tables are ordered by name and table suffix to make sure we always process the latest sharded table
- # and skip the others. Sharded tables are tables with suffix _20220102
- cur = BigQueryDataDictionary.get_query_result(
- conn,
- BigqueryQuery.tables_for_dataset.format(
- project_id=project_id,
- dataset_name=dataset_name,
- table_filter=f" and t.table_name in ({filter})" if filter else "",
- ),
- )
- else:
- # Tables are ordered by name and table suffix to make sure we always process the latest sharded table
- # and skip the others. Sharded tables are tables with suffix _20220102
- cur = BigQueryDataDictionary.get_query_result(
- conn,
- BigqueryQuery.tables_for_dataset_without_partition_data.format(
- project_id=project_id,
- dataset_name=dataset_name,
- table_filter=f" and t.table_name in ({filter})" if filter else "",
- ),
- )
-
- for table in cur:
- try:
- yield BigQueryDataDictionary._make_bigquery_table(
- table, tables.get(table.table_name)
+ with self.report.get_tables_for_dataset as current_timer:
+ filter_clause: str = ", ".join(f"'{table}'" for table in tables.keys())
+
+ if with_data_read_permission:
+ # Tables are ordered by name and table suffix to make sure we always process the latest sharded table
+ # and skip the others. Sharded tables are tables with suffix _20220102
+ cur = self.get_query_result(
+ BigqueryQuery.tables_for_dataset.format(
+ project_id=project_id,
+ dataset_name=dataset_name,
+ table_filter=f" and t.table_name in ({filter_clause})"
+ if filter_clause
+ else "",
+ ),
)
- except Exception as e:
- table_name = f"{project_id}.{dataset_name}.{table.table_name}"
- logger.warning(
- f"Error while processing table {table_name}",
- exc_info=True,
+ else:
+ # Tables are ordered by name and table suffix to make sure we always process the latest sharded table
+ # and skip the others. Sharded tables are tables with suffix _20220102
+ cur = self.get_query_result(
+ BigqueryQuery.tables_for_dataset_without_partition_data.format(
+ project_id=project_id,
+ dataset_name=dataset_name,
+ table_filter=f" and t.table_name in ({filter_clause})"
+ if filter_clause
+ else "",
+ ),
)
- if report:
- report.report_warning(
- "metadata-extraction",
- f"Failed to get table {table_name}: {e}",
+
+ for table in cur:
+ try:
+ with current_timer.pause():
+ yield BigQuerySchemaApi._make_bigquery_table(
+ table, tables.get(table.table_name)
+ )
+ except Exception as e:
+ table_name = f"{project_id}.{dataset_name}.{table.table_name}"
+ logger.warning(
+ f"Error while processing table {table_name}",
+ exc_info=True,
)
+ if report:
+ report.report_warning(
+ "metadata-extraction",
+ f"Failed to get table {table_name}: {e}",
+ )
@staticmethod
def _make_bigquery_table(
@@ -487,43 +284,42 @@ def _make_bigquery_table(
long_term_billable_bytes=table.get("long_term_billable_bytes"),
)
- @staticmethod
def get_views_for_dataset(
- conn: bigquery.Client,
+ self,
project_id: str,
dataset_name: str,
has_data_read: bool,
report: Optional[BigQueryV2Report] = None,
) -> Iterator[BigqueryView]:
- if has_data_read:
- cur = BigQueryDataDictionary.get_query_result(
- conn,
- BigqueryQuery.views_for_dataset.format(
- project_id=project_id, dataset_name=dataset_name
- ),
- )
- else:
- cur = BigQueryDataDictionary.get_query_result(
- conn,
- BigqueryQuery.views_for_dataset_without_data_read.format(
- project_id=project_id, dataset_name=dataset_name
- ),
- )
-
- for table in cur:
- try:
- yield BigQueryDataDictionary._make_bigquery_view(table)
- except Exception as e:
- view_name = f"{project_id}.{dataset_name}.{table.table_name}"
- logger.warning(
- f"Error while processing view {view_name}",
- exc_info=True,
+ with self.report.get_views_for_dataset as current_timer:
+ if has_data_read:
+ cur = self.get_query_result(
+ BigqueryQuery.views_for_dataset.format(
+ project_id=project_id, dataset_name=dataset_name
+ ),
+ )
+ else:
+ cur = self.get_query_result(
+ BigqueryQuery.views_for_dataset_without_data_read.format(
+ project_id=project_id, dataset_name=dataset_name
+ ),
)
- if report:
- report.report_warning(
- "metadata-extraction",
- f"Failed to get view {view_name}: {e}",
+
+ for table in cur:
+ try:
+ with current_timer.pause():
+ yield BigQuerySchemaApi._make_bigquery_view(table)
+ except Exception as e:
+ view_name = f"{project_id}.{dataset_name}.{table.table_name}"
+ logger.warning(
+ f"Error while processing view {view_name}",
+ exc_info=True,
)
+ if report:
+ report.report_warning(
+ "metadata-extraction",
+ f"Failed to get view {view_name}: {e}",
+ )
@staticmethod
def _make_bigquery_view(view: bigquery.Row) -> BigqueryView:
@@ -540,70 +336,68 @@ def _make_bigquery_view(view: bigquery.Row) -> BigqueryView:
materialized=view.table_type == BigqueryTableType.MATERIALIZED_VIEW,
)
- @staticmethod
def get_columns_for_dataset(
- conn: bigquery.Client,
+ self,
project_id: str,
dataset_name: str,
column_limit: int,
run_optimized_column_query: bool = False,
) -> Optional[Dict[str, List[BigqueryColumn]]]:
columns: Dict[str, List[BigqueryColumn]] = defaultdict(list)
- try:
- cur = BigQueryDataDictionary.get_query_result(
- conn,
- BigqueryQuery.columns_for_dataset.format(
- project_id=project_id, dataset_name=dataset_name
- )
- if not run_optimized_column_query
- else BigqueryQuery.optimized_columns_for_dataset.format(
- project_id=project_id,
- dataset_name=dataset_name,
- column_limit=column_limit,
- ),
- )
- except Exception as e:
- logger.warning(f"Columns for dataset query failed with exception: {e}")
- # Error - Information schema query returned too much data.
- # Please repeat query with more selective predicates.
- return None
-
- last_seen_table: str = ""
- for column in cur:
- if (
- column_limit
- and column.table_name in columns
- and len(columns[column.table_name]) >= column_limit
- ):
- if last_seen_table != column.table_name:
- logger.warning(
- f"{project_id}.{dataset_name}.{column.table_name} contains more than {column_limit} columns, only processing {column_limit} columns"
- )
- last_seen_table = column.table_name
- else:
- columns[column.table_name].append(
- BigqueryColumn(
- name=column.column_name,
- ordinal_position=column.ordinal_position,
- field_path=column.field_path,
- is_nullable=column.is_nullable == "YES",
- data_type=column.data_type,
- comment=column.comment,
- is_partition_column=column.is_partitioning_column == "YES",
- cluster_column_position=column.clustering_ordinal_position,
+ with self.report.get_columns_for_dataset:
+ try:
+ cur = self.get_query_result(
+ BigqueryQuery.columns_for_dataset.format(
+ project_id=project_id, dataset_name=dataset_name
)
+ if not run_optimized_column_query
+ else BigqueryQuery.optimized_columns_for_dataset.format(
+ project_id=project_id,
+ dataset_name=dataset_name,
+ column_limit=column_limit,
+ ),
)
+ except Exception as e:
+ logger.warning(f"Columns for dataset query failed with exception: {e}")
+ # Error - Information schema query returned too much data.
+ # Please repeat query with more selective predicates.
+ return None
+
+ last_seen_table: str = ""
+ for column in cur:
+ if (
+ column_limit
+ and column.table_name in columns
+ and len(columns[column.table_name]) >= column_limit
+ ):
+ if last_seen_table != column.table_name:
+ logger.warning(
+ f"{project_id}.{dataset_name}.{column.table_name} contains more than {column_limit} columns, only processing {column_limit} columns"
+ )
+ last_seen_table = column.table_name
+ else:
+ columns[column.table_name].append(
+ BigqueryColumn(
+ name=column.column_name,
+ ordinal_position=column.ordinal_position,
+ field_path=column.field_path,
+ is_nullable=column.is_nullable == "YES",
+ data_type=column.data_type,
+ comment=column.comment,
+ is_partition_column=column.is_partitioning_column == "YES",
+ cluster_column_position=column.clustering_ordinal_position,
+ )
+ )
return columns
- @staticmethod
+ # This is not used anywhere
def get_columns_for_table(
- conn: bigquery.Client,
+ self,
table_identifier: BigqueryTableIdentifier,
column_limit: Optional[int],
) -> List[BigqueryColumn]:
- cur = BigQueryDataDictionary.get_query_result(
- conn,
+ cur = self.get_query_result(
BigqueryQuery.columns_for_table.format(table_identifier=table_identifier),
)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/common.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/common.py
index 4ff509858b87d0..e38ab07855b8be 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/common.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/common.py
@@ -1,39 +1,5 @@
-from typing import Any, Dict, Optional
-
-from google.cloud import bigquery
-from google.cloud.logging_v2.client import Client as GCPLoggingClient
-
-from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config
-
BQ_DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
BQ_DATE_SHARD_FORMAT = "%Y%m%d"
BQ_EXTERNAL_TABLE_URL_TEMPLATE = "https://console.cloud.google.com/bigquery?project={project}&ws=!1m5!1m4!4m3!1s{project}!2s{dataset}!3s{table}"
BQ_EXTERNAL_DATASET_URL_TEMPLATE = "https://console.cloud.google.com/bigquery?project={project}&ws=!1m4!1m3!3m2!1s{project}!2s{dataset}"
-
-
-def _make_gcp_logging_client(
- project_id: Optional[str] = None, extra_client_options: Dict[str, Any] = {}
-) -> GCPLoggingClient:
- # See https://github.com/googleapis/google-cloud-python/issues/2674 for
- # why we disable gRPC here.
- client_options = extra_client_options.copy()
- client_options["_use_grpc"] = False
- if project_id is not None:
- return GCPLoggingClient(**client_options, project=project_id)
- else:
- return GCPLoggingClient(**client_options)
-
-
-def get_bigquery_client(config: BigQueryV2Config) -> bigquery.Client:
- client_options = config.extra_client_options
- return bigquery.Client(config.project_on_behalf, **client_options)
-
-
-def get_sql_alchemy_url(config: BigQueryV2Config) -> str:
- if config.project_on_behalf:
- return f"bigquery://{config.project_on_behalf}"
- # When project_id is not set, we will attempt to detect the project ID
- # based on the credentials or environment variables.
- # See https://github.com/mxmzdlv/pybigquery#authentication.
- return "bigquery://"
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py
index 341952d95e7d71..98c8cbaf85eec5 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py
@@ -1,7 +1,6 @@
import collections
import itertools
import logging
-import textwrap
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import (
@@ -18,12 +17,12 @@
)
import humanfriendly
-from google.cloud.bigquery import Client as BigQueryClient
from google.cloud.datacatalog import lineage_v1
from google.cloud.logging_v2.client import Client as GCPLoggingClient
-from ratelimiter import RateLimiter
from datahub.emitter import mce_builder
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.bigquery_v2.bigquery_audit import (
AuditLogEntry,
BigQueryAuditMetadata,
@@ -32,13 +31,16 @@
QueryEvent,
ReadEvent,
)
+from datahub.ingestion.source.bigquery_v2.bigquery_audit_log_api import (
+ BigQueryAuditLogApi,
+)
from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config
from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report
-from datahub.ingestion.source.bigquery_v2.common import (
- BQ_DATE_SHARD_FORMAT,
- BQ_DATETIME_FORMAT,
- _make_gcp_logging_client,
- get_bigquery_client,
+from datahub.ingestion.source.bigquery_v2.bigquery_schema import BigQuerySchemaApi
+from datahub.ingestion.source.bigquery_v2.common import BQ_DATETIME_FORMAT
+from datahub.ingestion.source.bigquery_v2.queries import (
+ BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE,
+ bigquery_audit_metadata_query_template_lineage,
)
from datahub.ingestion.source.state.redundant_run_skip_handler import (
RedundantLineageRunSkipHandler,
@@ -52,7 +54,9 @@
UpstreamClass,
UpstreamLineageClass,
)
+from datahub.specific.dataset import DatasetPatchBuilder
from datahub.utilities import memory_footprint
+from datahub.utilities.file_backed_collections import FileBackedDict
from datahub.utilities.perf_timer import PerfTimer
from datahub.utilities.sqlglot_lineage import (
SchemaResolver,
@@ -194,49 +198,21 @@ def make_lineage_edges_from_parsing_result(
class BigqueryLineageExtractor:
- BQ_FILTER_RULE_TEMPLATE_V2 = """
-resource.type=("bigquery_project")
-AND
-(
- protoPayload.methodName=
- (
- "google.cloud.bigquery.v2.JobService.Query"
- OR
- "google.cloud.bigquery.v2.JobService.InsertJob"
- )
- AND
- protoPayload.metadata.jobChange.job.jobStatus.jobState="DONE"
- AND NOT protoPayload.metadata.jobChange.job.jobStatus.errorResult:*
- AND (
- protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables:*
- OR
- protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedViews:*
- )
- AND (
- protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/_.*/tables/anon.*"
- AND
- protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/.*/tables/INFORMATION_SCHEMA.*"
- AND
- protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/.*/tables/__TABLES__"
- AND
- protoPayload.metadata.jobChange.job.jobConfig.queryConfig.destinationTable !~ "projects/.*/datasets/_.*/tables/anon.*"
- )
-
-)
-AND
-timestamp >= "{start_time}"
-AND
-timestamp < "{end_time}"
-""".strip()
-
def __init__(
self,
config: BigQueryV2Config,
report: BigQueryV2Report,
+ dataset_urn_builder: Callable[[BigQueryTableRef], str],
redundant_run_skip_handler: Optional[RedundantLineageRunSkipHandler] = None,
):
self.config = config
self.report = report
+ self.dataset_urn_builder = dataset_urn_builder
+ self.audit_log_api = BigQueryAuditLogApi(
+ report.audit_log_api_perf,
+ self.config.rate_limit,
+ self.config.requests_per_min,
+ )
self.redundant_run_skip_handler = redundant_run_skip_handler
self.start_time, self.end_time = (
@@ -256,55 +232,205 @@ def error(self, log: logging.Logger, key: str, reason: str) -> None:
self.report.report_warning(key, reason)
log.error(f"{key} => {reason}")
- @staticmethod
- def bigquery_audit_metadata_query_template(
- dataset: str, use_date_sharded_tables: bool, limit: Optional[int] = None
- ) -> str:
- """
- Receives a dataset (with project specified) and returns a query template that is used to query exported
- AuditLogs containing protoPayloads of type BigQueryAuditMetadata.
- Include only those that:
- - have been completed (jobStatus.jobState = "DONE")
- - do not contain errors (jobStatus.errorResults is none)
- :param dataset: the dataset to query against in the form of $PROJECT.$DATASET
- :param use_date_sharded_tables: whether to read from date sharded audit log tables or time partitioned audit log
- tables
- :param limit: set a limit for the maximum event to return. It is used for connection testing currently
- :return: a query template, when supplied start_time and end_time, can be used to query audit logs from BigQuery
- """
- limit_text = f"limit {limit}" if limit else ""
+ def _should_ingest_lineage(self) -> bool:
+ """
+ Decide whether lineage extraction should run for this ingestion window.
+
+ Returns False, after reporting a "lineage-extraction" warning, when a
+ redundant-run skip handler is configured and it says the window
+ [config.start_time, config.end_time] should be skipped. Returns True otherwise.
+ """
+ if (
+ self.redundant_run_skip_handler
+ and self.redundant_run_skip_handler.should_skip_this_run(
+ cur_start_time=self.config.start_time,
+ cur_end_time=self.config.end_time,
+ )
+ ):
+ # Skip this run
+ self.report.report_warning(
+ "lineage-extraction",
+ "Skip this run as there was already a run for current ingestion window.",
+ )
+ return False
+
+ return True
+
+ def get_lineage_workunits(
+ self,
+ projects: List[str],
+ sql_parser_schema_resolver: SchemaResolver,
+ view_refs_by_project: Dict[str, Set[str]],
+ view_definitions: FileBackedDict[str],
+ table_refs: Set[str],
+ ) -> Iterable[MetadataWorkUnit]:
+ """
+ Yield lineage work units for `projects`.
+
+ Yields nothing when `_should_ingest_lineage()` returns False. When
+ `config.lineage_parse_view_ddl` is set, view lineage is first built by
+ SQL-parsing the view definitions of each project; those views are emitted
+ here and their keys are passed on so that audit-log lineage skips them.
+ Lineage is then generated per project; when exported audit metadata is
+ used, the project list is replaced by the single placeholder "*".
+ The processed window is recorded with the skip handler if one is configured.
+
+ NOTE(review): `view_refs_by_project[project]` is indexed directly, so it
+ presumably has an entry for every project - confirm against the caller.
+ """
+ if not self._should_ingest_lineage():
+ return
+ views_skip_audit_log_lineage: Set[str] = set()
+ if self.config.lineage_parse_view_ddl:
+ view_lineage: Dict[str, Set[LineageEdge]] = {}
+ for project in projects:
+ self.populate_view_lineage_with_sql_parsing(
+ view_lineage,
+ view_refs_by_project[project],
+ view_definitions,
+ sql_parser_schema_resolver,
+ project,
+ )
- shard_condition = ""
- if use_date_sharded_tables:
- from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access_*`"
- shard_condition = (
- """ AND _TABLE_SUFFIX BETWEEN "{start_date}" AND "{end_date}" """
+ views_skip_audit_log_lineage.update(view_lineage.keys())
+ for lineage_key in view_lineage.keys():
+ yield from self.gen_lineage_workunits_for_table(
+ view_lineage, BigQueryTableRef.from_string_name(lineage_key)
+ )
+
+ if self.config.use_exported_bigquery_audit_metadata:
+ projects = ["*"] # project_id not used when using exported metadata
+
+ for project in projects:
+ self.report.set_ingestion_stage(project, "Lineage Extraction")
+ yield from self.generate_lineage(
+ project,
+ sql_parser_schema_resolver,
+ views_skip_audit_log_lineage,
+ table_refs,
)
- else:
- from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access`"
-
- query = f"""
- SELECT
- timestamp,
- logName,
- insertId,
- protopayload_auditlog AS protoPayload,
- protopayload_auditlog.metadataJson AS metadata
- FROM
- {from_table}
- WHERE (
- timestamp >= "{{start_time}}"
- AND timestamp < "{{end_time}}"
+
+ if self.redundant_run_skip_handler:
+ # Update the checkpoint state for this run.
+ self.redundant_run_skip_handler.update_state(
+ self.config.start_time, self.config.end_time
)
- {shard_condition}
- AND protopayload_auditlog.serviceName="bigquery.googleapis.com"
- AND JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.jobState") = "DONE"
- AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.errorResults") IS NULL
- AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig") IS NOT NULL
- {limit_text};
- """
- return textwrap.dedent(query)
+ def generate_lineage(
+ self,
+ project_id: str,
+ sql_parser_schema_resolver: SchemaResolver,
+ views_skip_audit_log_lineage: Set[str],
+ table_refs: Set[str],
+ ) -> Iterable[MetadataWorkUnit]:
+ """
+ Build the lineage map for `project_id` and yield work units from it.
+
+ The map comes from `lineage_via_catalog_lineage_api` when
+ `config.extract_lineage_from_catalog` is set, otherwise from the parsed
+ audit log events fed through `_create_lineage_map`. Any exception is
+ reported through `self.error` (and the project is recorded in
+ `report.lineage_failed_extraction`), and the lineage is treated as empty.
+ Entry count, elapsed seconds and memory size are stored on the report.
+
+ Keys missing from `table_refs`, or present in
+ `views_skip_audit_log_lineage`, produce no work units.
+ """
+ logger.info(f"Generate lineage for {project_id}")
+ with PerfTimer() as timer:
+ try:
+ if self.config.extract_lineage_from_catalog:
+ lineage = self.lineage_via_catalog_lineage_api(project_id)
+ else:
+ events = self._get_parsed_audit_log_events(project_id)
+ lineage = self._create_lineage_map(
+ events, sql_parser_schema_resolver
+ )
+ except Exception as e:
+ if project_id:
+ self.report.lineage_failed_extraction.append(project_id)
+ self.error(
+ logger,
+ "lineage",
+ f"{project_id}: {e}",
+ )
+ lineage = {}
+
+ self.report.lineage_metadata_entries[project_id] = len(lineage)
+ logger.info(f"Built lineage map containing {len(lineage)} entries.")
+ logger.debug(f"lineage metadata is {lineage}")
+ self.report.lineage_extraction_sec[project_id] = round(
+ timer.elapsed_seconds(), 2
+ )
+ self.report.lineage_mem_size[project_id] = humanfriendly.format_size(
+ memory_footprint.total_size(lineage)
+ )
+
+ for lineage_key in lineage.keys():
+ # For views, we do not use the upstreams obtained by parsing audit logs
+ # as they may contain indirectly referenced tables.
+ if (
+ lineage_key not in table_refs
+ or lineage_key in views_skip_audit_log_lineage
+ ):
+ continue
+
+ yield from self.gen_lineage_workunits_for_table(
+ lineage, BigQueryTableRef.from_string_name(lineage_key)
+ )
+
+ def populate_view_lineage_with_sql_parsing(
+ self,
+ view_lineage: Dict[str, Set[LineageEdge]],
+ view_refs: Set[str],
+ view_definitions: FileBackedDict[str],
+ sql_parser_schema_resolver: SchemaResolver,
+ default_project: str,
+ ) -> None:
+ """
+ Fill `view_lineage` (in place) with edges parsed from view definitions.
+
+ Each view in `view_refs` has its definition looked up in
+ `view_definitions` and parsed with `sqlglot_lineage`, using
+ `default_project` as the default database. A table-level parsing error
+ is counted, recorded on the report, and the view is skipped. A
+ column-level error is counted and recorded, but the view is not skipped.
+ Stored edges use lineage type VIEW and the current UTC time as audit stamp.
+ """
+ for view in view_refs:
+ view_definition = view_definitions[view]
+ raw_view_lineage = sqlglot_lineage(
+ view_definition,
+ schema_resolver=sql_parser_schema_resolver,
+ default_db=default_project,
+ )
+ if raw_view_lineage.debug_info.table_error:
+ logger.debug(
+ f"Failed to parse lineage for view {view}: {raw_view_lineage.debug_info.table_error}"
+ )
+ self.report.num_view_definitions_failed_parsing += 1
+ self.report.view_definitions_parsing_failures.append(
+ f"Table-level sql parsing error for view {view}: {raw_view_lineage.debug_info.table_error}"
+ )
+ continue
+ elif raw_view_lineage.debug_info.column_error:
+ self.report.num_view_definitions_failed_column_parsing += 1
+ self.report.view_definitions_parsing_failures.append(
+ f"Column-level sql parsing error for view {view}: {raw_view_lineage.debug_info.column_error}"
+ )
+ else:
+ self.report.num_view_definitions_parsed += 1
+
+ ts = datetime.now(timezone.utc)
+ view_lineage[view] = set(
+ make_lineage_edges_from_parsing_result(
+ raw_view_lineage,
+ audit_stamp=ts,
+ lineage_type=DatasetLineageTypeClass.VIEW,
+ )
+ )
+
+ def gen_lineage_workunits_for_table(
+ self, lineage: Dict[str, Set[LineageEdge]], table_ref: BigQueryTableRef
+ ) -> Iterable[MetadataWorkUnit]:
+ """
+ Yield the lineage work units for a single table.
+
+ The dataset urn is produced by `self.dataset_urn_builder`; the upstream
+ lineage aspect is computed by `get_lineage_for_table` from `lineage`.
+ Nothing is yielded when that aspect is falsy.
+ """
+ dataset_urn = self.dataset_urn_builder(table_ref)
+
+ lineage_info = self.get_lineage_for_table(
+ bq_table=table_ref,
+ bq_table_urn=dataset_urn,
+ lineage_metadata=lineage,
+ )
+ if lineage_info:
+ yield from self.gen_lineage(dataset_urn, lineage_info)
+
+ def gen_lineage(
+ self,
+ dataset_urn: str,
+ upstream_lineage: Optional[UpstreamLineageClass] = None,
+ ) -> Iterable[MetadataWorkUnit]:
+ """
+ Yield the work units that publish `upstream_lineage` for `dataset_urn`.
+
+ Yields nothing when `upstream_lineage` is None. With
+ `config.incremental_lineage` set, every upstream is added to a
+ DatasetPatchBuilder and one work unit is emitted per built proposal.
+ Otherwise the aspect is emitted as a single work unit, with fine-grained
+ lineage cleared unless `config.extract_column_lineage` is set.
+ """
+ if upstream_lineage is None:
+ return
+
+ # upstream_lineage is known to be set from here on; no second None check needed.
+ if self.config.incremental_lineage:
+ patch_builder: DatasetPatchBuilder = DatasetPatchBuilder(
+ urn=dataset_urn
+ )
+ for upstream in upstream_lineage.upstreams:
+ patch_builder.add_upstream_lineage(upstream)
+
+ yield from [
+ MetadataWorkUnit(
+ id=f"upstreamLineage-for-{dataset_urn}",
+ mcp_raw=mcp,
+ )
+ for mcp in patch_builder.build()
+ ]
+ else:
+ if not self.config.extract_column_lineage:
+ upstream_lineage.fineGrainedLineages = None
+
+ yield from [
+ MetadataChangeProposalWrapper(
+ entityUrn=dataset_urn, aspect=upstream_lineage
+ ).as_workunit()
+ ]
def lineage_via_catalog_lineage_api(
self, project_id: str
@@ -328,22 +454,28 @@ def lineage_via_catalog_lineage_api(
try:
lineage_client: lineage_v1.LineageClient = lineage_v1.LineageClient()
- bigquery_client: BigQueryClient = get_bigquery_client(self.config)
+
+ data_dictionary = BigQuerySchemaApi(
+ self.report.schema_api_perf, self.config.get_bigquery_client()
+ )
+
# Filtering datasets
- datasets = list(bigquery_client.list_datasets(project_id))
+ datasets = list(data_dictionary.get_datasets_for_project_id(project_id))
project_tables = []
for dataset in datasets:
# Enables only tables where type is TABLE, VIEW or MATERIALIZED_VIEW (not EXTERNAL)
project_tables.extend(
[
table
- for table in bigquery_client.list_tables(dataset.dataset_id)
+ for table in data_dictionary.list_tables(
+ dataset.name, project_id
+ )
if table.table_type in ["TABLE", "VIEW", "MATERIALIZED_VIEW"]
]
)
# Convert project tables to .. format
- project_tables = list(
+ project_table_names = list(
map(
lambda table: "{}.{}.{}".format(
table.project, table.dataset_id, table.table_id
@@ -354,7 +486,7 @@ def lineage_via_catalog_lineage_api(
lineage_map: Dict[str, Set[LineageEdge]] = {}
curr_date = datetime.now()
- for table in project_tables:
+ for table in project_table_names:
logger.info("Creating lineage map for table %s", table)
upstreams = set()
downstream_table = lineage_v1.EntityReference()
@@ -411,127 +543,73 @@ def lineage_via_catalog_lineage_api(
raise e
def _get_parsed_audit_log_events(self, project_id: str) -> Iterable[QueryEvent]:
+ """
+ Yield the events parsed from the audit log entries of `project_id`.
+
+ Entries are read from exported audit metadata when
+ `config.use_exported_bigquery_audit_metadata` is set, otherwise from GCP
+ logging. Entries that fail to parse are logged and counted, not raised.
+ """
+ # Widen the window by max_query_duration on both sides, so that the join
+ # between query events and read events is complete. For example, this helps us
+ # handle the case where the read happens within our time range but the query
+ # completion event is delayed and happens after the configured end time.
+ corrected_start_time = self.start_time - self.config.max_query_duration
+ corrected_end_time = self.end_time + self.config.max_query_duration
+ self.report.log_entry_start_time = corrected_start_time
+ self.report.log_entry_end_time = corrected_end_time
+
parse_fn: Callable[[Any], Optional[Union[ReadEvent, QueryEvent]]]
if self.config.use_exported_bigquery_audit_metadata:
- logger.info("Populating lineage info via exported GCP audit logs")
- bq_client = get_bigquery_client(self.config)
- entries = self._get_exported_bigquery_audit_metadata(bq_client)
+ entries = self.get_exported_log_entries(
+ corrected_start_time, corrected_end_time
+ )
parse_fn = self._parse_exported_bigquery_audit_metadata
else:
- logger.info("Populating lineage info via exported GCP audit logs")
- logging_client = _make_gcp_logging_client(project_id)
- entries = self._get_bigquery_log_entries(logging_client)
+ entries = self.get_log_entries_via_gcp_logging(
+ project_id, corrected_start_time, corrected_end_time
+ )
parse_fn = self._parse_bigquery_log_entries
for entry in entries:
- self.report.num_total_log_entries[project_id] += 1
+ self.report.num_lineage_total_log_entries[project_id] += 1
try:
event = parse_fn(entry)
if event:
- self.report.num_parsed_log_entries[project_id] += 1
+ self.report.num_lineage_parsed_log_entries[project_id] += 1
yield event
except Exception as e:
logger.warning(f"Unable to parse log entry `{entry}`: {e}")
self.report.num_lineage_log_parse_failures[project_id] += 1
- def _get_bigquery_log_entries(
- self, client: GCPLoggingClient, limit: Optional[int] = None
- ) -> Iterable[AuditLogEntry]:
- self.report.num_total_log_entries[client.project] = 0
- # Add a buffer to start and end time to account for delays in logging events.
- start_time = (self.start_time - self.config.max_query_duration).strftime(
- BQ_DATETIME_FORMAT
- )
- self.report.log_entry_start_time = start_time
-
- end_time = (self.config.end_time + self.config.max_query_duration).strftime(
- BQ_DATETIME_FORMAT
- )
- self.report.log_entry_end_time = end_time
-
- filter = self.BQ_FILTER_RULE_TEMPLATE_V2.format(
- start_time=start_time,
- end_time=end_time,
- )
-
- logger.info(
- f"Start loading log entries from BigQuery for {client.project} with start_time={start_time} and end_time={end_time}"
+ def get_exported_log_entries(
+ self, corrected_start_time, corrected_end_time, limit=None
+ ):
+ """
+ Return audit log entries read from the exported BigQuery audit metadata.
+
+ Obtains a BigQuery client from the config and delegates to
+ `audit_log_api.get_exported_bigquery_audit_metadata` with the lineage query
+ template, the configured audit datasets and the given time window.
+ `limit` is forwarded unchanged.
+ """
+ logger.info("Populating lineage info via exported GCP audit logs")
+ bq_client = self.config.get_bigquery_client()
+ entries = self.audit_log_api.get_exported_bigquery_audit_metadata(
+ bigquery_client=bq_client,
+ bigquery_audit_metadata_query_template=bigquery_audit_metadata_query_template_lineage,
+ bigquery_audit_metadata_datasets=self.config.bigquery_audit_metadata_datasets,
+ use_date_sharded_audit_log_tables=self.config.use_date_sharded_audit_log_tables,
+ start_time=corrected_start_time,
+ end_time=corrected_end_time,
+ limit=limit,
)
+ return entries
- if self.config.rate_limit:
- with RateLimiter(max_calls=self.config.requests_per_min, period=60):
- entries = client.list_entries(
- filter_=filter,
- page_size=self.config.log_page_size,
- max_results=limit,
- )
- else:
- entries = client.list_entries(
- filter_=filter, page_size=self.config.log_page_size, max_results=limit
- )
+ def get_log_entries_via_gcp_logging(
+ self, project_id, corrected_start_time, corrected_end_time
+ ):
+ """
+ Return audit log entries for `project_id` read through GCP logging.
+
+ Builds a logging client from the config and delegates to
+ `audit_log_api.get_bigquery_log_entries_via_gcp_logging` with the lineage
+ filter rendered for the given window and the configured page size.
+ """
+ # This is the non-exported path, so the message must not say "exported".
+ logger.info("Populating lineage info via GCP audit logs")
+ logging_client = self.config.make_gcp_logging_client(project_id)
logger.info(
- f"Start iterating over log entries from BigQuery for {client.project}"
+ f"Start loading log entries from BigQuery for {project_id} "
+ f"with start_time={corrected_start_time} and end_time={corrected_end_time}"
)
- for entry in entries:
- self.report.num_total_log_entries[client.project] += 1
- if self.report.num_total_log_entries[client.project] % 1000 == 0:
- logger.info(
- f"{self.report.num_total_log_entries[client.project]} log entries loaded for project {client.project} so far..."
- )
- yield entry
-
- logger.info(
- f"Finished loading {self.report.num_total_log_entries[client.project]} log entries from BigQuery project {client.project} so far"
+ entries = self.audit_log_api.get_bigquery_log_entries_via_gcp_logging(
+ logging_client,
+ BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE.format(
+ start_time=corrected_start_time.strftime(BQ_DATETIME_FORMAT),
+ end_time=corrected_end_time.strftime(BQ_DATETIME_FORMAT),
+ ),
+ self.config.log_page_size,
)
-
- def _get_exported_bigquery_audit_metadata(
- self, bigquery_client: BigQueryClient, limit: Optional[int] = None
- ) -> Iterable[BigQueryAuditMetadata]:
- if self.config.bigquery_audit_metadata_datasets is None:
- self.error(
- logger, "audit-metadata", "bigquery_audit_metadata_datasets not set"
- )
- self.report.bigquery_audit_metadata_datasets_missing = True
- return
-
- corrected_start_time = self.start_time - self.config.max_query_duration
- start_time = corrected_start_time.strftime(BQ_DATETIME_FORMAT)
- start_date = corrected_start_time.strftime(BQ_DATE_SHARD_FORMAT)
- self.report.audit_start_time = start_time
-
- corrected_end_time = self.end_time + self.config.max_query_duration
- end_time = corrected_end_time.strftime(BQ_DATETIME_FORMAT)
- end_date = corrected_end_time.strftime(BQ_DATE_SHARD_FORMAT)
- self.report.audit_end_time = end_time
-
- for dataset in self.config.bigquery_audit_metadata_datasets:
- logger.info(
- f"Start loading log entries from BigQueryAuditMetadata in {dataset}"
- )
-
- query: str = self.bigquery_audit_metadata_query_template(
- dataset=dataset,
- use_date_sharded_tables=self.config.use_date_sharded_audit_log_tables,
- limit=limit,
- ).format(
- start_time=start_time,
- end_time=end_time,
- start_date=start_date,
- end_date=end_date,
- )
-
- query_job = bigquery_client.query(query)
-
- logger.info(
- f"Finished loading log entries from BigQueryAuditMetadata in {dataset}"
- )
-
- if self.config.rate_limit:
- with RateLimiter(max_calls=self.config.requests_per_min, period=60):
- yield from query_job
- else:
- yield from query_job
+ return entries
# Currently we only parse JobCompleted events but in future we would want to parse other
# events to also create field level lineage.
@@ -674,39 +752,6 @@ def _create_lineage_map(
logger.info("Exiting create lineage map function")
return lineage_map
- def _compute_bigquery_lineage(
- self,
- project_id: str,
- sql_parser_schema_resolver: SchemaResolver,
- ) -> Dict[str, Set[LineageEdge]]:
- lineage_metadata: Dict[str, Set[LineageEdge]]
- try:
- if self.config.extract_lineage_from_catalog:
- lineage_metadata = self.lineage_via_catalog_lineage_api(project_id)
- else:
- events = self._get_parsed_audit_log_events(project_id)
- lineage_metadata = self._create_lineage_map(
- events, sql_parser_schema_resolver
- )
- except Exception as e:
- if project_id:
- self.report.lineage_failed_extraction.append(project_id)
- self.error(
- logger,
- "lineage",
- f"{project_id}: {e}",
- )
- self.report_status(f"{project_id}-lineage", False)
- lineage_metadata = {}
-
- self.report.lineage_mem_size[project_id] = humanfriendly.format_size(
- memory_footprint.total_size(lineage_metadata)
- )
- self.report.lineage_metadata_entries[project_id] = len(lineage_metadata)
- logger.info(f"Built lineage map containing {len(lineage_metadata)} entries.")
- logger.debug(f"lineage metadata is {lineage_metadata}")
- return lineage_metadata
-
def get_upstream_tables(
self,
bq_table: BigQueryTableRef,
@@ -767,28 +812,11 @@ def get_upstream_tables(
return set(upstreams.values())
- def calculate_lineage_for_project(
- self,
- project_id: str,
- sql_parser_schema_resolver: SchemaResolver,
- ) -> Dict[str, Set[LineageEdge]]:
- with PerfTimer() as timer:
- lineage = self._compute_bigquery_lineage(
- project_id, sql_parser_schema_resolver
- )
-
- self.report.lineage_extraction_sec[project_id] = round(
- timer.elapsed_seconds(), 2
- )
-
- return lineage
-
def get_lineage_for_table(
self,
bq_table: BigQueryTableRef,
bq_table_urn: str,
lineage_metadata: Dict[str, Set[LineageEdge]],
- platform: str,
) -> Optional[UpstreamLineageClass]:
upstream_list: List[UpstreamClass] = []
fine_grained_lineages: List[FineGrainedLineageClass] = []
@@ -796,12 +824,7 @@ def get_lineage_for_table(
# even if the lineage is same but the order is different.
for upstream in sorted(self.get_upstream_tables(bq_table, lineage_metadata)):
upstream_table = BigQueryTableRef.from_string_name(upstream.table)
- upstream_table_urn = mce_builder.make_dataset_urn_with_platform_instance(
- platform,
- upstream_table.table_identifier.get_table_name(),
- self.config.platform_instance,
- self.config.env,
- )
+ upstream_table_urn = self.dataset_urn_builder(upstream_table)
# Generate table-level lineage.
upstream_table_class = UpstreamClass(
@@ -852,19 +875,27 @@ def get_lineage_for_table(
def test_capability(self, project_id: str) -> None:
+ """
+ Connectivity check: fetch at most one audit log entry for the configured
+ window, from exported audit metadata or from GCP logging, and debug-log it.
+ """
if self.config.use_exported_bigquery_audit_metadata:
- bigquery_client: BigQueryClient = BigQueryClient(project=project_id)
- entries = self._get_exported_bigquery_audit_metadata(
- bigquery_client=bigquery_client, limit=1
- )
- for entry in entries:
+ for entry in self.get_exported_log_entries(
+ self.start_time,
+ self.end_time,
+ limit=1,
+ ):
logger.debug(
f"Connection test got one exported_bigquery_audit_metadata {entry}"
)
else:
- gcp_logging_client: GCPLoggingClient = _make_gcp_logging_client(
- project_id, self.config.extra_client_options
+ gcp_logging_client: GCPLoggingClient = self.config.make_gcp_logging_client(
+ project_id
)
- for entry in self._get_bigquery_log_entries(gcp_logging_client, limit=1):
+ for entry in self.audit_log_api.get_bigquery_log_entries_via_gcp_logging(
+ gcp_logging_client,
+ # The template uses named fields, so the values must be passed by keyword.
+ filter=BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE.format(
+ start_time=self.start_time.strftime(BQ_DATETIME_FORMAT),
+ end_time=self.end_time.strftime(BQ_DATETIME_FORMAT),
+ ),
+ log_page_size=self.config.log_page_size,
+ limit=1,
+ ):
logger.debug(f"Connection test got one audit metadata entry {entry}")
def report_status(self, step: str, status: bool) -> None:
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py
new file mode 100644
index 00000000000000..5be7a0a7f6b2f3
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py
@@ -0,0 +1,426 @@
+import textwrap
+from typing import Optional
+
+
+class BigqueryTableType:
+ """String constants for `table_type` values; several are interpolated into the query templates in this module."""
+
+ # See https://cloud.google.com/bigquery/docs/information-schema-tables#schema
+ BASE_TABLE = "BASE TABLE"
+ EXTERNAL = "EXTERNAL"
+ VIEW = "VIEW"
+ MATERIALIZED_VIEW = "MATERIALIZED VIEW"
+ CLONE = "CLONE"
+ SNAPSHOT = "SNAPSHOT"
+
+
+class BigqueryQuery:
+ show_datasets: str = (
+ "select schema_name from `{project_id}`.INFORMATION_SCHEMA.SCHEMATA"
+ )
+
+ datasets_for_project_id: str = """
+select
+ s.CATALOG_NAME as catalog_name,
+ s.schema_name as table_schema,
+ s.location as location,
+ s.CREATION_TIME as created,
+ s.LAST_MODIFIED_TIME as last_altered,
+ o.OPTION_VALUE as comment
+from
+ `{project_id}`.INFORMATION_SCHEMA.SCHEMATA as s
+ left join `{project_id}`.INFORMATION_SCHEMA.SCHEMATA_OPTIONS as o on o.schema_name = s.schema_name
+ and o.option_name = "description"
+order by
+ s.schema_name
+"""
+
+ # https://cloud.google.com/bigquery/docs/information-schema-table-storage?hl=en
+ tables_for_dataset = f"""
+SELECT
+ t.table_catalog as table_catalog,
+ t.table_schema as table_schema,
+ t.table_name as table_name,
+ t.table_type as table_type,
+ t.creation_time as created,
+ ts.last_modified_time as last_altered,
+ tos.OPTION_VALUE as comment,
+ is_insertable_into,
+ ddl,
+ row_count,
+ size_bytes as bytes,
+ num_partitions,
+ max_partition_id,
+ active_billable_bytes,
+ long_term_billable_bytes,
+ REGEXP_EXTRACT(t.table_name, r".*_(\\d+)$") as table_suffix,
+ REGEXP_REPLACE(t.table_name, r"_(\\d+)$", "") as table_base
+
+FROM
+ `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t
+ join `{{project_id}}`.`{{dataset_name}}`.__TABLES__ as ts on ts.table_id = t.TABLE_NAME
+ left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema
+ and t.TABLE_NAME = tos.TABLE_NAME
+ and tos.OPTION_NAME = "description"
+ left join (
+ select
+ table_name,
+ sum(case when partition_id not in ('__NULL__', '__UNPARTITIONED__', '__STREAMING_UNPARTITIONED__') then 1 else 0 END) as num_partitions,
+ max(case when partition_id not in ('__NULL__', '__UNPARTITIONED__', '__STREAMING_UNPARTITIONED__') then partition_id else NULL END) as max_partition_id,
+ sum(total_rows) as total_rows,
+ sum(case when storage_tier = 'LONG_TERM' then total_billable_bytes else 0 end) as long_term_billable_bytes,
+ sum(case when storage_tier = 'ACTIVE' then total_billable_bytes else 0 end) as active_billable_bytes,
+ from
+ `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.PARTITIONS
+ group by
+ table_name) as p on
+ t.table_name = p.table_name
+WHERE
+ table_type in ('{BigqueryTableType.BASE_TABLE}', '{BigqueryTableType.EXTERNAL}')
+{{table_filter}}
+order by
+ table_schema ASC,
+ table_base ASC,
+ table_suffix DESC
+"""
+
+ tables_for_dataset_without_partition_data = f"""
+SELECT
+ t.table_catalog as table_catalog,
+ t.table_schema as table_schema,
+ t.table_name as table_name,
+ t.table_type as table_type,
+ t.creation_time as created,
+ tos.OPTION_VALUE as comment,
+ is_insertable_into,
+ ddl,
+ REGEXP_EXTRACT(t.table_name, r".*_(\\d+)$") as table_suffix,
+ REGEXP_REPLACE(t.table_name, r"_(\\d+)$", "") as table_base
+
+FROM
+ `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t
+ left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema
+ and t.TABLE_NAME = tos.TABLE_NAME
+ and tos.OPTION_NAME = "description"
+WHERE
+ table_type in ('{BigqueryTableType.BASE_TABLE}', '{BigqueryTableType.EXTERNAL}')
+{{table_filter}}
+order by
+ table_schema ASC,
+ table_base ASC,
+ table_suffix DESC
+"""
+
+ views_for_dataset: str = f"""
+SELECT
+ t.table_catalog as table_catalog,
+ t.table_schema as table_schema,
+ t.table_name as table_name,
+ t.table_type as table_type,
+ t.creation_time as created,
+ ts.last_modified_time as last_altered,
+ tos.OPTION_VALUE as comment,
+ is_insertable_into,
+ ddl as view_definition,
+ row_count,
+ size_bytes
+FROM
+ `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t
+ join `{{project_id}}`.`{{dataset_name}}`.__TABLES__ as ts on ts.table_id = t.TABLE_NAME
+ left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema
+ and t.TABLE_NAME = tos.TABLE_NAME
+ and tos.OPTION_NAME = "description"
+WHERE
+ table_type in ('{BigqueryTableType.VIEW}', '{BigqueryTableType.MATERIALIZED_VIEW}')
+order by
+ table_schema ASC,
+ table_name ASC
+"""
+
+ views_for_dataset_without_data_read: str = f"""
+SELECT
+ t.table_catalog as table_catalog,
+ t.table_schema as table_schema,
+ t.table_name as table_name,
+ t.table_type as table_type,
+ t.creation_time as created,
+ tos.OPTION_VALUE as comment,
+ is_insertable_into,
+ ddl as view_definition
+FROM
+ `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t
+ left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema
+ and t.TABLE_NAME = tos.TABLE_NAME
+ and tos.OPTION_NAME = "description"
+WHERE
+ table_type in ('{BigqueryTableType.VIEW}', '{BigqueryTableType.MATERIALIZED_VIEW}')
+order by
+ table_schema ASC,
+ table_name ASC
+"""
+
+ columns_for_dataset: str = """
+select
+ c.table_catalog as table_catalog,
+ c.table_schema as table_schema,
+ c.table_name as table_name,
+ c.column_name as column_name,
+ c.ordinal_position as ordinal_position,
+ cfp.field_path as field_path,
+ c.is_nullable as is_nullable,
+ CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type,
+ description as comment,
+ c.is_hidden as is_hidden,
+ c.is_partitioning_column as is_partitioning_column,
+ c.clustering_ordinal_position as clustering_ordinal_position,
+from
+ `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMNS c
+ join `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS as cfp on cfp.table_name = c.table_name
+ and cfp.column_name = c.column_name
+ORDER BY
+ table_catalog, table_schema, table_name, ordinal_position ASC, data_type DESC"""
+
+ optimized_columns_for_dataset: str = """
+select * from
+(select
+ c.table_catalog as table_catalog,
+ c.table_schema as table_schema,
+ c.table_name as table_name,
+ c.column_name as column_name,
+ c.ordinal_position as ordinal_position,
+ cfp.field_path as field_path,
+ c.is_nullable as is_nullable,
+ CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type,
+ description as comment,
+ c.is_hidden as is_hidden,
+ c.is_partitioning_column as is_partitioning_column,
+ c.clustering_ordinal_position as clustering_ordinal_position,
+ -- We count the columns to be able limit it later
+ row_number() over (partition by c.table_catalog, c.table_schema, c.table_name order by c.ordinal_position asc, c.data_type DESC) as column_num,
+ -- Getting the maximum shard for each table
+ row_number() over (partition by c.table_catalog, c.table_schema, ifnull(REGEXP_EXTRACT(c.table_name, r'(.*)_\\d{{8}}$'), c.table_name), cfp.field_path order by c.table_catalog, c.table_schema asc, c.table_name desc) as shard_num
+from
+ `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMNS c
+ join `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS as cfp on cfp.table_name = c.table_name
+ and cfp.column_name = c.column_name
+ )
+-- We filter column limit + 1 to make sure we warn about the limit being reached but not reading too much data
+where column_num <= {column_limit} and shard_num = 1
+ORDER BY
+ table_catalog, table_schema, table_name, ordinal_position, column_num ASC, data_type DESC"""
+
+ columns_for_table: str = """
+select
+ c.table_catalog as table_catalog,
+ c.table_schema as table_schema,
+ c.table_name as table_name,
+ c.column_name as column_name,
+ c.ordinal_position as ordinal_position,
+ cfp.field_path as field_path,
+ c.is_nullable as is_nullable,
+ CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type,
+ c.is_hidden as is_hidden,
+ c.is_partitioning_column as is_partitioning_column,
+ c.clustering_ordinal_position as clustering_ordinal_position,
+ description as comment
+from
+ `{table_identifier.project_id}`.`{table_identifier.dataset}`.INFORMATION_SCHEMA.COLUMNS as c
+ join `{table_identifier.project_id}`.`{table_identifier.dataset}`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS as cfp on cfp.table_name = c.table_name
+ and cfp.column_name = c.column_name
+where
+ c.table_name = '{table_identifier.table}'
+ORDER BY
+ table_catalog, table_schema, table_name, ordinal_position ASC, data_type DESC"""
+
+
+BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE = """
+resource.type=("bigquery_project")
+AND
+(
+ protoPayload.methodName=
+ (
+ "google.cloud.bigquery.v2.JobService.Query"
+ OR
+ "google.cloud.bigquery.v2.JobService.InsertJob"
+ )
+ AND
+ protoPayload.metadata.jobChange.job.jobStatus.jobState="DONE"
+ AND NOT protoPayload.metadata.jobChange.job.jobStatus.errorResult:*
+ AND (
+ protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables:*
+ OR
+ protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedViews:*
+ )
+ AND (
+ protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/_.*/tables/anon.*"
+ AND
+ protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/.*/tables/INFORMATION_SCHEMA.*"
+ AND
+ protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/.*/tables/__TABLES__"
+ AND
+ protoPayload.metadata.jobChange.job.jobConfig.queryConfig.destinationTable !~ "projects/.*/datasets/_.*/tables/anon.*"
+ )
+
+)
+AND
+timestamp >= "{start_time}"
+AND
+timestamp < "{end_time}"
+""".strip()
+BQ_FILTER_RULE_TEMPLATE_V2_USAGE = """
+resource.type=("bigquery_project" OR "bigquery_dataset")
+AND
+timestamp >= "{start_time}"
+AND
+timestamp < "{end_time}"
+AND protoPayload.serviceName="bigquery.googleapis.com"
+AND
+(
+ (
+ protoPayload.methodName=
+ (
+ "google.cloud.bigquery.v2.JobService.Query"
+ OR
+ "google.cloud.bigquery.v2.JobService.InsertJob"
+ )
+ AND protoPayload.metadata.jobChange.job.jobStatus.jobState="DONE"
+ AND NOT protoPayload.metadata.jobChange.job.jobStatus.errorResult:*
+ AND protoPayload.metadata.jobChange.job.jobConfig.queryConfig:*
+ AND
+ (
+ (
+ protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables:*
+ AND NOT protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables =~ "projects/.*/datasets/.*/tables/__TABLES__|__TABLES_SUMMARY__|INFORMATION_SCHEMA.*"
+ )
+ OR
+ (
+ protoPayload.metadata.jobChange.job.jobConfig.queryConfig.destinationTable:*
+ )
+ )
+ )
+ OR
+ protoPayload.metadata.tableDataRead.reason = "JOB"
+)
+""".strip(
+ "\t \n"
+)
+
+
+def bigquery_audit_metadata_query_template_lineage(
+ dataset: str, use_date_sharded_tables: bool, limit: Optional[int] = None
+) -> str:
+ """
+ Receives a dataset (with project specified) and returns a query template that is used to query exported
+ AuditLogs containing protoPayloads of type BigQueryAuditMetadata.
+ Include only those that:
+ - have been completed (jobStatus.jobState = "DONE")
+ - do not contain errors (jobStatus.errorResults is none) and have a jobConfig.queryConfig
+ :param dataset: the dataset to query against in the form of $PROJECT.$DATASET
+ :param use_date_sharded_tables: whether to read from date sharded audit log tables or time partitioned audit log
+ tables
+ :param limit: maximum number of events to return (no limit clause is emitted when None or 0). It is used for connection testing currently
+ :return: a query template that, when supplied start_time and end_time (plus start_date and end_date for date sharded tables), can be used to query audit logs from BigQuery
+ """
+ limit_text = f"limit {limit}" if limit else ""
+
+ shard_condition = ""
+ if use_date_sharded_tables:
+ from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access_*`"
+ shard_condition = (
+ """ AND _TABLE_SUFFIX BETWEEN "{start_date}" AND "{end_date}" """
+ )
+ else:
+ from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access`"
+
+ query = f"""
+ SELECT
+ timestamp,
+ logName,
+ insertId,
+ protopayload_auditlog AS protoPayload,
+ protopayload_auditlog.metadataJson AS metadata
+ FROM
+ {from_table}
+ WHERE (
+ timestamp >= "{{start_time}}"
+ AND timestamp < "{{end_time}}"
+ )
+ {shard_condition}
+ AND protopayload_auditlog.serviceName="bigquery.googleapis.com"
+ AND JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.jobState") = "DONE"
+ AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.errorResults") IS NULL
+ AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig") IS NOT NULL
+ QUALIFY ROW_NUMBER() OVER (PARTITION BY insertId, timestamp, logName) = 1
+ {limit_text};
+ """
+
+ return textwrap.dedent(query)
+
+
+def bigquery_audit_metadata_query_template_usage(
+ dataset: str,
+ use_date_sharded_tables: bool,
+ limit: Optional[int] = None,
+) -> str:
+ """
+ Receives a dataset (with project specified) and returns a query template that is used to query exported
+ v2 AuditLogs containing protoPayloads of type BigQueryAuditMetadata.
+ :param dataset: the dataset to query against in the form of $PROJECT.$DATASET
+ :param use_date_sharded_tables: whether to read from date sharded audit log tables or time partitioned audit log
+ tables
+ :param limit: maximum number of events to query for
+ :return: a query template, when supplied start_time and end_time, can be used to query audit logs from BigQuery
+ """
+
+ limit_text = f"limit {limit}" if limit else ""
+
+ shard_condition = ""
+ if use_date_sharded_tables:
+ from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access_*`"
+ shard_condition = (
+ """ AND _TABLE_SUFFIX BETWEEN "{start_date}" AND "{end_date}" """
+ )
+ else:
+ from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access`"
+
+ # Deduplicates insertId via QUALIFY, see:
+ # https://cloud.google.com/logging/docs/reference/v2/rest/v2/LogEntry, insertId field
+ query = f"""
+ SELECT
+ timestamp,
+ logName,
+ insertId,
+ protopayload_auditlog AS protoPayload,
+ protopayload_auditlog.metadataJson AS metadata
+ FROM
+ {from_table}
+ WHERE (
+ timestamp >= "{{start_time}}"
+ AND timestamp < "{{end_time}}"
+ )
+ {shard_condition}
+ AND protopayload_auditlog.serviceName="bigquery.googleapis.com"
+ AND
+ (
+ (
+ protopayload_auditlog.methodName IN
+ (
+ "google.cloud.bigquery.v2.JobService.Query",
+ "google.cloud.bigquery.v2.JobService.InsertJob"
+ )
+ AND JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.jobState") = "DONE"
+ AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.errorResults") IS NULL
+ AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig") IS NOT NULL
+ AND (
+ JSON_EXTRACT_ARRAY(protopayload_auditlog.metadataJson,
+ "$.jobChange.job.jobStats.queryStats.referencedTables") IS NOT NULL
+ OR
+ JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig.destinationTable") IS NOT NULL
+ )
+ )
+ OR
+ JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.tableDataRead.reason") = "JOB"
+ )
+ QUALIFY ROW_NUMBER() OVER (PARTITION BY insertId, timestamp, logName) = 1
+ {limit_text};
+ """
+
+ return textwrap.dedent(query)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py
index e112db31c5c630..201567e104a510 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py
@@ -2,7 +2,6 @@
import json
import logging
import os
-import textwrap
import time
import uuid
from dataclasses import dataclass
@@ -21,9 +20,6 @@
)
import humanfriendly
-from google.cloud.bigquery import Client as BigQueryClient
-from google.cloud.logging_v2.client import Client as GCPLoggingClient
-from ratelimiter import RateLimiter
from datahub.configuration.time_window_config import (
BaseTimeWindowConfig,
@@ -35,8 +31,6 @@
from datahub.ingestion.api.source_helpers import auto_empty_dataset_usage_statistics
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.bigquery_v2.bigquery_audit import (
- BQ_AUDIT_V2,
- BQ_FILTER_RULE_TEMPLATE,
AuditEvent,
AuditLogEntry,
BigQueryAuditMetadata,
@@ -45,13 +39,15 @@
QueryEvent,
ReadEvent,
)
+from datahub.ingestion.source.bigquery_v2.bigquery_audit_log_api import (
+ BigQueryAuditLogApi,
+)
from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config
from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report
-from datahub.ingestion.source.bigquery_v2.common import (
- BQ_DATE_SHARD_FORMAT,
- BQ_DATETIME_FORMAT,
- _make_gcp_logging_client,
- get_bigquery_client,
+from datahub.ingestion.source.bigquery_v2.common import BQ_DATETIME_FORMAT
+from datahub.ingestion.source.bigquery_v2.queries import (
+ BQ_FILTER_RULE_TEMPLATE_V2_USAGE,
+ bigquery_audit_metadata_query_template_usage,
)
from datahub.ingestion.source.state.redundant_run_skip_handler import (
RedundantUsageRunSkipHandler,
@@ -108,77 +104,6 @@ class OperationalDataMeta:
custom_type: Optional[str] = None
-def bigquery_audit_metadata_query_template(
- dataset: str,
- use_date_sharded_tables: bool,
- limit: Optional[int] = None,
-) -> str:
- """
- Receives a dataset (with project specified) and returns a query template that is used to query exported
- v2 AuditLogs containing protoPayloads of type BigQueryAuditMetadata.
- :param dataset: the dataset to query against in the form of $PROJECT.$DATASET
- :param use_date_sharded_tables: whether to read from date sharded audit log tables or time partitioned audit log
- tables
- :param limit: maximum number of events to query for
- :return: a query template, when supplied start_time and end_time, can be used to query audit logs from BigQuery
- """
-
- limit_text = f"limit {limit}" if limit else ""
-
- shard_condition = ""
- if use_date_sharded_tables:
- from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access_*`"
- shard_condition = (
- """ AND _TABLE_SUFFIX BETWEEN "{start_date}" AND "{end_date}" """
- )
- else:
- from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access`"
-
- # Deduplicates insertId via QUALIFY, see:
- # https://cloud.google.com/logging/docs/reference/v2/rest/v2/LogEntry, insertId field
- query = f"""
- SELECT
- timestamp,
- logName,
- insertId,
- protopayload_auditlog AS protoPayload,
- protopayload_auditlog.metadataJson AS metadata
- FROM
- {from_table}
- WHERE (
- timestamp >= "{{start_time}}"
- AND timestamp < "{{end_time}}"
- )
- {shard_condition}
- AND protopayload_auditlog.serviceName="bigquery.googleapis.com"
- AND
- (
- (
- protopayload_auditlog.methodName IN
- (
- "google.cloud.bigquery.v2.JobService.Query",
- "google.cloud.bigquery.v2.JobService.InsertJob"
- )
- AND JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.jobState") = "DONE"
- AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.errorResults") IS NULL
- AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig") IS NOT NULL
- AND (
- JSON_EXTRACT_ARRAY(protopayload_auditlog.metadataJson,
- "$.jobChange.job.jobStats.queryStats.referencedTables") IS NOT NULL
- OR
- JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig.destinationTable") IS NOT NULL
- )
- )
- OR
- JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.tableDataRead.reason") = "JOB"
- )
- QUALIFY ROW_NUMBER() OVER (PARTITION BY insertId, timestamp, logName) = 1
- {limit_text};
- """
-
- return textwrap.dedent(query)
-
-
class BigQueryUsageState(Closeable):
read_events: FileBackedDict[ReadEvent]
query_events: FileBackedDict[QueryEvent]
@@ -375,7 +300,8 @@ class BigQueryUsageExtractor:
* Aggregation of these statistics into buckets, by day or hour granularity
:::note
- 1. Depending on the compliance policies setup for the bigquery instance, sometimes logging.read permission is not sufficient. In that case, use either admin or private log viewer permission.
+ 1. Depending on the compliance policies setup for the bigquery instance, sometimes logging.read permission is not sufficient.
+ In that case, use either admin or private log viewer permission.
:::
"""
@@ -674,109 +600,6 @@ def _store_usage_event(
return True
return False
- def _get_exported_bigquery_audit_metadata(
- self,
- bigquery_client: BigQueryClient,
- limit: Optional[int] = None,
- ) -> Iterable[BigQueryAuditMetadata]:
- if self.config.bigquery_audit_metadata_datasets is None:
- self.report.bigquery_audit_metadata_datasets_missing = True
- return
-
- corrected_start_time = self.start_time - self.config.max_query_duration
- start_time = corrected_start_time.strftime(BQ_DATETIME_FORMAT)
- start_date = corrected_start_time.strftime(BQ_DATE_SHARD_FORMAT)
- self.report.audit_start_time = start_time
-
- corrected_end_time = self.end_time + self.config.max_query_duration
- end_time = corrected_end_time.strftime(BQ_DATETIME_FORMAT)
- end_date = corrected_end_time.strftime(BQ_DATE_SHARD_FORMAT)
- self.report.audit_end_time = end_time
-
- for dataset in self.config.bigquery_audit_metadata_datasets:
- logger.info(
- f"Start loading log entries from BigQueryAuditMetadata in {dataset}"
- )
-
- query = bigquery_audit_metadata_query_template(
- dataset,
- self.config.use_date_sharded_audit_log_tables,
- limit=limit,
- ).format(
- start_time=start_time,
- end_time=end_time,
- start_date=start_date,
- end_date=end_date,
- )
-
- query_job = bigquery_client.query(query)
- logger.info(
- f"Finished loading log entries from BigQueryAuditMetadata in {dataset}"
- )
- if self.config.rate_limit:
- with RateLimiter(max_calls=self.config.requests_per_min, period=60):
- yield from query_job
- else:
- yield from query_job
-
- def _get_bigquery_log_entries_via_gcp_logging(
- self, client: GCPLoggingClient, limit: Optional[int] = None
- ) -> Iterable[AuditLogEntry]:
- filter = self._generate_filter(BQ_AUDIT_V2)
- logger.debug(filter)
-
- list_entries: Iterable[AuditLogEntry]
- rate_limiter: Optional[RateLimiter] = None
- if self.config.rate_limit:
- # client.list_entries is a generator, does api calls to GCP Logging when it runs out of entries and needs to fetch more from GCP Logging
- # to properly ratelimit we multiply the page size by the number of requests per minute
- rate_limiter = RateLimiter(
- max_calls=self.config.requests_per_min * self.config.log_page_size,
- period=60,
- )
-
- list_entries = client.list_entries(
- filter_=filter,
- page_size=self.config.log_page_size,
- max_results=limit,
- )
-
- for i, entry in enumerate(list_entries):
- if i == 0:
- logger.info(f"Starting log load from GCP Logging for {client.project}")
- if i % 1000 == 0:
- logger.info(f"Loaded {i} log entries from GCP Log for {client.project}")
- self.report.total_query_log_entries += 1
-
- if rate_limiter:
- with rate_limiter:
- yield entry
- else:
- yield entry
-
- logger.info(
- f"Finished loading {self.report.total_query_log_entries} log entries from GCP Logging for {client.project}"
- )
-
- def _generate_filter(self, audit_templates: Dict[str, str]) -> str:
- # We adjust the filter values a bit, since we need to make sure that the join
- # between query events and read events is complete. For example, this helps us
- # handle the case where the read happens within our time range but the query
- # completion event is delayed and happens after the configured end time.
-
- start_time = (self.start_time - self.config.max_query_duration).strftime(
- BQ_DATETIME_FORMAT
- )
- self.report.log_entry_start_time = start_time
- end_time = (self.end_time + self.config.max_query_duration).strftime(
- BQ_DATETIME_FORMAT
- )
- self.report.log_entry_end_time = end_time
- filter = audit_templates[BQ_FILTER_RULE_TEMPLATE].format(
- start_time=start_time, end_time=end_time
- )
- return filter
-
@staticmethod
def _get_destination_table(event: AuditEvent) -> Optional[BigQueryTableRef]:
if (
@@ -1011,27 +834,54 @@ def _parse_exported_bigquery_audit_metadata(
def _get_parsed_bigquery_log_events(
self, project_id: str, limit: Optional[int] = None
) -> Iterable[AuditEvent]:
+ audit_log_api = BigQueryAuditLogApi(
+ self.report.audit_log_api_perf,
+ self.config.rate_limit,
+ self.config.requests_per_min,
+ )
+ # We widen the window by max_query_duration on both sides, since we need to make sure that the join
+ # between query events and read events is complete. For example, this helps us
+ # handle the case where the read happens within our time range but the query
+ # completion event is delayed and happens after the configured end time.
+ corrected_start_time = self.start_time - self.config.max_query_duration
+ corrected_end_time = self.end_time + self.config.max_query_duration
+ self.report.audit_start_time = corrected_start_time
+ self.report.audit_end_time = corrected_end_time
+
parse_fn: Callable[[Any], Optional[AuditEvent]]
if self.config.use_exported_bigquery_audit_metadata:
- bq_client = get_bigquery_client(self.config)
- entries = self._get_exported_bigquery_audit_metadata(
+ bq_client = self.config.get_bigquery_client()
+
+ entries = audit_log_api.get_exported_bigquery_audit_metadata(
bigquery_client=bq_client,
+ bigquery_audit_metadata_datasets=self.config.bigquery_audit_metadata_datasets,
+ bigquery_audit_metadata_query_template=bigquery_audit_metadata_query_template_usage,
+ use_date_sharded_audit_log_tables=self.config.use_date_sharded_audit_log_tables,
+ start_time=corrected_start_time,
+ end_time=corrected_end_time,
limit=limit,
)
parse_fn = self._parse_exported_bigquery_audit_metadata
else:
- logging_client = _make_gcp_logging_client(
- project_id, self.config.extra_client_options
+ logging_client = self.config.make_gcp_logging_client(project_id)
+ logger.info(
+ f"Start loading log entries from BigQuery for {project_id} "
+ f"with start_time={corrected_start_time} and end_time={corrected_end_time}"
)
- entries = self._get_bigquery_log_entries_via_gcp_logging(
- logging_client, limit=limit
+ entries = audit_log_api.get_bigquery_log_entries_via_gcp_logging(
+ logging_client,
+ filter=self._generate_filter(corrected_start_time, corrected_end_time),
+ log_page_size=self.config.log_page_size,
+ limit=limit,
)
parse_fn = self._parse_bigquery_log_entry
for entry in entries:
try:
+ self.report.num_usage_total_log_entries[project_id] += 1
event = parse_fn(entry)
if event:
+ self.report.num_usage_parsed_log_entries[project_id] += 1
yield event
except Exception as e:
logger.warning(
@@ -1042,6 +892,12 @@ def _get_parsed_bigquery_log_events(
f"log-parse-{project_id}", e, group="usage-log-parse"
)
+ def _generate_filter(self, corrected_start_time, corrected_end_time) -> str:
+ return BQ_FILTER_RULE_TEMPLATE_V2_USAGE.format(
+ start_time=corrected_start_time.strftime(BQ_DATETIME_FORMAT),
+ end_time=corrected_end_time.strftime(BQ_DATETIME_FORMAT),
+ )
+
def get_tables_from_query(
self, default_project: str, query: str
) -> Optional[List[BigQueryTableRef]]:
diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py
index c8623798f69374..bbe52b5d98ba36 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py
@@ -365,8 +365,8 @@ def populate_lineage(
# Populate table level lineage by getting upstream tables from stl_scan redshift table
query = RedshiftQuery.stl_scan_based_lineage_query(
self.config.database,
- self.config.start_time,
- self.config.end_time,
+ self.start_time,
+ self.end_time,
)
populate_calls.append((query, LineageCollectorType.QUERY_SCAN))
elif self.config.table_lineage_mode == LineageMode.SQL_BASED:
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py
index 811ea67981e180..240e0ffa1a0b6d 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py
@@ -543,15 +543,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
self.connection.close()
- lru_cache_functions: List[Callable] = [
- self.data_dictionary.get_tables_for_database,
- self.data_dictionary.get_views_for_database,
- self.data_dictionary.get_columns_for_schema,
- self.data_dictionary.get_pk_constraints_for_schema,
- self.data_dictionary.get_fk_constraints_for_schema,
- ]
- for func in lru_cache_functions:
- self.report.lru_cache_info[func.__name__] = func.cache_info()._asdict() # type: ignore
+ self.report_cache_info()
# TODO: The checkpoint state for stale entity detection can be committed here.
@@ -596,6 +588,17 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
) and self.usage_extractor:
yield from self.usage_extractor.get_usage_workunits(discovered_datasets)
+ def report_cache_info(self) -> None:
+ lru_cache_functions: List[Callable] = [
+ self.data_dictionary.get_tables_for_database,
+ self.data_dictionary.get_views_for_database,
+ self.data_dictionary.get_columns_for_schema,
+ self.data_dictionary.get_pk_constraints_for_schema,
+ self.data_dictionary.get_fk_constraints_for_schema,
+ ]
+ for func in lru_cache_functions:
+ self.report.lru_cache_info[func.__name__] = func.cache_info()._asdict() # type: ignore
+
def report_warehouse_failure(self):
if self.config.warehouse is not None:
self.report_error(
diff --git a/metadata-ingestion/src/datahub/utilities/perf_timer.py b/metadata-ingestion/src/datahub/utilities/perf_timer.py
index 3fac1d68c3a9ea..18384420bfefbd 100644
--- a/metadata-ingestion/src/datahub/utilities/perf_timer.py
+++ b/metadata-ingestion/src/datahub/utilities/perf_timer.py
@@ -1,26 +1,49 @@
+import logging
import time
from contextlib import AbstractContextManager
from typing import Any, Optional
+logger: logging.Logger = logging.getLogger(__name__)
+
class PerfTimer(AbstractContextManager):
"""
A context manager that gives easy access to elapsed time for performance measurement.
+
"""
- start_time: Optional[float] = None
- end_time: Optional[float] = None
+ def __init__(self) -> None:
+ self.start_time: Optional[float] = None
+ self.end_time: Optional[float] = None
+ self._past_active_time: float = 0
+ self.paused: bool = False
+ self._error_state = False
def start(self) -> None:
+ if self.end_time is not None:
+ self._past_active_time = self.elapsed_seconds()
+
self.start_time = time.perf_counter()
self.end_time = None
+ self.paused = False
+
+ def pause(self) -> "PerfTimer":
+ self.assert_timer_is_running()
+ self._past_active_time = self.elapsed_seconds()
+ self.start_time = None
+ self.end_time = None
+ self.paused = True
+ return self
def finish(self) -> None:
- assert self.start_time is not None
+ self.assert_timer_is_running()
self.end_time = time.perf_counter()
def __enter__(self) -> "PerfTimer":
- self.start()
+ if self.paused: # Entering paused timer context, NO OP
+ pass
+ else:
+ self.start()
return self
def __exit__(
@@ -29,16 +52,46 @@ def __exit__(
exc: Any,
traceback: Any,
) -> Optional[bool]:
- self.finish()
+ if self.paused: # Exiting paused timer context, resume timer
+ self.start()
+ else:
+ self.finish()
return None
def elapsed_seconds(self) -> float:
"""
Returns the elapsed time in seconds.
"""
+ if self.paused or not self.start_time:
+ return self._past_active_time
- assert self.start_time is not None
if self.end_time is None:
- return time.perf_counter() - self.start_time
+ return (time.perf_counter() - self.start_time) + (self._past_active_time)
+ else:
+ return (self.end_time - self.start_time) + self._past_active_time
+
+ def assert_timer_is_running(self) -> None:
+ """
+ Flags the error state and logs a warning if the timer is not running; returns nothing and never raises.
+ Timer is NOT in running state if
+ 1. it has never been started.
+ 2. it is in paused state.
+ 3. it had been started and finished in the past but not started again.
+ """
+ if self.start_time is None or self.paused or self.end_time:
+ self._error_state = True
+ logger.warning("Did you forget to start the timer ?")
+
+ def __repr__(self) -> str:
+ return repr(self.as_obj())
+
+ def __str__(self) -> str:
+ return self.__repr__()
+
+ def as_obj(self) -> Optional[str]:
+ if self.start_time is None:
+ return None
else:
- return self.end_time - self.start_time
+ time_taken = self.elapsed_seconds()
+ state = " (error)" if self._error_state else ""
+ return f"{time_taken:.3f} seconds{state}"
diff --git a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py
index cc3ee1f6ceaa47..602401134dcd30 100644
--- a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py
+++ b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py
@@ -4,8 +4,10 @@
from freezegun import freeze_time
from google.cloud.bigquery.table import TableListItem
+from datahub.ingestion.source.bigquery_v2.bigquery import BigqueryV2Source
from datahub.ingestion.source.bigquery_v2.bigquery_schema import (
BigqueryDataset,
+ BigQuerySchemaApi,
BigqueryTable,
)
from tests.test_helpers import mce_helpers
@@ -15,15 +17,9 @@
@freeze_time(FROZEN_TIME)
-@patch(
- "datahub.ingestion.source.bigquery_v2.bigquery_schema.BigQueryDataDictionary.get_tables_for_dataset"
-)
-@patch(
- "datahub.ingestion.source.bigquery_v2.bigquery.BigqueryV2Source.get_core_table_details"
-)
-@patch(
- "datahub.ingestion.source.bigquery_v2.bigquery_schema.BigQueryDataDictionary.get_datasets_for_project_id"
-)
+@patch.object(BigQuerySchemaApi, "get_tables_for_dataset")
+@patch.object(BigqueryV2Source, "get_core_table_details")
+@patch.object(BigQuerySchemaApi, "get_datasets_for_project_id")
@patch("google.cloud.bigquery.Client")
def test_bigquery_v2_ingest(
client,
diff --git a/metadata-ingestion/tests/unit/test_bigquery_lineage.py b/metadata-ingestion/tests/unit/test_bigquery_lineage.py
index 9b09fa36ba5862..e23494963e475d 100644
--- a/metadata-ingestion/tests/unit/test_bigquery_lineage.py
+++ b/metadata-ingestion/tests/unit/test_bigquery_lineage.py
@@ -3,6 +3,7 @@
import pytest
+import datahub.emitter.mce_builder as builder
from datahub.ingestion.source.bigquery_v2.bigquery_audit import (
BigQueryTableRef,
QueryEvent,
@@ -81,7 +82,9 @@ def lineage_entries() -> List[QueryEvent]:
def test_lineage_with_timestamps(lineage_entries: List[QueryEvent]) -> None:
config = BigQueryV2Config()
report = BigQueryV2Report()
- extractor: BigqueryLineageExtractor = BigqueryLineageExtractor(config, report)
+ extractor: BigqueryLineageExtractor = BigqueryLineageExtractor(
+ config, report, lambda x: builder.make_dataset_urn("bigquery", str(x))
+ )
bq_table = BigQueryTableRef.from_string_name(
"projects/my_project/datasets/my_dataset/tables/my_table"
@@ -96,7 +99,6 @@ def test_lineage_with_timestamps(lineage_entries: List[QueryEvent]) -> None:
bq_table=bq_table,
bq_table_urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,my_project.my_dataset.my_table,PROD)",
lineage_metadata=lineage_map,
- platform="bigquery",
)
assert upstream_lineage
assert len(upstream_lineage.upstreams) == 4
@@ -105,7 +107,9 @@ def test_lineage_with_timestamps(lineage_entries: List[QueryEvent]) -> None:
def test_column_level_lineage(lineage_entries: List[QueryEvent]) -> None:
config = BigQueryV2Config(extract_column_lineage=True, incremental_lineage=False)
report = BigQueryV2Report()
- extractor: BigqueryLineageExtractor = BigqueryLineageExtractor(config, report)
+ extractor: BigqueryLineageExtractor = BigqueryLineageExtractor(
+ config, report, lambda x: builder.make_dataset_urn("bigquery", str(x))
+ )
bq_table = BigQueryTableRef.from_string_name(
"projects/my_project/datasets/my_dataset/tables/my_table"
@@ -120,7 +124,6 @@ def test_column_level_lineage(lineage_entries: List[QueryEvent]) -> None:
bq_table=bq_table,
bq_table_urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,my_project.my_dataset.my_table,PROD)",
lineage_metadata=lineage_map,
- platform="bigquery",
)
assert upstream_lineage
assert len(upstream_lineage.upstreams) == 2
diff --git a/metadata-ingestion/tests/unit/test_bigquery_source.py b/metadata-ingestion/tests/unit/test_bigquery_source.py
index 6907f926249f50..4fc6c31626ba82 100644
--- a/metadata-ingestion/tests/unit/test_bigquery_source.py
+++ b/metadata-ingestion/tests/unit/test_bigquery_source.py
@@ -18,9 +18,10 @@
BigQueryTableRef,
)
from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config
+from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report
from datahub.ingestion.source.bigquery_v2.bigquery_schema import (
- BigQueryDataDictionary,
BigqueryProject,
+ BigQuerySchemaApi,
BigqueryView,
)
from datahub.ingestion.source.bigquery_v2.lineage import (
@@ -92,15 +93,17 @@ def test_bigquery_uri_with_credential():
raise e
-@patch("google.cloud.bigquery.client.Client")
-def test_get_projects_with_project_ids(client_mock):
+@patch.object(BigQueryV2Config, "get_bigquery_client")
+def test_get_projects_with_project_ids(get_bq_client_mock):
+ client_mock = MagicMock()
+ get_bq_client_mock.return_value = client_mock
config = BigQueryV2Config.parse_obj(
{
"project_ids": ["test-1", "test-2"],
}
)
source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test1"))
- assert source._get_projects(client_mock) == [
+ assert source._get_projects() == [
BigqueryProject("test-1", "test-1"),
BigqueryProject("test-2", "test-2"),
]
@@ -110,14 +113,17 @@ def test_get_projects_with_project_ids(client_mock):
{"project_ids": ["test-1", "test-2"], "project_id": "test-3"}
)
source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test2"))
- assert source._get_projects(client_mock) == [
+ assert source._get_projects() == [
BigqueryProject("test-1", "test-1"),
BigqueryProject("test-2", "test-2"),
]
assert client_mock.list_projects.call_count == 0
-def test_get_projects_with_project_ids_overrides_project_id_pattern():
+@patch.object(BigQueryV2Config, "get_bigquery_client")
+def test_get_projects_with_project_ids_overrides_project_id_pattern(
+ get_bq_client_mock,
+):
config = BigQueryV2Config.parse_obj(
{
"project_ids": ["test-project", "test-project-2"],
@@ -125,7 +131,7 @@ def test_get_projects_with_project_ids_overrides_project_id_pattern():
}
)
source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test"))
- projects = source._get_projects(MagicMock())
+ projects = source._get_projects()
assert projects == [
BigqueryProject(id="test-project", name="test-project"),
BigqueryProject(id="test-project-2", name="test-project-2"),
@@ -143,7 +149,8 @@ def test_platform_instance_config_always_none():
assert config.platform_instance is None
-def test_get_dataplatform_instance_aspect_returns_project_id():
+@patch.object(BigQueryV2Config, "get_bigquery_client")
+def test_get_dataplatform_instance_aspect_returns_project_id(get_bq_client_mock):
project_id = "project_id"
expected_instance = (
f"urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,{project_id})"
@@ -162,7 +169,8 @@ def test_get_dataplatform_instance_aspect_returns_project_id():
assert metadata.aspect.instance == expected_instance
-def test_get_dataplatform_instance_default_no_instance():
+@patch.object(BigQueryV2Config, "get_bigquery_client")
+def test_get_dataplatform_instance_default_no_instance(get_bq_client_mock):
config = BigQueryV2Config.parse_obj({})
source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test"))
@@ -176,18 +184,22 @@ def test_get_dataplatform_instance_default_no_instance():
assert metadata.aspect.instance is None
-@patch("google.cloud.bigquery.client.Client")
-def test_get_projects_with_single_project_id(client_mock):
+@patch.object(BigQueryV2Config, "get_bigquery_client")
+def test_get_projects_with_single_project_id(get_bq_client_mock):
+ client_mock = MagicMock()
+ get_bq_client_mock.return_value = client_mock
config = BigQueryV2Config.parse_obj({"project_id": "test-3"})
source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test1"))
- assert source._get_projects(client_mock) == [
+ assert source._get_projects() == [
BigqueryProject("test-3", "test-3"),
]
assert client_mock.list_projects.call_count == 0
-@patch("google.cloud.bigquery.client.Client")
-def test_get_projects_by_list(client_mock):
+@patch.object(BigQueryV2Config, "get_bigquery_client")
+def test_get_projects_by_list(get_bq_client_mock):
+ client_mock = MagicMock()
+ get_bq_client_mock.return_value = client_mock
client_mock.list_projects.return_value = [
SimpleNamespace(
project_id="test-1",
@@ -201,15 +213,16 @@ def test_get_projects_by_list(client_mock):
config = BigQueryV2Config.parse_obj({})
source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test1"))
- assert source._get_projects(client_mock) == [
+ assert source._get_projects() == [
BigqueryProject("test-1", "one"),
BigqueryProject("test-2", "two"),
]
assert client_mock.list_projects.call_count == 1
-@patch.object(BigQueryDataDictionary, "get_projects")
-def test_get_projects_filter_by_pattern(get_projects_mock):
+@patch.object(BigQuerySchemaApi, "get_projects")
+@patch.object(BigQueryV2Config, "get_bigquery_client")
+def test_get_projects_filter_by_pattern(get_bq_client_mock, get_projects_mock):
get_projects_mock.return_value = [
BigqueryProject("test-project", "Test Project"),
BigqueryProject("test-project-2", "Test Project 2"),
@@ -219,31 +232,35 @@ def test_get_projects_filter_by_pattern(get_projects_mock):
{"project_id_pattern": {"deny": ["^test-project$"]}}
)
source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test"))
- projects = source._get_projects(MagicMock())
+ projects = source._get_projects()
assert projects == [
BigqueryProject(id="test-project-2", name="Test Project 2"),
]
-@patch.object(BigQueryDataDictionary, "get_projects")
-def test_get_projects_list_empty(get_projects_mock):
+@patch.object(BigQuerySchemaApi, "get_projects")
+@patch.object(BigQueryV2Config, "get_bigquery_client")
+def test_get_projects_list_empty(get_bq_client_mock, get_projects_mock):
get_projects_mock.return_value = []
config = BigQueryV2Config.parse_obj(
{"project_id_pattern": {"deny": ["^test-project$"]}}
)
source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test"))
- projects = source._get_projects(MagicMock())
+ projects = source._get_projects()
assert len(source.report.failures) == 1
assert projects == []
-@patch.object(BigQueryDataDictionary, "get_projects")
+@patch.object(BigQueryV2Config, "get_bigquery_client")
def test_get_projects_list_failure(
- get_projects_mock: MagicMock, caplog: pytest.LogCaptureFixture
+ get_bq_client_mock: MagicMock,
+ caplog: pytest.LogCaptureFixture,
) -> None:
error_str = "my error"
- get_projects_mock.side_effect = GoogleAPICallError(error_str)
+ bq_client_mock = MagicMock()
+ get_bq_client_mock.return_value = bq_client_mock
+ bq_client_mock.list_projects.side_effect = GoogleAPICallError(error_str)
config = BigQueryV2Config.parse_obj(
{"project_id_pattern": {"deny": ["^test-project$"]}}
@@ -251,27 +268,29 @@ def test_get_projects_list_failure(
source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test"))
caplog.records.clear()
with caplog.at_level(logging.ERROR):
- projects = source._get_projects(MagicMock())
+ projects = source._get_projects()
assert len(caplog.records) == 1
assert error_str in caplog.records[0].msg
assert len(source.report.failures) == 1
assert projects == []
-@patch.object(BigQueryDataDictionary, "get_projects")
-def test_get_projects_list_fully_filtered(get_projects_mock):
+@patch.object(BigQuerySchemaApi, "get_projects")
+@patch.object(BigQueryV2Config, "get_bigquery_client")
+def test_get_projects_list_fully_filtered(get_bq_client_mock, get_projects_mock):
get_projects_mock.return_value = [BigqueryProject("test-project", "Test Project")]
config = BigQueryV2Config.parse_obj(
{"project_id_pattern": {"deny": ["^test-project$"]}}
)
source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test"))
- projects = source._get_projects(MagicMock())
+ projects = source._get_projects()
assert len(source.report.failures) == 0
assert projects == []
-def test_simple_upstream_table_generation():
+@patch.object(BigQueryV2Config, "get_bigquery_client")
+def test_simple_upstream_table_generation(get_bq_client_mock):
a: BigQueryTableRef = BigQueryTableRef(
BigqueryTableIdentifier(
project_id="test-project", dataset="test-dataset", table="a"
@@ -302,7 +321,10 @@ def test_simple_upstream_table_generation():
assert list(upstreams)[0].table == str(b)
-def test_upstream_table_generation_with_temporary_table_without_temp_upstream():
+@patch.object(BigQueryV2Config, "get_bigquery_client")
+def test_upstream_table_generation_with_temporary_table_without_temp_upstream(
+ get_bq_client_mock,
+):
a: BigQueryTableRef = BigQueryTableRef(
BigqueryTableIdentifier(
project_id="test-project", dataset="test-dataset", table="a"
@@ -332,7 +354,8 @@ def test_upstream_table_generation_with_temporary_table_without_temp_upstream():
assert list(upstreams) == []
-def test_upstream_table_column_lineage_with_temp_table():
+@patch.object(BigQueryV2Config, "get_bigquery_client")
+def test_upstream_table_column_lineage_with_temp_table(get_bq_client_mock):
from datahub.ingestion.api.common import PipelineContext
a: BigQueryTableRef = BigQueryTableRef(
@@ -406,7 +429,10 @@ def test_upstream_table_column_lineage_with_temp_table():
assert upstream.column_confidence == 0.7
-def test_upstream_table_generation_with_temporary_table_with_multiple_temp_upstream():
+@patch.object(BigQueryV2Config, "get_bigquery_client")
+def test_upstream_table_generation_with_temporary_table_with_multiple_temp_upstream(
+ get_bq_client_mock,
+):
a: BigQueryTableRef = BigQueryTableRef(
BigqueryTableIdentifier(
project_id="test-project", dataset="test-dataset", table="a"
@@ -466,11 +492,11 @@ def test_upstream_table_generation_with_temporary_table_with_multiple_temp_upstr
assert sorted_list[1].table == str(e)
-@patch(
- "datahub.ingestion.source.bigquery_v2.bigquery_schema.BigQueryDataDictionary.get_tables_for_dataset"
-)
-@patch("google.cloud.bigquery.client.Client")
-def test_table_processing_logic(client_mock, data_dictionary_mock):
+@patch.object(BigQuerySchemaApi, "get_tables_for_dataset")
+@patch.object(BigQueryV2Config, "get_bigquery_client")
+def test_table_processing_logic(get_bq_client_mock, data_dictionary_mock):
+ client_mock = MagicMock()
+ get_bq_client_mock.return_value = client_mock
config = BigQueryV2Config.parse_obj(
{
"project_id": "test-project",
@@ -523,7 +549,7 @@ def test_table_processing_logic(client_mock, data_dictionary_mock):
_ = list(
source.get_tables_for_dataset(
- conn=client_mock, project_id="test-project", dataset_name="test-dataset"
+ project_id="test-project", dataset_name="test-dataset"
)
)
@@ -531,17 +557,19 @@ def test_table_processing_logic(client_mock, data_dictionary_mock):
# args only available from python 3.8 and that's why call_args_list is sooo ugly
tables: Dict[str, TableListItem] = data_dictionary_mock.call_args_list[0][0][
- 3
+ 2
] # alternatively
for table in tables.keys():
assert table in ["test-table", "test-sharded-table_20220102"]
-@patch(
- "datahub.ingestion.source.bigquery_v2.bigquery_schema.BigQueryDataDictionary.get_tables_for_dataset"
-)
-@patch("google.cloud.bigquery.client.Client")
-def test_table_processing_logic_date_named_tables(client_mock, data_dictionary_mock):
+@patch.object(BigQuerySchemaApi, "get_tables_for_dataset")
+@patch.object(BigQueryV2Config, "get_bigquery_client")
+def test_table_processing_logic_date_named_tables(
+ get_bq_client_mock, data_dictionary_mock
+):
+ client_mock = MagicMock()
+ get_bq_client_mock.return_value = client_mock
# test that tables with date names are processed correctly
config = BigQueryV2Config.parse_obj(
{
@@ -595,7 +623,7 @@ def test_table_processing_logic_date_named_tables(client_mock, data_dictionary_m
_ = list(
source.get_tables_for_dataset(
- conn=client_mock, project_id="test-project", dataset_name="test-dataset"
+ project_id="test-project", dataset_name="test-dataset"
)
)
@@ -603,7 +631,7 @@ def test_table_processing_logic_date_named_tables(client_mock, data_dictionary_m
# args only available from python 3.8 and that's why call_args_list is sooo ugly
tables: Dict[str, TableListItem] = data_dictionary_mock.call_args_list[0][0][
- 3
+ 2
] # alternatively
for table in tables.keys():
assert tables[table].table_id in ["test-table", "20220103"]
@@ -644,16 +672,16 @@ def bigquery_view_2() -> BigqueryView:
)
-@patch(
- "datahub.ingestion.source.bigquery_v2.bigquery_schema.BigQueryDataDictionary.get_query_result"
-)
-@patch("google.cloud.bigquery.client.Client")
+@patch.object(BigQuerySchemaApi, "get_query_result")
+@patch.object(BigQueryV2Config, "get_bigquery_client")
def test_get_views_for_dataset(
- client_mock: Mock,
+ get_bq_client_mock: Mock,
query_mock: Mock,
bigquery_view_1: BigqueryView,
bigquery_view_2: BigqueryView,
) -> None:
+ client_mock = MagicMock()
+ get_bq_client_mock.return_value = client_mock
assert bigquery_view_1.last_altered
row1 = create_row(
dict(
@@ -675,9 +703,11 @@ def test_get_views_for_dataset(
)
)
query_mock.return_value = [row1, row2]
+ bigquery_data_dictionary = BigQuerySchemaApi(
+ BigQueryV2Report().schema_api_perf, client_mock
+ )
- views = BigQueryDataDictionary.get_views_for_dataset(
- conn=client_mock,
+ views = bigquery_data_dictionary.get_views_for_dataset(
project_id="test-project",
dataset_name="test-dataset",
has_data_read=False,
@@ -686,7 +716,10 @@ def test_get_views_for_dataset(
@patch.object(BigqueryV2Source, "gen_dataset_workunits", lambda *args, **kwargs: [])
-def test_gen_view_dataset_workunits(bigquery_view_1, bigquery_view_2):
+@patch.object(BigQueryV2Config, "get_bigquery_client")
+def test_gen_view_dataset_workunits(
+ get_bq_client_mock, bigquery_view_1, bigquery_view_2
+):
project_id = "test-project"
dataset_name = "test-dataset"
config = BigQueryV2Config.parse_obj(
diff --git a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py
index 6ee1f05f0582cc..4cf42da4395f94 100644
--- a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py
+++ b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py
@@ -4,7 +4,6 @@
from freezegun import freeze_time
from datahub.ingestion.source.bigquery_v2.bigquery_audit import (
- BQ_AUDIT_V2,
BigqueryTableIdentifier,
BigQueryTableRef,
)
@@ -111,10 +110,12 @@ def test_bigqueryv2_filters():
OR
protoPayload.metadata.tableDataRead.reason = "JOB"
)""" # noqa: W293
- source = BigQueryUsageExtractor(
- config, BigQueryV2Report(), dataset_urn_builder=lambda _: ""
- )
- filter: str = source._generate_filter(BQ_AUDIT_V2)
+
+ corrected_start_time = config.start_time - config.max_query_duration
+ corrected_end_time = config.end_time + config.max_query_duration
+ filter: str = BigQueryUsageExtractor(
+ config, BigQueryV2Report(), lambda x: ""
+ )._generate_filter(corrected_start_time, corrected_end_time)
assert filter == expected_filter
diff --git a/metadata-ingestion/tests/unit/utilities/test_perf_timer.py b/metadata-ingestion/tests/unit/utilities/test_perf_timer.py
new file mode 100644
index 00000000000000..d5fde314c2b57a
--- /dev/null
+++ b/metadata-ingestion/tests/unit/utilities/test_perf_timer.py
@@ -0,0 +1,46 @@
+import time
+from functools import partial
+
+import pytest
+
+from datahub.utilities.perf_timer import PerfTimer
+
+approx = partial(pytest.approx, rel=1e-2)
+
+
+def test_perf_timer_simple():
+ with PerfTimer() as timer:
+ time.sleep(1)
+ assert approx(timer.elapsed_seconds()) == 1
+
+ assert approx(timer.elapsed_seconds()) == 1
+
+
+def test_perf_timer_paused_timer():
+ with PerfTimer() as current_timer:
+ time.sleep(1)
+ assert approx(current_timer.elapsed_seconds()) == 1
+ with current_timer.pause():
+ time.sleep(2)
+ assert approx(current_timer.elapsed_seconds()) == 1
+ assert approx(current_timer.elapsed_seconds()) == 1
+ time.sleep(1)
+
+ assert approx(current_timer.elapsed_seconds()) == 2
+
+
+def test_generator_with_paused_timer():
+ def generator_function():
+ with PerfTimer() as inner_timer:
+ time.sleep(1)
+ for i in range(10):
+ time.sleep(0.2)
+ with inner_timer.pause():
+ time.sleep(0.2)
+ yield i
+ assert approx(inner_timer.elapsed_seconds()) == 1 + 0.2 * 10
+
+ with PerfTimer() as outer_timer:
+ seq = generator_function()
+ list([i for i in seq])
+ assert approx(outer_timer.elapsed_seconds()) == 1 + 0.2 * 10 + 0.2 * 10
From f4da93988e8cbb14c74946ddc72fdbd4205a015e Mon Sep 17 00:00:00 2001
From: Tony Ouyang
Date: Fri, 15 Sep 2023 13:26:17 -0700
Subject: [PATCH 24/65] feat(ingestion/dynamodb): Add DynamoDB as new metadata
ingestion source (#8768)
Co-authored-by: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com>
---
.../app/ingest/source/builder/constants.ts | 4 +
.../app/ingest/source/builder/sources.json | 7 +
datahub-web-react/src/images/dynamodblogo.png | Bin 0 -> 60888 bytes
.../docs/sources/dynamodb/dynamodb_post.md | 29 ++
.../docs/sources/dynamodb/dynamodb_pre.md | 26 +
.../docs/sources/dynamodb/dynamodb_recipe.yml | 25 +
metadata-ingestion/setup.py | 2 +
.../ingestion/source/dynamodb/__init__.py | 0
.../ingestion/source/dynamodb/dynamodb.py | 469 ++++++++++++++++++
...default_platform_instance_mces_golden.json | 132 +++++
...ynamodb_platform_instance_mces_golden.json | 132 +++++
.../integration/dynamodb/test_dynamodb.py | 95 ++++
.../main/resources/boot/data_platforms.json | 10 +
13 files changed, 931 insertions(+)
create mode 100644 datahub-web-react/src/images/dynamodblogo.png
create mode 100644 metadata-ingestion/docs/sources/dynamodb/dynamodb_post.md
create mode 100644 metadata-ingestion/docs/sources/dynamodb/dynamodb_pre.md
create mode 100644 metadata-ingestion/docs/sources/dynamodb/dynamodb_recipe.yml
create mode 100644 metadata-ingestion/src/datahub/ingestion/source/dynamodb/__init__.py
create mode 100644 metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py
create mode 100644 metadata-ingestion/tests/integration/dynamodb/dynamodb_default_platform_instance_mces_golden.json
create mode 100644 metadata-ingestion/tests/integration/dynamodb/dynamodb_platform_instance_mces_golden.json
create mode 100644 metadata-ingestion/tests/integration/dynamodb/test_dynamodb.py
diff --git a/datahub-web-react/src/app/ingest/source/builder/constants.ts b/datahub-web-react/src/app/ingest/source/builder/constants.ts
index 8d41c3533575a6..61667a941765c3 100644
--- a/datahub-web-react/src/app/ingest/source/builder/constants.ts
+++ b/datahub-web-react/src/app/ingest/source/builder/constants.ts
@@ -27,6 +27,7 @@ import powerbiLogo from '../../../../images/powerbilogo.png';
import modeLogo from '../../../../images/modelogo.png';
import databricksLogo from '../../../../images/databrickslogo.png';
import verticaLogo from '../../../../images/verticalogo.png';
+import dynamodbLogo from '../../../../images/dynamodblogo.png';
export const ATHENA = 'athena';
export const ATHENA_URN = `urn:li:dataPlatform:${ATHENA}`;
@@ -43,6 +44,8 @@ export const DBT = 'dbt';
export const DBT_URN = `urn:li:dataPlatform:${DBT}`;
export const DRUID = 'druid';
export const DRUID_URN = `urn:li:dataPlatform:${DRUID}`;
+export const DYNAMODB = 'dynamodb';
+export const DYNAMODB_URN = `urn:li:dataPlatform:${DYNAMODB}`;
export const ELASTICSEARCH = 'elasticsearch';
export const ELASTICSEARCH_URN = `urn:li:dataPlatform:${ELASTICSEARCH}`;
export const FEAST = 'feast';
@@ -107,6 +110,7 @@ export const PLATFORM_URN_TO_LOGO = {
[CLICKHOUSE_URN]: clickhouseLogo,
[DBT_URN]: dbtLogo,
[DRUID_URN]: druidLogo,
+ [DYNAMODB_URN]: dynamodbLogo,
[ELASTICSEARCH_URN]: elasticsearchLogo,
[FEAST_URN]: feastLogo,
[GLUE_URN]: glueLogo,
diff --git a/datahub-web-react/src/app/ingest/source/builder/sources.json b/datahub-web-react/src/app/ingest/source/builder/sources.json
index 13643c58f72e1a..b4ea2db018bd84 100644
--- a/datahub-web-react/src/app/ingest/source/builder/sources.json
+++ b/datahub-web-react/src/app/ingest/source/builder/sources.json
@@ -125,6 +125,13 @@
"docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/mongodb/",
"recipe": "source:\n type: mongodb\n config:\n # Coordinates\n connect_uri: # Your MongoDB connect URI, e.g. \"mongodb://localhost\"\n\n # Credentials\n # Add secret in Secrets Tab with relevant names for each variable\n username: \"${MONGO_USERNAME}\" # Your MongoDB username, e.g. admin\n password: \"${MONGO_PASSWORD}\" # Your MongoDB password, e.g. password_01\n\n # Options (recommended)\n enableSchemaInference: True\n useRandomSampling: True\n maxSchemaSize: 300"
},
+ {
+ "urn": "urn:li:dataPlatform:dynamodb",
+ "name": "dynamodb",
+ "displayName": "DynamoDB",
+ "docsUrl": "https://datahubproject.io/docs/metadata-ingestion/",
+ "recipe": "source:\n type: dynamodb\n config:\n platform_instance: \"AWS_ACCOUNT_ID\"\n aws_access_key_id : '${AWS_ACCESS_KEY_ID}'\n aws_secret_access_key : '${AWS_SECRET_ACCESS_KEY}'\n # User could use the below option to provide a list of primary keys of a table in dynamodb format,\n # those items from given primary keys will be included when we scan the table.\n # For each table we can retrieve up to 16 MB of data, which can contain as many as 100 items.\n # We'll enforce the the primary keys list size not to exceed 100\n # The total items we'll try to retrieve in these two scenarios:\n # 1. If user don't specify include_table_item: we'll retrieve up to 100 items\n # 2. If user specifies include_table_item: we'll retrieve up to 100 items plus user specified items in\n # the table, with a total not more than 200 items\n # include_table_item:\n # table_name:\n # [\n # {\n # 'partition_key_name': { 'attribute_type': 'attribute_value' },\n # 'sort_key_name': { 'attribute_type': 'attribute_value' },\n # },\n # ]"
+ },
{
"urn": "urn:li:dataPlatform:glue",
"name": "glue",
diff --git a/datahub-web-react/src/images/dynamodblogo.png b/datahub-web-react/src/images/dynamodblogo.png
new file mode 100644
index 0000000000000000000000000000000000000000..f5beafb035772cacfc5fbe5da985ae8da1c74973
GIT binary patch
literal 60888
zcmeEvc|29y8~5(kO#^AB2&JefM+s$2rAg7K!R&_07#T91QmK$`10qAIR5B%;2q!{i
zER~sqL}iLN#NmC`S_gH1_w)Y$zMp&kh-2-wp85Mc>)C7XF6!*u%r|BJ6pEtwwr<(D
zo1&&ND2k_d(nR=+SM!e<@ME6CrUMRpY|I>-j87b)*8gQ=azt#am9hDe-A9c7ay+%S4$r~>H6#*>fVqP@I%Vvd7j}ip!MJ9KMnk+f&Vn{p9cQZz<(O}|3?E%eGk(=
zkT`;Q;X_@;YM#zRKRNtrLq+qcD?K$%nJYX8vpa@+>IeJ{J%_%i%^e4q(uXFrqbW3c
zU_8f0w7v0g#&G@SNJ_nYJ8S6sfUNcKy-79b!U(0)O6Wp|AL}e_V9pixCRQzXXf=1p
zb{Tg_b987H8M@VSsJwPk^*FJPP-gm!)bX$o^EIJtd**OOl
zEf4&lN4Ro_oJNPT$gJs|7Tg6-k1VKufr%$i`(eo)x;Qe#e8HtfkIW*pct#hT#-$}G
zydYWO4=3uC63y1;ELKYAV#2+$*c>u6b##`4M2R~zI;-;%8G55=>?&@{Lw`Q*54L8@G^9fr
z!iJ^Q)A;lmjPdHV>Ze`B9gtd$TtQ6uB+DHmin3vjFdx5yNPKvn3L&LnJ7UW6%a{~>
zMZrv>j1aLA1}!&$`M{#@%%)Ze!2+Ll%w{NPU>X6ESWiY<6VJE#!2(?pywyUtu_xuv
zWFS9?zh1#r;Vd4m{pn5MDll{)YdKe}KS#u3t|DS-StW7B(ij!%6cOwG(-E;oZ9?OU
zwdWvLtdvo)g1BNWYvqbnH7b@CSF938u2@#1Vx8xTwIz)!mi(w#*+eYb^%|~NLZf2U
z6R`x(#B+`#Pn*T?lpq!nB&?^-OwQbaLg5d#4T-ImomjQ%0$IK6$q+xcOF_J7{Y}yh_we!e)6WXAQ0`o0*(^fn+l!OME|SHJvm8YhV(lwAkqEn`ju)E_Q|6H=-C}r(I?+DMD9iCO
zXcY{mq)nzwAXKKolr3B;(+QQ+go^gefy9Sct_>Ce{xCBAQ$x
z^RSTde`3DkrHz$VgHs{CwqoF;-^2b2X{G8Q5*JWRYbO>(8aWn4NpC1G+6MFh5zS{{
z>5Z1b(vT`oWjXGLq>U`;Ljr$0xDSk!&P-0^igJ*M5=2A^0#T4pAyFELDA$Q7a%bCG
zxuVDuQSK8_&Vwji-%li>(1<8B5JibA$_^sRZ6Zqjq$*@b^XZJlwODu>sa0sa#!7?X
zM8?jGSjB>*MrhetKCKR*5!A9EgE-Q8Vy@H~$=OGUCbL6SbK82dIbGk1)i_ncASbhe
zF#qLjn{6mr*LEMr)>8mOmpQ@OC|IG5=+8Uk16iC~!!^J>HDdC5>m=+ZbQ-qPba6}X
zJmPZG_@g&rdz{5SN6(&IRSyP788W-{7|}Zq!3l35Wyd9z{hpBGmT+Mbmfnrr1--co
zDiZ@MYn4x1OBURqwL$}#mbD(f!w>5$M<9dM(!eF;Y0z&0S>aeUb#iwEPFezWG>3%sAkZVh8ie8&2-(S8q2jSnG!Tjltd?;^
zvAn2~qNwpRlE>pRc0xPThiH$+i?zoKgr1YkrqsEOD1uwOEtp(&dOKm9XC+q=HJI^;
z@y1HFBkRo}ieNclXHon+egWeKF%;8}%idAUo-JUttqf7$X(g^ENI5nENNwhZfbs*v
z`b$ho4ER$fC6p6uE0>flA$28wfg%D4z(wYIE-7O|iULw{TvA?KQlyMyP0n%@+eExs
z>@FdttcA72KRtQ;QQ|J+jk%<(F)4-@km8mM%&lBfg@hE+g*=zkH7==WJk&YOSgC|N
zOI?z?ASvdauP>kWgRAzEgGB9ps8Xe<3^7X{O>W|*T_DElBvs?3_yvO8`haOfxS*ps
z^mNeQF0TG~9}%I1F!01LF;=3X${GUk^b5kC(FQD3@*+zfvk^iU2_aI`G3TCbV_Jd?
zkXU#mi2Kp`usse0$1eyPvF}~NLp%Y-?K6|dCvqVvNRP-g7w;U6mSs5_&Eaa+hqw~A
zu#*NwJjUDTKb(wYiH;5sp&o)z`?x|$bA_6Pg`$B_@gqXzaD~#qLfr?U9JxZpbA=++
z9dkAag?1lED0!|>ld({OAXNE?Q1^*YG$Pan5Gsu;R5@3uiA1QSS&sB!Zj7?75&iMv
zLXtWCY#U|BoyC$OQV9~NVqwT;WM~H&B2^x2mK^Ea-O{wqSiIn1VEVupzeRVUkx*PgyDHO*I^pNZ(^n=RE=QJ_TgPIt8qQRuMJ)VsfU
z>fMhz7w#RJx>_r(-?aK}>CJ@i8!JETJvmOk$U$?Xhw8gX$*YYW8x=EuE@95bs!-em
zOf-U2OmLWSmzxZ5foyvoJM^u|#!AfBaGeo^93^=|2IoaZQtnH9eN#5v{ZnRcCaCNd
zswgD@U^@@CbE+CWhkJT|R&I|3_cog8VLSnTYHhn}7M(p*Yc@Rv)g^M9(i9KJ>~yMF
z3SWXIfWuPMtiSIb9K32Tb+$S%6uwf0PY&JMVG6$(&BW~Z)`&LtidmyB3PrtzttzEI?1aN00O|%CRHw6Y7|5ze{R!+>HiS)1&ZJa5l`7Er4&RGhFLt|f|
zm@Wo*fIKhP5O^ax7foQgR>tg1vEqea-KJp~!
z7Z*0<#_YI;E?@=4&Nbiu3319D4Om?G#_zE4QQ@>
zi^jQV^w!?A4__{!Mf(E4~Rl52uMXpih8L&Tj?ZA#I@E2l)t
z2q4oNy#r&gFC(Y2B{9zrY=>0kw83AfTlZ`Zg<4mknHTWN*RRT@?a{Nsx-^<1e<(CI
z2k8x4VrgTqqn*1cb~mlJ0m%3i5|!z@{hhSOQHC)YsW>5Mb;3{{ZpVUKAw>lld)o~J
zU`cdl3>o|h{@93#tp@i#A--SGDi^g?>5mRrolNzxn+{BCV#}K8R-a8ehKxn+)d9=Y
z%0igXE3o#1)+6^lfZdmR)Ecrss!AS@;*vi|c586W%GhK=bi5Pz>Lxx|W;Ip;ZNpW$
z;{s*+Sjq?q5*u;zfib2f4xWMw;8ORc>`-u1CufPX3-?E*S5!UGGGe<}6u9ESS`q;txG4o;Ph
zL!}Vw$|4>pYes*=VXN*{4H~h+GbQlM4NH#BRXCWm1M6$zrN7e*SAYXq;GMks_$5aj
z6B+Hn*roWA^lrs~OMXB&6^!Z#s)PnAE8IsQK|MR?m3!icFDr#WY;}8V;Q5Qe9`^eL
z-JgNgC~Aj!1$eu9HW|~Xh9$t(so=?*+Tm9|Tt8b}?psp(2)P$U^BRr&9;_w{`r<;9
zY7Y|jLB=@jU%`dPaMMl4{ulVt0rYADT}I?$&sYP9P`xFIc$Chel|}obOZBl!oxji<1va<%A?=}0?c4C%a=^J+++q?$0*Q)
z-Q_`e-@R%g1XHrcnyd2%{8;CHi;vE2>?N+mOt5;ouI3wpIURmXFL=wl$NG9H*do+U
zOVH@geH#%U*pc3=Uo|W9Ii9^3@xn@!_TKd=1N+ej44MxZU~RL91iI*Zo*lZi;&0?2
zN!b0DOM^KQ_X=7C;kynyC0AY}(Debk%7X{l40kDC`Ryposm8a!_9CQ#OIX&ux`S7(
zh?5z$T94$Fy{IaqBF_ARTt+4k_zp19N-vjf03kuF6R%Y&!Qg7kMPb
zvW!Vyr=R&4tfMmyPds)(=+1sWEX@7;HD4V%1i^w#2ds+wpbEykZP*2=QuYom5;K2b
zmhLazny8fXL4Gn?_<=7V@$rQyn6HyjV0aX_bk-imO8SD$qqQDzG(Uk40|9;8p;1|E
zld&6`x6-WZR*_}pJoA8`Ak?a%nDfBPst0Fl3Pm82^kat+a$TGOV)0^ri$0nOT6th4
ze7y~_*r2?@tiVKXK|*V|A4Ni(O0rM+@Ff|EjcXN(G8Y3;>mQP=u5(!S1pAvW(3piC
zCJ5l6GUsF&_NR!CzarNsEUW~mg7J1MQB>jdUy$n>4%eV7kWJ^7?H6?a0vdvHW`o|b
zgrf*zx#JMBZ+7flPZrqWkpAI|HMT<^mPTnc9VIaZb_irH$18p7N6
z_T+UnAt+F2s@S};+uliATOK1xz5p5NTbVG3ol78D0+lN}oJbs*y*Xi*3|@&d;w@;X
zfOdGnXN7|0iD#KzMcKzl3lollOxxcCuyqlOR3NdP24r(_d(UYY%oEC816)b{Lg~VrfF{zPgRx-2TKJXV0%tnpY$qF3%#^fXk`a0D3n24pYgCU!N9Kz;L24&D|*
zG@-@(&|rBH6;DIgk;Pb&DBzc#dn>U7i9_FdVY__$d=v-%*q3VU1ygL!y`9J%Jq8%1
zgrx06tvZS>>LYZG3kQngb%46KF!sd;ng(fb1d=Ip%g7I34m}6D(=pTceidn1#mr|w
z63;TJ7Ri(w5#~}rw#7A4awXzjb-`1B(|qi_^eLU}ZQx#?5ufy%H|`#MHe;Euyf}~m
zG<$I@y0pg!AcI+ekv37Rw8bQ7utzsDi<(WH==%z`ZtPwTkiL}&sO08rxs
zKM1$Oj3gExbxdL;lYn_}oAKPV6lxCoLEMy_F9W$HdzYZqIt)?|B4JxnCbJj5_~F!T
zw%)f&mw@quXCpO=`ZcT&O(^%Q^C
zMxos9t7dHtsmICG@hb$(qd$BxSOfkZfqO6NjgV{=1TIz~RnsMW{vLo#jT3*wRwT80
zBoQjBB_BKtUy`pXkNC8hiGaC8aBJ`lA)hlOx`Gyz9S&_jkBXBW82y45kHS|$&`3F7
zG6DHp#0-QlmtqM%(IQbCy1}xLA2_fEeeZ7p>9cQ@id;@Qp(_q?2&9@|097l<~9C
z4I(`idq@&^Q1!;ecmqiFiKCF$K{qI=hyVzY%)65#WsW0hU@U(<2vtdDiE}614T;&Q
zi%OjLxbgQOvhQ92DOM;qv|+)}^#Gp)(2;_OWN^r~ffCLi33qjU6an-~T&$4lo>Kf9
zs@iY5vPcY!BS`=i)s-1sF(k+KB>S*SEC-ii9Na=OLudcUK5H-PT2l|6Lf$CUFn($b
z2^~Zz76`mpjo>xn4Yu2n1-Wgo2|GweOJSp{kJWGxtKm&UpfoqF(DtVsCOZ=Pa;%0M
zfbOFJMw~cPfa{M7vat
zL*&5}JdE&b$9gV)*_we!`6xslVRhU9bv#?P)K`8Yj(c0j{jiHL{0P3h9NikAv}%m2
zMHD9JuR$9NivG}I1i-XHutv_L
zHz2bS;&~7D{30lrn?8rrDk&fXadZ+_UoQdJZ!JSKZ9g{ThVL^)W7LClBLUAmh#L*&
z5@7i+n~CLhWMDUiw~X{e3?)UdRdu?=Z9`B=v&EgJYJom3;?S5Ch;n6yEY2YGN>a(E
zAA_wFb1?>!i>NI4Q9)7+z4DicH`gp26e%)We|&rmrz4h`z{#SL>Z=S^TvMok`t33ptpz8z2T1;dgq
z3~dIEo7Al!kz$xzIloKC4#D=!$u6G7>J-bjpit)QJsBBqpTJ>>$FrODzR#ok&ti0k
z8QR3faUzj7HsV%W?#iu+eh4L+e4>~OSYBOTPR8Y~`zE%1pGO5{2tE`2v!huclQx|m
zzR;$a^Z@MN($M}*?YZ-m>Tw>QEH=1`bgZvary{aeO2?e_$rQG7
zG?We-Cpd%AJxj*N-qh8-?JzQaE_O)Y9SEZ!BY6C1SZ2#(rms3Fn^SXzU3cEtCS{P9
zdaqsmd1%PdiGEPRw%PpUD~?S{S(yRa_Szo8&Lw?S?lA0X8>B-fs-5~in=yX%QqkTG
zwL(_M{DrE<(amm#7%mlON3uAe0Hc6O)%|qF3(k2|i7Cdc6$d_cN^MPyhT4(oo6leC
zcc!;Cx@#U~5$iSN;ryKw*|kyDA=6nfEpJB$MT=~hdA)CHxFOnp+Rm3p{j8{~5b2es
z)fva;54QS5@wKaaEwHjEZk+dnZvr(*AYgyiJ*l!cEdADLayR(e-Llf#dXf&KiotCq
zcH!k*8+N`B&_1nHLfgE4AU;J%LzOn2VtJXcL{if7SnhU5bJXd%(;4=>DamDJ?el0O~uXx)o~kKfj(}I5ZtP0B-bX3#({y4qv;%MkKyn;XDWjTRl+X1O4i0>h4WmwCXxO6bQZS
zH^l(BUA;X!UBXm64p`g!htxElXESORP{)?9+)=T#s7w5BzHPu$97ym}e;RIDv|s0WeH`p>O8;h*5@TU$IQeBAwXK8K0)LIom+wZ
zaDS_i3E@3xYSET?iEOq8i6Z(+92d!-oM%fYW-##lgqcwwRIyr&(RllvoV#3;%%zI*VfONLaK44}z&-Pjv1
z2KsYkczNhYFKpl6KDRjlIb%oS3kc}Gzq1)C*4Bn=_;R}%j(RECYGn&1cI547OI!G9
zJh-ExxU8-J;TqG<48~IvF{We%t0;)m2;LZniZMmpRq=q0H%3EKR|F=aSNXGSJrzEs
zD;$AL!u*j_?*1h||41jq<@iD2(@L{!j{}~CAvFCPCD1;!Id3+y37KH(j^=l~_U4G5
z>3yCF;Y7`mhj7eomv!)eG4QNmwfbUPa}AFq6i{eBE-G;URbJ~%LMCX@_;V9aM^-%h
z5pQMVYKrQajc`oG`V@~@ATxBb%fVYhiy*z?E#pfD2O6_CC(LpIs@YoWDoeuhTuT(D
zQ}pxC;-Ai3IdtM_g?obYL^B_ldCAVzx7rY4XhAH3Kzc>M-HKz#`~YV}UQUWJ@tqPq
z8^ez4%_*7v$pnZg#dCe^uMFQNU1a~n{E5Iqknz^w6#`|MbDJL>r6mYx3sv7v=rUIO
zl>YX9FCzie^7*sCn^&Br&T6`D6NB?6Iu$WTaWE0r*sH(Nto=i5_wVq!m;sXC-UV4G
zG9sP*$ZaA;_rG|1ASb&VNglkORWz?(6avTz1&|qT4Vi)S3!zqu#1O=fccl`Neszlw
zcvg55;0MBvdCz@nZr6hE-yV%V&;_t&rh_U0C|p=t2Fic7>umo8fvMc(6Wi_5TzP8+
z0{8PyIHn222R?wqq#y`Q#0SJyL!IG>dgA82i1?{jJjRSnp|0AHRZt1j6IYEQ@aeC!
z81}gabDMGUF~gmFGmL#baS|_XVzHle8GFti3mcuun{T^5FD{f3oV51}Gj!R7QNzjA
z-I@WYWl8#0M#(u>(s?V{b!Xj>&8g+)&HbfaZo0}c?k@e_EY4tq!1?jCw1&DA1t0tD
zrBLAZTSAkYJ`c9aCj@iaqVK=Bndw}~nzl#5SJqXTqp}NNDd{8t4+0$wK68tiKgwA}
zvummh4AYLlX1v6*8Pzhu9Wrs88Z{m}r9EBd&7ZSnhTJN|Skg^itB#4bbIb)>Rry`2
z%RNe@hTJXWVK=ZplkFWpFd$H0H{4;9=nBl1i@we!#FO5oZYagB~ZQK(_vKD$go;>BJ=usKb(|3%uBj*fu!
zuHbHz#=K{FIs6o1#>!|C@MhW%|M80fzBvUawB+qAG=(_a?{=+gIA3z*y-F}
zp1XI$_RE{3_#Ypd;twc~Z(L{_waDbt5Q0rf6gGtjYv>cVy|aYP?xjS_`C1$V+fmo}
zvpQ)OqP>p!kTjUDbF9Go!1eh$n9Z{odluSxxmJG~dWQ_S_A}N#+@nIZi_Y=>c(uxR
zwXSjga9dH!TIzr5n!=UsQMzN*=n!46;8k;!N5Yy|x++uWh}f#rik{iU75B36M;
zs={acyFGV7To1i3>ZqIl$G*g$WuC+QaYK#a!)2L*fKczlQF<c6!}vI;6^C9!j*fuHhgB$i
zgC$L3{SsvMg4AD&B`PA!q%{NlW|ed10pTDx`?S3Eh#kta?#H3>*FmR3s0Ytbdjneh
zdwaoNMZ9l*LH81xefQKOts)!Pg3rUHRI{aLF!tEVo*1qg$(22*+@{apK5BHbi0>=@
z+H=Mo`{q5(Gl#O;5o+9}&i)~_x4%qif3IH6^4Ng3(LHkJunE%9hP@$QgCqUn7{XIQ
zyOfH<1DaoZ4WMF)(U$jGfak4-#pb?}5FXUH>q?EB+IyJ{`=
zxK@_~i=AtKf%)^$McA`?v|N3U;qDkp*oI@IK`lWJYxrRHH;_;`?9cx9UWrN_mal)7vWk<{0i+1$qY#O%9CJlk##(P&z4Q
z8+9IFHW%SE3bqfZ6F`T19;C!cLWzBk@vks%^!3+D`vKfWUDI<{F%v+;^(a!FmAxiF
zU6QfA67%S2g*gx1Q{KD7#zsNebueBTjNh~$OrIwN_eADvdYTVcqByNYj;AAI*Wso2utEl{VnT-#xGYTiKg;F)pbZr#dEM0UcxKYH%P-%25M@m6KeLt!14
znqglzJpFmN;b4|6Fow1UzzgL^8$<6B=QvOHb7gIjYJRi{d8Zqgjbg*(JAfdiV~(5|
zJicqaV0*{;=WGey0oNvEgo8i+tzL#p$;oncE5KWD)D>-ZrhtN!+Q+J@6&u*j*zhV)
z+MGfe9tIwe^X587e?wN5^R%A>^*L<+H{JT*`#FW}cBN<)^*Ok=N`JHLR}^ImVreD$
z+r6i5m#Tkvnl}oaa5Ytm
zce>R5MWVf%4_CU{yzxVKlC?JfV!R_Q9a5NZ|m@m%gj!)8lD-4V^LI{?rrL3ErSm^uaWvYj>n(B~ov2gOf)lk{Drtv=Fz$h7WZFF9EbdOHa-&D8^Jym|o
z&DpD*1=7F`xHr|+vlsl?mR)N-sh_D1^B?#DyIEB{?Cfg|FPX5Xf=V9JO+5^$Z@^9*
zadaf6?)##KaKz+Ou4(lZjowYD5s?D=R;fi-lbhEO<9;vrs1gev4yUtWU~|*O%NlIw
z@DM|q7@|wgoV@>slNp({{VO0um|PfB0*
z8z4|xuZy~)MkPSpBPoakEK96@*5glJG-cjO@AMkXzRn6@|7;3>dtk9$h--Db$9LOo
z6i765k{BW)0vaAxN1{{Vy%>R)Xo
zNFHzVUH>~E(9c&PH>;adJlYf@bvaOo!aYfPv7E2$fJgQF4ATEW0n6pyTQcY3Y1itn
zB3|sAdvmgp{%2wRGwpuQN7Ufne@6r+_kgB&m+`Q7gi1%pJU|@E*7E;?2z}&KPv6Bs
zmcC%G!3|O!L&!pFtenUaZu$vO6ZOVGP2^F#DVv2VyPf%eqNt;dV4@uiGKdV?O@+
z3)|SF&ft3L)zT#XFJ;0$oJ5R%jv(4dl>kYv8CeO{J=LE@
z6w>neI_%Pra9Z#@6l*xqNkPpj>~N%aU?wf$Mbx-wJu`S_H=Pt;GIPOSKZkl2(6;c*
z+q>?Kn9$^=rMq8Q>zTxl_h_|c*jUy+tvS~H!ke#b$%=j#Wxd4fhJoWK#*8RYAK|A_
zQf*&M5DiM9@h|7=8`yFmDtBM5-qRX(hyA7k4d_YYyixYq!A&?SG7_dB;GsU}GX!%o
zhwC!CTEo(#Ub)YG93$tL+DDAH5j=O7mD*}Qq>*nMz2N(qE;v|ccqg^`(r}OAu+>8NtKP*YbKnR5gdcaS
z_RrQW<{U!n>S~}HF!T+x6
zlzbRsCE}<+Q|MBw*H!!184d~OM3^H|gW}j8Xw3HByFb}OwTW(T5zj;9SpYP^mY`DwDj`zi{qFU_vuv_VkWH%oo)ByXPKf~k1}*P;fU9$
z$c!10*&Me*75$SazHxUHFIOGppX2SEnKnywh$Dm6Bac#{vFhjbRe^8oTm!sNQfSAW
zdGvRRCU9xJ%rr+Q69BQux1hA4R1F5Xj!yLc83AH*FqYv6zzV!W@j
z!}B0mk@?5iw3OV=wtV5#C>B_rp0g(jg$@-Ydhcsj#e36dJ?Td%=A7vL`MWrM3ny|6
zZt?m{PhibmHZfhXyyoH!zqSL1I4?9{#$4`$+}1h|KVCQyGjCT+xVwB)D5C9Z8}G
zrcrtcuIzw@;lYl9{y(;-%uw$RO8WjauQJ)Ss=l9d
z9rp$vo(pTxnu!W*H@!Q_6izG)HZSfGhFZ(#_U^-%20~+$zqbsgRh(p3++*9xK3Pit
zQ&zP-Z=fdB?X|lL=kz&{s({pWl-^ZsnZ&5=7~yPPpMF$wS;rs#iG8j9?WWeSEVe`2
z0(!r^bN3=m?}3EWez{@o*Jv_&q6j{2e!G`!ZcU6(;iGI*-U~^+wRVV1uPE%eW^&z>
zJv)`{sl;D5E=Ub9A~SPv?dJBR&%v1XJnr=K4iz2Bf7yhol-a{JzVqQWt7Yv@$z3la
zyE|0VJHu4-a$en`J~O}EMJ7IA;5ogp$aE@rK#(|b-9CQR^n$RJNl`wqB{6)IRQr1M
z>Q=T1&99~unR7C0;)2X7pYIU@g??=hR;EZpAH!%GfsKg07k4L#G&zL2&j-EfozpzN
zLfbVlq)sxYWszg}xeiJ%?z^4ZPuI+iku`+^sP1Ioo9_^9w|3ui6nUb1U$-(k9}HT3
zfy;J5!5MbFoU^EaxX&L96Nu4?HU=AITXNpyIu1FBgSr&B&-KOx{?S`;q<2tn?sAIi
zj^?txGBz!0?L3i#GnsSdEojesVXd}b-s93C`$FV|*7zv{ie7b(@O`Lz5!|CX@R#4m
zLt(vU3Yk$%l}#OMorl|Y@D_!QV<-cd1FFpR>+tk-ZHq-p*IP}LK6M4h((Pea-c(Ro
z%E9!6tIZzf-x`p!%zhr#=dR}~J%uI2SC`?t%TsNCN|@#|$6~}lEJ1eqxvPKl9()z1
zmcIrLQno+;cuG=Rd7WpmKy6f6&7Qun*o^U=O}TlTf5cEMd90KEEMQUZiI}y1mEXSn
zm39w++6(kvtTmOou=LkVr`Fk$aTC0(lB!5%CmjttsW^3tfk#
z)6Vcvw)%&@g`yBqF?0^kZU2(xn)-7ZQ`B8r_2q9dqBoAHtW084Yty0CcMhoySl+$X
z1~#Wj&W}k_4)xaVa-9N&fa=t`JrMBEAVWbW^#4#8)nE1sl*Yg((Li~c53h>%YRfzw
zPsJ;eFc&~UBF>(jUc54WeDx7-i0qe7Ha*(IgE?PJIA6Kxa(Q&m(htGSyId>&R00>c
z3o%agK5Xq+qmt9U%tKqZZe07-(ycSR8Fa36|QMCt&&YqR5vHF^gRI^N`if_&@o
z^lOqLp_$E7J4(N`FLOh26PpG31!`qe7tVUQO8qi)S6T@_y+heuXnvv
zJ5IH%#a7M3v%&}9YS-~tbk@vPk|f8W$9rGT7>E)8Popw4(-UU4X|fx@(@uPXEl1z23Lhi!WD+tKUC*(fz_io{*2P+Ppp#&udG%4zMjYj
z%%C48nO^$oSaC%$t;!B{eEOp7cC}9X{AK~D@9IjEoLIf6g8e;oY^AUn=sCL{eY)U2
z_h}R`C3dIwz(vPMp|-+q=1H(bYnTRm$fmho$<`GmYm?5Es9HGD7KU>DRN}q`F}v@{
z^HqeP;t(~NI*Wf1VBklV>rY|C|KYoWKN^gjB!5Xha|3XlA0!lRV5S{dS(<~|9xFFP
z^q8BKcjOq&=awqcj9+pNK)y3v#=JxPSRwMa5{t5RM>(^t(Apz>HXt@i_dZ|#nQoXG
zWOCUeect?qu18MYOHZ|22CLemtcNPw)rl;3MZQ`T_(tnt|A_Xi!J&CL+c58te5bo;
z1g|6$KzDO@a2eOvbJDh7rTk=ZkiD0J%pJiWCV&{CVz7LW6UJFq)s5x59(|0|J8rNM
zb-Alm|Fyt6-A^{0y$G$|2Omczc>!tOcjmlHird}fk7I}73dvp>vYbF&AyjuPl^mSp
zXgmPbu$|5s%ha?K;`wUZ#tlgSRUCdq%^dKG>G=^{=Hlk&j{dBHAF0S=J~**H3zMRD
zEW_(#$oEKB8pCF9RC5_X7?K51Tee`|su~=1zl!X+n>|g3Dy)f=J`j@I!0Ux5K
zQnrp3Sm1xrs50pbZ*d$$ELPS3Mu`GxE=-l)-1_0v^<#zI57PA#0puy`bg1aLB_b}2
z)1vxR^xb(f^dLLFXx|BiN#47!{78Lzb|w}66zfUl!?OD!f$uZu0KH)KXJqx8T~3I_zA>>|(Gb1IgmDZ90At~=5zqShZyQ2I
z1=tTlcLMOD`!)bPJsO{1qJPsFgDdLucAPd8Dnn`pRhMb5b=1D?I-JtoxuE?Cbf5-@
zo(M#TXh(k3OGJ*a4NK3Oa#w19^Fr6|)@Nba2yl!N_L4FMQnn}i%Q^OswJSB}=7gSD
zw+xy`(ny1e-Q@^xx8vkP-`)xN)`66`V8(E6o#OKXyb#BI``Jb(S6PuCcb+9I_|O!9
zurd%oTL8Pq-_eS4+Jg%fyqk5Hkyf1L#E5~7O-l#os?4*p%qm=E>?VwhqT=>S`l|GI
z2Kvd!gIH>h-Ul>zm`fEGNIQ$}^uoJvYd-+X&2gT7XlDhw^V|-Y+&35Ny${tXD-)ZW
zpQyco_@fNt{<$lhYJ-~~oC6ho%Z?SULB(MPKEzBq8Qu$6|{_Jzy@cdXU9Rg
zjA7aKq%`EdZa0QyC|7pk>@pV#+rG#&{^9wD=!>?W_T7~8*-#{TLqU*)UHizb~>m%H$#!y#pmfAQOr
z&Xer!z3!^s?ea~hzJK15c?x4xiEcyO7Llbxwj0pyfN3{>{(}7CPd9bL(?3?C7Perf
z%?EkDOR?YkUc~4eb`7$zm-$N^b@W?tR;Gu|9pAhERhZ$_o3~*;^*Z{N^0jc=jPEBi
z>eHXqsGRSx`=)gFadPtP02@Po)71i5tiGeDwQfLk6*g16S7X1mX4?k4BWZXAeVeDN
zZiEL2GSsYd+haNx>`qM7pBioGDRN%IOryU*sNY5xDiddNXHZ{i=v=va!n2#{&+efZ
zj}YbcyWC{nRS;clkZ;O1>lBH*a!E4ZDZ4lK*wSLXN%!h6orcSb@r}ZW<1gvybDlJp
zo$=E~ysx-%&Ae+?kwQk1!gB>vHltp}+TFYHNm9a0k(!J!n-ij{YmbY8j>2$gFP745
zJ@;78^VEVzhMODTIBo!;DCez_Hy#e&>TinciZf-2n6mfxi9|Xn9y`vqK417@zrM6@MaUDm%mGxIiLo}Rs0Zr$9SeqpY?J&snZ{i{uX)K>nOw)r+_Z^-Uj
zsq%^9H+zuk2gxfAO#j~8dha&v$_=4it}l`$&cIO*5
z8Rjun>N*Y7{6$wZy;Tyv@8|s?J}G(jo7DmuM;Rud4Z=#%C1ckwFAsVF4^)fBhDg);
z_btzCzZw=b8QPS|XV(o6h#K4XcpUKTJW^hHKz8vo=^@XjyE)r4Ub|bE`%eFUOLWIqh=zG)V(jCEr3l-gA~OjTGZU+Ya?L*JPgi5T{n$0h@v$Y(+e!a%PH^
zc361+2KH$aGcn-w&cO!mLEF3`I@Thg5G?0DBf)_!M)fm0&rU;u|ylju6(V8PVD
z;^#9Yii?{X+>3o3C-`bG>u#nVwU{0T$
zl|UCRUUKks#X1u>Ady{aPi
zT#~CoHfXS!4WbCB&YI7anYwc4o$ZpnW-?8iTZ%j^V{7?;afQG!k+;Iz4}7ch4Be&R
z-l9z7Q7e_ld)20XtMfPt$HYoDZ5vax*Ng9^{Vq@36qG$|rTByDH#H1Pg-Efp%Vj*Q
z(6xUeKqyhkco1Iqv3O(mH-!+kWH9^fEm_P_I#R8g-D>vF>|^B>vGba4T-C`R=@JaOvdbSffS
zQ*&arjECyc>C}M3w+{BTpB)$N26Epmxb12aEwAnSVK}k(oC=V@_sf>GR}2Ton7f;5mT^^k7a4yuVD{b@$Zj!(kskW;gPrWL;}Y
z^Vm{zFX@Q>bJ%HpLIN_wdM~g%7kDrhh=dKZ;B>kL$^h%&oa-G|2_8j9U4f^PJu)
z({C7lxsdaD&BSm1XPm(U6miduwMDB~reKYYDX(ZF+jD9Id&s#H4pcuyj~RwSp<$s7
z#rg^j_V|O3!?ib3j2KNKc}7LBvkjMi3g61siXG-mcWgF
zdauJ_R&Cc`{npMXzz-c}Z*X(z@a+DcgiqxwZd)SG^+4)WP033wkF%f%>gd=#4R>+P
zcCW^`vwruKj5)?@S}n_Xp?nluIO@3(alEq=X(ztw35Kl;@ou+^L}~WuQo#{tE%85e
zd*D<5Gq*%EkcZxtMLs{0G~{RInyvhNdw`oV6^qQ}M%l4A+uj(LF=u&VpvL|%^cn}c
zsm?0M4fsRvc*#(aGMti>bg#3owlF>^!DJ~6Y<`JOdy*^kUzTuQ9=GuZMCx}$e_dBR
zHrOl}9Q3*ve89L3W+&>eh8l|g%bly|X9b_}U(TEf+D_J(Xl!9Ysvx%YrH9P1|vfXFN3O<=P8n00v>dhY!xp@q}9m)aqG+QzpBp<6io`UW_^y<(?^<5;i
z)XezP`)3FiUC^CUruHZXQuTkg7$G>2TmtVwCgM=k)VT2agQvT|py|MFG%kl%M_~odl3|;_4
zkQzbh70`gC$F*RBRCpc_s+y5WSC(UhCE_JF2AicT;MlM*TIy
z{Q9WFL@4Eq_-p)7Tg)Jmr4XGn{g$PN{8xMDO)O2nZo?CKs1UT||4rD72gbPf_#R_FT)We0yDWZT`kW45jRFr`Kk
zr*|}fpF78FwTy?H1l38w1~%jeRF~cDZ#9A1lzcei=l>C*V#~(Rq(x2Ey7*0?hLzPa
zOlGHjL(t&%T#S(2d6iBYKO-UrCA{7dI7a?xk+^Q)E@T0`I7KQJ>5fo
z(Sd4mAWg(Iq-6EH9e~~+Wke?!yLzhrj{KWhwQQJCC8R4OgC0UDnT)(Y!Zg?z&Jla$
z$o=el0cYV$(Cu^wc#T4Y${$X_b2peq4&2!VO74V *EW!nr}QXr_{)h9}Zd5
z(UHf9ZZJyQ42C^r_`bNqMlbz2o??JZ#k8MXwYKvgRl$UI;WIfUw`(P
z-pgc0hht4vdi_Dlm#U*b_HX6*6vma>+^J2+i=KX0z#o5p!jIuYb6M-_p&
z7t}JAwo%yu-&45O7OcZ*XV^@Szn<*MfM9R~lGMDvAxzo@-L6zQ;JGeno!2Gj1sIi;*uBJgkl
zWd*>i-VFHwe6u-7`aEP2q|1^jd_50b-w@~ECsY*|G;#CL?}ZgSVUlMli(da0fluv4
zc(2Lst@}`{!O;qul-Kn?7heQ*YG8*~w^@d2j34(QIs}tF_o;bKBCismPUYYD+5a^w
zP1IdSEfFau9X|xpB8yyIL;P6y8osN}o$%~q#0t=}dNZI(iHK13){+w(z36bY8pbJ?
z3Z$bn3^yt!r#y^fJsNU*`A5__G{e*9sNUdAh15)Qct>nW4SvFCr9gIXNuSri+ETIC
z|KhMzy=F>3Ix
zO#lA8`a6M;s
zBtLpW=AACex-Ii%EyHQhTZ4~9{$r>*)PG{B+82_y9Khd+h43ZlpDpAy%-rwv#bYVy
zzd@pu|1|WUC;aCLL<9eM!hfFdpC^p8Es-Yv%MM5h8n5xb+sM-~S^n#=^Uu<<@Q)hh
zv78@0p>4UDiB;@JDz!a5-os9%5#v~s28&n5-FWhXcIakhPxp=R(2Aa(6P)gxpP6ja
z4dSN877hgwP$*HXC8RmgvTXB=@bOf-Rdn*a{UWthJ?ZHNbuu$lkosFmgO>Ssg_L03
z>TLpWoyxY2(9okO^qyw;aiz-1ZOCDDYwMF4Djtq;OBcXyQTz_tvquP#^;7A6!W4b>
z#TGj#qZP=No=rVT&)w}*{rcp=(`uJ!79OpUd!XqHWdYUfD`nT5uV$Xme+l+ew*OZ
zfGqWeQ}00&A6l9FDZ}|9ahc)sP;ZXhl7e5LQ)YSGw5fvg`71|7O+cJ2n6_EgLuT6L
z&+HsQrojAhZqV;T{TkJ=+aH$;iX2zG{}^EKHPMgCNEnx)s5klC=ZSsqEX3*l9ZB*p
zfsYa)tQtf>{VKG1yJw{@y(Gh@o28!c_xit@uJy0U?TDdNt^GtTCU(I^cgbt@(e)1L
zF@879*)o^v6Hl0qAH))D^Dy)s=eBU*#Q*(e>pyMUHI;d57=BbV562HqYj+fyZvEGN;Yn<{~JX
z!)`E#Yn)X_muHTHdp6CIr*`p%-v|+6O+C2OPwtOC!OWCD`aU8~IOIQaK`Yyy9pwZ%
z9M{k1br0m8?|VC#alms$?)ULIyE&Xor@_}$@ctCdpsqxOwpC6|d65EZ=1%Nuuw63y
z(m}zc*SiJ=lU{USHQpufut!&yG>xdy|8QMnFPrgahTOYJEYrc-DXhkieGC;4m8CdV
z_y2tLG51a40Ne*G=2-3zdlTo!_dT-TU3OyUxmKPIFp==sa|2Y`skicd0h&}@Ib|pu
zdsf4KyxY;IGnjeoLqZMUonhn{Ct`zn8rpwIC9@5oUwGO?{NU?1lHK3bZ(sNR?#Isi
z)?t1gnlbKUT{?*A$^$Nr!*>K#r4;0u{Ij3v%s)Dz6M5GY(dXz^6nX76TAbSY(}T)<
zcFME3ILEKUuG@9C!V7_5A=dWDLiLOTZxql=MZXB4W?73q-b&?%@C?c)7TQlYcj_1U
zo_Rb=8^1eC?BIvb{TVV}o8OMh>^RXqAp8p5p!gyF2lNQBGhEL3F_
z$3``y7r#zr^FyQ2t{M#99eY{>zNwRUl;cV8&tlx>8Tt1A!eHlcYjyF7J?6p8`r_nP
zqtYO?FAwGBAEV_TbeOYRp~P&-+{B=aGpD6=-bGjZY0Z48OUpR(P`WDG&hQ1z_=U!@
zZF~I|#835KesP7+X_e+J_chK~UTGE2yc!nuHGi#e#rabQLp!4dMl?dv$y+8zzMsVh
zhDN@LO&AR{^rwYmz6;+b{jS=vMm4%rW2{pS-=_b*_x+FLe~SHo+I?uo06;Y1H#fd|
z9tk|<7$&N6NGVlbet)1S`+IlnUx8IogQhElLp=tomSx~jqc_AHX|{Ru~NJbKJ5*q!T5zb(L<+s{jC`8wP2c`}xp&!(Cs6e;`^G}i
z)=c95{XJi@?)AD$8igOP(>_uxArZCd8(ej9v>KppwV1v=W&N{ZjTgW;`F|4dy0rVX
zo4&Z*H3)9A+*drlckol!0~LSNlbtx0F=-kb(Q34@JE9YGQ)%H6_t)Lb`;k*zRd(_9
z?%>pt$>mXI9+S&q?)jgbywO`p6Sbd0^;OcOPpVu*wMbLsZnN>JmHZzhqwZI+?>#%-
zc3k9X;N?Hi0W?+e@&ioq*w5L9-89=I$-i-|pT+I33gVW3%a72Aaw7Op3f)Qlg|%^tfS%d7V#
z%Plx)`NfYQMPJXr9cTiQk9ocR)3A_?K9_-YvIn>A5m+~F;C+G2ICN{#!5vBo_&H^b
z>VWbMdG|vrI3)8Ur@!gAex~_Qx7v>%gQ1m)ffGTa
z>UJFrbbNvN?2W>6?c$TeEkVs3#bp7-*?s(BqR-ni>C5~)N3O$TL4<_1F(lB7Xt(0E9R
zQYmRT4M&6KS>tK=t-bfD_`di1c|Y%8zklxer_R~W+H37Kul?+2Jwx2Cvru|YXFe+G
z?Y@hzpv920CsQqjW)Dw)<#aBls>gArzFdW?&PdNdACUWK6$4RCxsYSl*PmYf*m$q2
zn^8aDdQ{SO*#F~CmcI+0_qrmvawLvViF1ycrj-G!%XmsPC2@xmXVf=BdS@
zqJ~mlMbtb0;wIIfG67f1sla2w1*Q+u)-Y1zhhc9W`~b0bW$ClxN-TS>see#Y9{TyI
z$wnBj#b(A~S}<;IaQT(rlc#@!_`j(N^=}UNZ>CEr|LY3=bp`*rf`5x%$U6SP70^5S
zMdk2Ipit5Q2_ni|OtxFRlytpoXnCamAbF25r7V^DIoJ6xWKsPq(DANIns4@`+%a5q
zq~710XC&kFIY}?cDmX8+=(Fo96Anr%sCWyA3cHGfzoF>hluVAkI=b^
zQV*tnM_pVgXZTsU6Y{9x*$HzE6l~YUMWSw!+_*)|^;=K!hj-gqKAd{+e_pJ;R4!Z7
zyG>CVw$J8S-bD%Zlf#E?&q~OR-m9q8^pjYNSV2jD>WUgZXlIGu`e3pK9~@Z<6uaCv
zOwG-OLm=fw@+YmQHF&^DAT4<~7&Hi{4$r?d#jCV^g7VFS=osSlD9@rgtLW_TucbzX
z(xkZ9N{kNM*KMH@4PBk;Cg
z9Td5Oh}YKx)bOQouM(5J{p3nphrgCynuM>bUp$K;c(mRdtuV9L4K2G}O!q}!boP@L
zgH4c*l4Z!Y4w`0GSmZTm<ueKdm=
z8iShKz^Af!CL`vwI`pQdeja*(%1a1B^s{foVpVk&nv87|C!Lx(KoaCOGJ;jWaS0A7jhaezWIu
ze1hwJzsQ{zdr@Cj%J~)a;l75>V!6UuERWb0d8g6q
z4YGftU4;zTNcagS_Z%0Q)r%bjv#`c5`c@8t9(bc!xsT1SH~tS5UfqMFbO=dlKi(D5
zdJ3j=&hE|L2yOnoW^h_C6wys_2XAe1Ypp^WsdW!&BsKNm7$22Fa_w--C+gB>Ig7kp
z2~_@Ty`GM)%H8~C&v1K-4P#j*^M6;zc~WAf15F0oCUAG2S!vccufLcN4ok*8_p(GUv`UDW$A0
z(m;0*X*Y=$J8$vXlFITIv61k9I(e!O{Ve_GO#RQVllH5#JZ;fzm&mitXvpNs!qk(&
ze*BuqA%_Y-Di<$y-!0T>KwIHnlnemfgQ_XSHKwlevAfBhnUW8)T;Bv-wckD}OuL
z6ODmIYq()<{K5`0bm2s+MMwp|X5qsWLG<9yJoE9bAX;EoRLYxuCyYThnWRe!f)Vn)
zs3X&A`K`d^a0OcCtBL1fII25NinS&?h$73+oLCI=QQdERzAiStxlZ3u=D1HY=|B}1
z+D@C$0H#5U?6<l$xNK{ZeTJ
zqZNo~9_m<=GSV1~K%CshK?13>C8-lK=-)EI0GTuWabCS~0UJBQTpqD~?3hT1i8qaZ&7)MN1MupAN&{cm!|7Qy@!nZVV?HN5+1strc6B4wV(z`;G~k~
zXWgD&*gc{$CdSTuDIYXKJ&C1FCYIRuJDYzU(l}uzH=I1;hfILU-k*UPX|KF5w|)-S
zXF=qn4@^vdcxBbs)b8}cpMxPjNMbw_ES@L!8;dz6Ea19(dVKin_+k0A#zl-inEm|(
z2e!gHl0jJ=(7I{ok&i50uB01%n$-ZJfLrTYEYBWfJk!FT9sJp*3H!#I>j)38@ad@`4NM9!~uu|H*(9ANL;==Q=sL
zyOsuN$9X7)7YdFWcwKXW@fvbU@_)tgKXEzJRWj^)!#{kXp=t*#I~3D2H-$e^YJp#N
zqt6xRFK*;zBfn%6Uncd9d9mDmi{246
z`TSrOyyz*Sfam8OQ$vUAd)nPv(^V(KjYucFTebPIL8xMho}o*Ada+0G)lT$gj0SXY
z;2J1qgXU|x!kA*TNtvy=%7ip52a#&B{hnA+dP}sopHj%KC`lu;_5YFMU7eF42>vFe
zwZ1jM?^UtHg-AiaOdyjLqcl<%IXY~fu|R3)v`dorLS*WbC+z0ZJD0;LDX)f+D^mPuraPi&IFzpR
z>}sfaI&-u%$-Z)3f%qTkmwo1}+{?=?5Xa54Zb8ZUNjn2aN5$6N50v$tKEK(|j%)MB
zXUF)|V}}%@OBPuei#KOxcT>|cLOZtXZ1&qsQE?NxS8^;Sd(=~EwA=omT$RL>)|D2J
zeZa$5hU4y$zEw2s(T5@_r8;4!NXyanm#lIh8!pg?P+?KyT6h3d8GoYG3W9s{ltz7`
z`~;enTpvoIR@aP)qX8Cfym6W%J$s#o`|`ZsdcNMM%55B=y2{}w^vH{U!zn{NzEN;d(0xP-8oGuQ#B@!P2L9A~v?>
zp)0(vL!uO*KA`Ar6Xx7^R59hs#}8>-j5l7tF{NeDb#$C5*2NLoVZKAnA^n5x0TLGv
zobBpzUXbcw2H9&AyUPv`O!o^7TfHJ}$X>&B`0^2JvG859(~He^bNi#dcrE1lWyE9a
z6DutO{6}RXT!y!eaL1pLmcRPsi}?*?|HBhe#9Z_=)#J?Sma+M
zUTZErvgLsn@0)m!x77zuNPTEu3EjiM*;($ZgQgKHk6Jypa}d!qO1+uR-;O$S_CuNt
z;}(w+A8m>CeX6gK+EUqyh?h!<=Te6YsDz+1o(dmYJnmo5J6%3=R(R3ML;Zu|*WSut
z)VRAM%~LloiNz|AMlV!TRg7CX+3&C9+Y-0^(oCE-(RHh#X10v&n6
zE6TY^G{VehwzN5=-qPw++{YmX
zSD5e@b({NdO`KEI^zu;OzJ7>$RFp)jXKCSSqg-)!Pany+knGmRsvq!sDp^~KmhKUM
zU&Dy1;QSOrZk@M|Qu{jikWl3lUpxxG*eewlxG+wQv+U;95Kzol5R#eX6~`^%n5SD4
zn<@?L^YY@dkt)%L9~wN0+}yjySU5n!J$xUxf5w*vt1pr>QFqCH@=&B+Xnmo)=Na=C
zMf)?_!a5+>LHDc&x16J|mDShf>?xW4i~JHuGnAq8iK5-_TWp3U-P>9ZMR8U-hF*w=
zN`hj}3O>=&sN%?3`cSMj>O?_o@0nNyLo1L?f&|vva%IC^kM%W{8roV#B-NWGMJ)rU
zlEWL0j{aPg`torvydg(YZl3@9FAZx(=P!W;7~-6hF7XvHa&WWJ%nG?QXg-Lzzhkze
zzoy{naz!aK?{`MqM`!ldW+XKzxt;(gL-)fzgud`an9Zu1jF8SQ)v&qYmSANje4LgC(Pm{_9d(DN1ur8#1kN1LVtx>Cs=}Xg=FouWSDm~+KNU7U&5;Ogm^jbx}o|G0j
zPu}BubIbzt4$^X2%u$O6iPy#@|GsVDG(!+QFUDN);Ff}F>&l}LXF7`glbr-FOI7ZN
zyXGAx*aV}dG!3lLG?0k=7<^*?2D|p3#b1zLiPH9pvCXR7(n|mQ+9Q2l>IM(X(UR4$
zylJGByM(rG-JwhmP3$uCD~w)yyd5k+fGr+ix2i4XD{8u-m6B4MiNhRNqmwBjB?vnk
z6aKIJzSb>!k+J4Mc3WfVuD6b1hcywT+X_>w$GvAa#6Zot@|Yk5o0O2H$lMos;`G=~
zo3pa;ebHI%S3{HPx)NC?oo{?S0e1!
zXsmlGy(^At<{?@ggc|urZ;V@^$zhxr(_#}J)m{zjetu|RTW+0g-`ncb;1vO5$CDQ}
zaWl)nEa){NQ}d8h8)ZR8MPEU3aM$M%pE9m59rMLj5y_~j6-{aNoAt(*Tg<_sAKK#j9|$#3TDzEK`WJSD^r`0jmsdHR=BpTah1{6{$rZ|X81Cvq
zOr1So7VcAAF)iDSk%wC1wLAtv8Xn8r%?i2Md8G625`&bGCr)n^h8Z&uwWQLtfAJf&
zqrF%M7{VV~!m!p^Pg`fqS+=(XHV1*!lcYFo(;XxAS~(*_=rJR^)A1T4hg6C9z-kuA
z;u-@v9Fy!b5GjG+*6Mtud94GIhmixMZw^BqSnWkdZFU=I-nF+~1&$2OBS5qK(=(pV
z(TO>g2IkY^4vY>koZg_Eok2vh?Z1QzW^S$TaAB8<dmfI;-@I&RMb2C19)TPQ{Gm;3T8J3ZAjiPzrF*~n9|9jaiS@*AK_-M`;1p{3+d__sMG5?4d@DJ+
zZ^MDhEjz!?Y>BKa+?`rp+0=goY}Uk{@W*eT&)w&Ig~ue5A_Gpk4gs&%9l$PU&E3wk
zGZ}yE$#+?^(evW>t4Mw1$T`Or&vS~O3%9#+TiWWpVu?C|qjJ#Li5y3_&C!)~*M{Dk
zC><=z2tq(f11TS%PNQhf`~Eq)m3zE+RCh(e@JL$!Ywof?Msn`@wAXZG$aw&e9d1lR
zF$FM3JU*2$SKgLxm*N)C&32qTUz
z9p3b2Ji;;i){GknC2L1Ayb;;yh(w~S3I8R;mB;`yr?r0evZFD6$%{+Gu)5F=yueuk
zgpAL&eM1&WQ4ng9j@q0`ompDw^wa-FS6%PVb|8^rku$Z{dH*wHOl|i7eOR{*%WuW8(dd3|
z>#E-b=t$0FbmTXQHrdQ6IyArWh2z|$`n;36Wyn?6`a1uGGCxSfXN5wvdGe)-U4}&U
zk-S|X&|kCreNoqH)5J3G!+pD47ozt<*e^9cYHVnxd5V?%Xg_Div0ow%ooscPOe)&<
z#<#P&5V_*3I`RCFIi#ZCI1~#WC!?hbn%0&~Zq#>7Ec1^*RZRzZu?l_N^Zy9MT#kQI
zChcn%J3`@-gYG}sZ9~#-R6-5vGm)Coc6z5K
zDV^<<9L%dig%(|qY;d()NfhOmF>;zOW&C>Mm!{swhTPwL5bQ(9ZdM+$f_yJwp~-;>
zK~k<=!yl1?Q+C>hsyM@W@}|Y#zm+#Req--sik|st70cml5YDHkm27Q1rpU>$lEQGL$t@7qU%dvl8v%N&
zKed%&2^pTy)MXU*pi@OIq{lf;AoL%?af#GjkM91i&$hjhck{geZ1KDJU2TEquI|xQ
zaH57LNUCYfYu{lI+#0Sf{IDP%6(`itRR}h}NOBSO
z#8e5#KST|F@5z+4dgFY@Yb6f@hqJ3?sH$JnKx-u;N{gqzQd?gFNlW7=ke93IL{Mga
zkUu-1IMX;eh65JrNLnYD$o+GOGcn#*XR#1hfvw#ca+GDrZ!xEX-GA1C)E&Rqa>J(b
zR8&&I+FsrmSI6Lmv?N<%pszQk8iT`=$8XbyVDbdHk~1jfg_s_GG|?Gqz&M@p(sLTq
z^MlOeYK#+8Go
zrd4zL>TfC)P@XO(eQrwd%B|mK=5TO}J=U+7bf!2kEOf=*opUA|RdH!)Y$%M|ew(VA
zlqyx7>717?<5=B!{sPu=4>1Av&eGi+jRxCQ`?^@}eNCU9!WP&q*(cvipT771)Y7ir
zO66>`jh+21z-hx9@VW^X+gGFeWmJ?#^!Fs`YxOCwn?_tJoNauDTjVKeS94DVuac_B
z*C94?RCoL!)8ixj1Df
z)2A?J23x)T98%?0c5W})_OjQ&kd)#r&r5gb=H8GTJs_IsbE28wz0d2b6Av$CeFPIb
zQtx5g`;%MQ<>?+r(fE)%7^@~5c8NeL*As_I^{}?~;tFB={nP)X`aKHmKlo&~m&jB123S@qwy}%RzVV~oxO)gq;M3OhkJHVz`HV{ZRgoRr-HPKu9y
z%)xHI>1elV=tA0|{R1r@O(E&b`vmK(y-7rpqrA?Ee`{y5k4}Y>frFISUn{9Vj>_Sx
z(FF16kkf;0dLq;mFV5D`5`Rv#%TVjX2HRxkvUE!o3zRFFCq+5v2s*9WkDfmdoX3ljK}u*=^zN)n?e9eVYhBZZ
z+gJZV+t44dgwoS%^ody);4=4k^Ui~C8SfQBDf}Cy)%5V@yUe>%!pl*Kb4`_q3I5fa
z*L-u72T&|V#(QDUcsCZ#$iy;^X3f+`pedD^Wp-KU&L9qz$vI+rP_f
zg74x&QxJ{l6Uei}OTWZvaOd-IS$DWPDv&S$hG^wsR*IYgK;J#Y6Fq|oDk8#_MmFm?
zceo3|b>6gb8g8ir4SKF!b+sS~6SdSSDE#10?M{6QW5<5+d!lO(!VluJ$0-n!-4o|=CzfVS&b@|JN9evAfR32KnXJD;R@k#
zAvk<#jWfG#i*xtx5{IrMBKwZN0|*n9aR}cq?PG|xmx}!=h=QRBrb^5HvLIU8fF&<*
z0)M1unZHtyh!4T*=-)8T8@k5t66)VKF_iQ5+rxu3AC=son=z7JzsNWjOjdkzVfB5l
zOgFm#1I$koQaIrCtxVQ+bm#Mkbx0ZLvBZ!O5%JGb2Sk8BtJz$&J~@HQ6UZ2>QF8Oj
z^XGuXfu{Lvywy+n_FtSIb8D6F5;Ph5C>AwclvK||Z95wLsq1s2}3@z_sU5MHDLZjVc85jIQyKrDXgF`HBSc(kgT)_H=!xkz!YteGm*t%izW3sJ==4TV-SO
zO%*$rIhi3=A?|(spzV}$>eJPZHE(M=9y){+q0GgX9~1c_LfdeLm%|+kdG6B2FLm#5
zjOf-4FOuSF4J|jiiYhc~;i~6|2TVz?PqA8w#oTocD(g6FVrsI<*ntOLK%EH_hiG$*CAfL+DK(-@_W%Au+E|)t*g)}
zi?h^WUUcCT^Y6n$Th=20*X7}_LsT3S#bOhc;j0KYUM**cc7%ID%2AOIGAom2RqfNlYZiO#jNb^V$a`7vW`?%7cz>(7x%moW2<377
z$jO@M{7Q!oJt6oiJ{ix>w=Hzer=HI|)u9${A0Io~?MCDepzGb^E00~etQp{*S$`9R
z2G&ngeS#E@{_dQy-7F%l=$IPwMg>LCMv7Op`%sXAL@W`No+%csxceSkTboZO0xn}=
zovBdx3zBz=>UydlrunHQ^sE1;Rxi94tKMyhcBLurouLL?!H$R`(dQD+`;k%@0q(IhO1uBmE$2G-T&0~?;_pikK>7NqZ
zbo}}Ix!(lciMqY+aB*3JPM0%3Rq~e=8ip24L1d>9-xCoyCak&9yXOTg)Et+V%VYg8
zW4^LN*tww$YLfI(V6fg9yh9|ieAZ(B9Zox!_qB+JCkbkPLeipl?cdMvh`bP3U-Zz{|E
zqeetPKMUWn8W|ta&Z(9YN+`+%W-SuK+~md{h<^Rvi?F+UQ;WHYYG%75t_RM05j*%a
zzcN`QR<|0lN;=)@6iTk>Eo|EY`mt@@nlZW{GvosimXK1xY8FccvK*ux-B@*~Zk-oy
zU7v=>)J@+muRvk`Col`DjehRQ-!9!rjeA&F*{HTl*sGMkLI!pQ_}?LAS-NF~$Fb=8uqg^{BbN0sGyRs|B11|1no`-lzqtkF+|()@zy`EMi&YwpWbV~W
zFBQ3-cSDK`v%To)kzVcNn(n>Y=)eNiwG&5_Yk}I+>%3Tt-|xK|d3Rr*1FZ3G7sA}Z
z4;swbIx9et>xAbsW%wmf8PgI-8#Ag~egSTsjSRK#3Ia3Skwo_B|0XEqRaj8UZ@pfG
z1Vm0+biWCF?&xTY$?qRJrRR|cAoMOAl4;9-rnW31
zaxyjhP*{N95>89XPu4r8sm``4yS%!URp-j
z3-&+YtB8qdOmtj_wTvdV-)ha~n(hzZzZaSvl)!##h(tc-4kWRHtI^$er4^m!B$CC@
z`%OD^fACPr~gz
zZ}@ifJqTK$nYHzxlbyUoa=^mruwaw(6T+e_FJkJg9e<&IM_Vb9mH)jeSf5CoSFER6
z&@iGs{0lFno|JVnq38a}e{u@E?|+Lxl<#uv(@Z3H#Ky8i(7;D56C-5YEp
zaj$JHc&Iy`@A_NTo2DyW?rc280VNc?gm!y&OFIhEX#QK%ur4UO+#7S@0tcfT{Jkq;
z&fLVhmJrmlRc7diJ-ikp7AFL^WC?_&7pUgb80^zf%++_ei4@V`^W`z`EtE(2q9TIj
zv`k>uX1jA|-gKbwWvq=2L^PgwGv(bx$K3b+lSjxFk%$RFp+DphGI-_P*jlNKaXqN||9f>zy-*=@j01w0#zK)~-}@1$
z?B>0U)KbJZ+;3Q_y63QbfZ9bBCkK7&5IPv+Ex(1D>e%HopFQB`YBdjqN?`Ns{njBS
zMtzL%ldC`|r}Pj5F8uv-qg9+;Z=q6V$a4GT_O!o&fx;oly=SvfMA!xI{6ebB6h+}I
z$A1SqSIqgm$#-P6D8xh|J}8O;17V7Q56zBhw^*|@x!S}ZLndYR;qE(@L%(Ggth2*N
z|LCZ7=t4fvDfXPz(*BXG)^7s4fr?j1kCe}!Y;74`X*4Q6-DdcEC^L7?c8|8wQb>FR
zW}rkt{TSrJo_T_oQDy~N?V<73@6YXZy{#@uwnDINDY6LjA^~{ux_ZQ5PHk`c-nV|s
z?~*8D{@t(?mcXa4k6mgoup&KL%Q}SW9c|RlT~yXDyjbcp|$>g!M1(GKC*89?!t!t;%ck?x+$$<
zP#;n6gd||HiTj+N>;X9z!XsL1DF996|<|8lfmD5tAl`ww|m{19X;AT169LJ|DcD2LebG
zh+eR!$7@+aAwuqZ_4SxfgtCJ+3z?8?+C$0M$P6>4-O7Z!bE|q7;Z_-7%N`+EddC=@
zLp=A&|KQv8k7ZSML%LV*>#~`2m1dNh(W|48^>SIBFjMkk?EN?_V{R=h=+9SvFoGl^
zJ}5JadJd~T5~Se85bIp%JBzEW{MAL5q@GyB^i%th^-Ym&u$_sl?+)u`0j1}}4g(wf
zZ=o>IAB|kEs?S8xMGNj;q2c-l4kqiFV5zHFw6-YSEbhBw(uUA4mR%9n)_Gm?rf?l{
z4i|7%hK*+HUZGo)_{!fbz1(R59$*sK9{mt@fy<*o@F$Z;)aK;$fY~gxSxIvIyYC?W
zb55pc1`z(ol4Rvnk%ZIR_PiO!i|9mtoqngZtW;C1EpPeTazzl{;=W&UF=g0Qocl{E
zw&58wcVP)wjv=@py3O-zAOY`O4~Nvm^%M^~U$K$D8_nvdfDmi93~Gj>&&FziG-fdJ
zeB%6E$XwlkZ%Q7;H#?mc&p0~r-TdvsYp~;aN;D||8O}$zu>Ad}vdkkvDV@w$%7KC6
z2{PL%T#{mip(xXn#N3-c*;oImYROF7aZVBE-4gzH^Mv6+(D4unz1?o_fYALRSE4t<>-mVT8#%Cy7&ZVfVhyj*42?`OmoLU8-boe46*=3vx1X46AsftAO4c?=echij4`-
zDGyM;aA34U+xl{__jm>f8W6|?zkyO$hyI`YA$_F-KC??>-ZQHF1w=+lGWkk
zli-S?IXDlgJ4_u#d5MHc#rJ;AAbmZ5_rBIgz>HKccMjD<^bvvR!&Jb8n1KplQ)Ffe
zLODsFVVZ95;|_&2Ok~V+wMDTk!-E
zzk1+^6?mJY=I!BNJEjg8p}HM}P%*>@S6pxP?*r#QIw#Wg-sFHGq(ZRSJf3>e(w_gC
zdB&@~^*az%Y!#mfnF!^wYabLb^ldE}-Kz4&SO26Xi+G&~IurRf*$^@NuvlPZ1NaoY
zC6c{j%nwtwdRUJAYmo)eZUXhA;zpwq{&`Q|#wsB`o(3%{jCS;H`=TWT8z4}^rCYhk
z8Xh->3aI#DfUV!?7`17%t`_
zPJ$w_929&e>kfhyP2)6)Zd#Pr|I=ZQ%QVV2{x@f=9I-Ot^iIcNyD%~EN2EfcX(Cs=
zIeA26y5b-bA+HA6+XExUYJ_%YemuhKpB
zgzL3pCG`tw14ta+h@0bkZysT1U*-FIBacLfY_MMUsSWEqTFXN$OG~s4{Zp=z(_Old
zD8yg=vKL1?gbKNdl3RS~6nmGqV<|}j-UMUnppd{5skGxlHuA@`(BD(yT4RzCQvmwS
z9;A2x*>Zdk%0`&pt(kXsw1F*BidtiPKktiy34S|^D{x(KYw3S%)G>8&PDx{!;>Fv-
z;zP^*ceDklO#<5z^!YUxZrd`E3Qy|D24grsmY9sggD6c-n7PU+j)=^l`V;opuGOxd
z4*QqbsffIMDLZ)*C8HDWot|E$_=uN*w_lm(W>%N-fmgR?U^&W+<&=j$Vu#>#C-aGv
z8%4?3wHql6g;Wi1<)MmA-}*I3`;Vpa%s4N!)s*uou(&a^Sxyc9hltNx2Rqp&M^Uaw
zyFdTu#;VLPF?5iGqPa(fpt?Bc4oU;1PhT`LlQJ4XXR;tMwvb|x1fMRB3()T9g^=N)
zp=&Qt{JR4G`>sXK+eC6|O76krf;+6BHE5jTT<+;3g7)A}2cbkS=W}4On(O3JxRVVZ
z<%7vr&627t|CWLYe&9V!-!P9mEdOO_l(LcXDziuVJ?$)x%Remho6pqr?3sf{-*|s&
zorc`*4>c^Pxxi@EjHB-^j_Di0sUW3}1YE=fgyA@1^HM##xP
z@>mnQFi*goT%ZyU?u-Xrpi3^W-EC9lS+6x858I0lS3g7flXVCgh1|11aHrjJ%n@s|
z6=?~jR`J(w!^QU4fU?{-MRspAawMer${`sD$zs^@c|SK+IHcUe?4TR8LP(y3)c0&4
z;#MsP@9gYIT{fVHwq>7HG_cJQ?-`tZ&3`ji2$4i9ZR%x&MnXM5)MyD
zyF*qfeD;e~{Ez?~i@xQEqJ{G9r+1QU7b8eob0aJSmG#s$ywNCfeN*J=d6BISGlL0t
zmm^n+bB@Q@y>P1H+eEM*ULbX-4HYsXbC4niT2D>cHTWclizxok<27@lK~=B2!_@AN
z5p#JV;c1+@9Iget0Wt&2b3kLZLME#G1+&*3SYb479sv$OG+q<$ZM>#q*g_(S(4}vG
zYQ+ImH5g4TexbtQU+?l+WI^Jg#lvjw7c2p;gZa(L
zSBvemh=9B`2i>#r)E<~2dh>vn!?Ww%v5C-@-`T1J`J;E|EU>-Bm~O5kU?mrckfP+{
zP-I1lp^EKK{1K?)Y9m9!6)sCnlt7X#AG!+o1INy;%kT?Kw%d6ut>N=T3y#W)8=KT2
zam>ql;9IMDdSH6eM&=)fiG|QFtZ#@v;E1Hihj;O!@Xvnqo^TAVVJN3>y*3w
zZkzaHSTlG_Yx%`kt=`5j;VU2BO^xCONLiyQHE`x2+UHPr*a|_MHzCYYKb+s}BoG-v
zm~lZZyEQTct`nbxeNiY>@+l7cH(=On)koNst{xxGQC64TsP_cU0B;^OgVymQdcCjW
zwVuIV%&um`J-`Key0_6s1UsT&d?P_TcvIxvk@c^M>lC_N`5x8q*U!ME>bMHg(+7^&
zCigDGI#8Qc%TK_M5cP;f)>5oags=g02m3K^!mFQv+rz<@C|3EaaX+>i-@x%wMc*9QjgeZagWI)5^6Tg(
z6K6n4a=qA2{xVsRWdkgzY%cg^x)f$cKL;^iCf`%zYud;i^>g5*ih)}l>Bmat^y1@{
zt!Wjj9|NTwvytQ}-8Ep@Q`n&A?xDlp5qRr`IUj2q+d6cAKCZG{h+-S_IHo7U9C-9s
zAKM)(F7aX{A)QIsh;eE{qM_ph-zl1LL2LX0DYO!61&lEOxC{PRcFZ5U*4d7&q=&Ho
zG1%n_BgQ7;&H6olX5Yk<_mk~dS!onw!i=_KPH_~jx(=3bgcn%BoQo$Y&!6ySdUl~-
zQ$ZmsufWyJLNb2g7sIC)oE!6I*5oF5ylH-OOGa}nHYAhAr#6mgKaY|PLGH{t$*$&A
z0p@1*lQDP3w8P$}90)%>@7eM`5^3n~+l;Lx#~)wt2q`r)Rbo0KTjZYX+q7oTlN{_R=?e_+l}jkVcjyxK)Y!Bf
zl)>dbL{XDnSdD%FM<9*y!yFOngFPT+#A|Yo(s~-0e|%>Xu|t~^k&ua?M3C-8#OTN0
zsppEq+wSdqZx+${erUe?ZX01pIECDyI
z4qs|-;mL1+^60W{&e8+rbjrDy&$Z>h__PG%gmKuR4dGcuXyE^K%8f}>Qrs^Z&yrmS
zt9gXIlE@8MjU@1mqKYS0gSAx|Z7!!focwSaY618pMC-DnKb3(qJ}sdg$U{-q5{>+`
zQN*D*@lCuylc^~;$op>TAA)aRLDr8BEU<6*avlRCyH-q+9eUn_p*e?u!%LwMq#eSh
zAoBcI^RF6DX0+t7DNF&`_g44l&*^|iULDJ~N62tLkx=G&xg~1pTiPFLa#8j|f6f$L
zb9i}M`E*^Ooj;!EeB$lpysjozGnJdCVTOi@cDcIs+RKYh9nCsA`c>D~HTMrm%aKbJ
zvn-JnI7QuL$R34%vhaCqA=l!X&iUw>9O|Xc!;_5i2XK*)F@8WKXNGh9fs~u&_TSKO
z;30cP-2R7X*a{E-$?e={nH-_B1ZQncuSef~kA-Kvs4jdmSmb@aJxi(6QXC;wKC&p)
zI^m?iO(poki=S53x)s#EP7FN;NKv+L%HlrDp~DFG@H6pRzCrDlA5I*`1}z#@8TUB_
zP5cS>Oc2S>w{PxDjKMx;iet>lhn{#4KLaXK+~bgJgv70UbsZyaAOjh59e$RF_jp56
zg+r^cP1Sy+owN((R*a)3k>0Py4?=OS`<ErB4iLBxK2H_CPo(VLY@FAaH7&bD~={t>R~@{-S8yiM>MTpEO^pH4
z;r(?Cr2^~2XcdY0u(IeHSVi7jvWi>J?Q4Lwp4*VMisC2Xo_{@Dn;M6CceQj|YcYD+
z5p5$`2r%GivZVwTm`d*{7G`=AexT`jdA_}DP3$WSI9<<*@go;4vIjrT7ZrcAJmnf<
zI{l7i{K=SAut?0!@c?~sbEktU7ksC!JQ+vU
zLgirv>UbQ~ZdYTw*57X}3{$>z#!1G@&+s2z9j|r4GF&B?EMZAaxrV&Odi+2LJ?v6y
zcl}ZX;^W}3M89Uv0JkKM6wraw(N@y*LxE{*g(_
z$%l?eJ*>GglrSuB^cw2`)>Uga=km6A6u@raqvOm4tgMQ#1Mr<%5M3JgITK0h5q@br
z{UgkZA4(!pdR*mILJ+YprhY6EGl
zLzcZ4OG`R5sJ#-FN_eZ4IP5bNfuZjaXv>DA;edy-7^}#)ky96Y;!euj066_AzQ7Hr
zikX9+=%ns79cL?$33=Q6J8oxtR)$YnQY^9ullO|&9i++FO)K?f;b+b6HW*})THIwy&(e6o6Z&ODG
z!-@h%`loHC7R1eBBz2aLumoL8?<2@#_yLNe!e^#A_`!pisHxJ-IndaPisHcR^q8GE
zDH%1TLCw7K$cP9TnVUGt33)U$A~+kt+#-*tedI};G=%gYjm&au1=5ybxhmVZwNZf3
z_ac4a^GB%V295NG+3uac@Hu)_Dd0c9V>(jz?c`PEwbLuk&j+G}pmWyq>lj2FyQZv5
z0CIH*B}rPf<+vQKMSykjA2p1Sh5I~7QixxjYWiDS%R^M02Yxk
zEGig_2*FYfSmv;?WDzV&FcuNOk~M}!17i^(SgHXFYHorE%OY4rF_!0mC2I_e2FCK7
zU}*v@3)on;6D*<_%X7f8eGJPMjO97O(gav|*;uw?ER+rwa=&uJ%%@w3Sd^Dwv2>8;
z6^e_`OxpvG#L=L=o5{g4@Y#4|F&cF`m)TNtIFFQ;;=_TD
z<_^NCXOHx|;gLD`Swz|vkaP&x3lBWcC{IXJfv;dx^Md4}APGVucVO0XOv
zAb_QSjb%H*f;EO}2MF}CZiK`XBcaX0VxezyYr8d-=u&z$R$00k8HsUgixOv#93mqg
zWMq?yy&f_waB~Okur)FFVf;)H6?W)Leavs?BAy%`pcRC?Q76dt3itEm*H(E=-KRRBC*<2G34=c51zzyq|S)*{ABhFupC&PD&U3Jh~tqwvhWPD
z@MRno-;r`67!r7tS7XjA?{RDE0Jn=4z6m>?r{o#uWoiw((CdqU#$6d$R?5)?qQI@~
zD{AusK}>r)23bZPH8$PUu=azkz~
zL6bWLGmY9r(42Q`!)Yy`c|Y`(jd7H?2&Ek5dsqhyoqEJ
zFb!J<#Yegs>D|ir^C5x9&x}1n9u3`2%lAX?O^2++PneK)mH578TN-jl*&|-q@P#&D
z`=(8vUO})p7O^3K9ALmQjRqt1iZ2kft{OIgTkEl)m8ZJ3^{_$q5s>=?iRqRGqgiYa
zvi@6t6Oc=8ZFPjrj?ktGW~c5GET(`(l7R5Dv5T>4i@RG}
zBO6Nr8-!rd1}yBg2(z(}wNNU3lL_w?R$_O|E>tN<=6|*vI$F)GFpBnAb|Kqv*p;#m?C84#)>lmH`ybL$;dJ?vTMbe
znpmyNuB!ASQfdvZG7SD6t8+$sQqD
zF$J}cD;s7Ov1WGZLqYA6#x8wG2s{~p|07^cyYF%VMvP+41dvO(ss
zK}hLC(OSOp5RmWVAcQknIv)whUt?HE=>s5MY%B-IKuGCBiELG4gN$Jzr4Q`4-(zF>
zF%Ckoi0I@JkRxMQNa+J0b!;q0$3RHwLp|TRiw!b{g_J%Nt*MrcrFtBMV0o_diGVC0
z!$L|Q010MeIm8BGmp)X#P;kB>a^_Iw;ZB+qyY!(HZ>uy&qBz4ft&QlC11WtdqbO7M
z$Q1SnyY!(V^_HCBhd7gbe-gC%L!OCU`cT^*$goHLWRI{*A4>SaQZmA%8)6G&mp&Bz
z{*ul1$Tqn+*`%>cAId<4jpphY8g}VJ&Aq>xOm=k)jRuKL)ZB+`7_KoGQu=0.6.2"},
+ "dynamodb": aws_common,
# Starting with 7.14.0 python client is checking if it is connected to elasticsearch client. If its not it throws
# UnsupportedProductError
# https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/release-notes.html#rn-7-14-0
@@ -557,6 +558,7 @@ def get_long_description():
"dbt = datahub.ingestion.source.dbt.dbt_core:DBTCoreSource",
"dbt-cloud = datahub.ingestion.source.dbt.dbt_cloud:DBTCloudSource",
"druid = datahub.ingestion.source.sql.druid:DruidSource",
+ "dynamodb = datahub.ingestion.source.dynamodb.dynamodb:DynamoDBSource",
"elasticsearch = datahub.ingestion.source.elastic_search:ElasticsearchSource",
"feast = datahub.ingestion.source.feast:FeastRepositorySource",
"glue = datahub.ingestion.source.aws.glue:GlueSource",
diff --git a/metadata-ingestion/src/datahub/ingestion/source/dynamodb/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/dynamodb/__init__.py
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py b/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py
new file mode 100644
index 00000000000000..6b7c118373673b
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py
@@ -0,0 +1,469 @@
+import logging
+from dataclasses import dataclass, field
+from typing import Any, Counter, Dict, Iterable, List, Optional, Type, Union
+
+import boto3
+import pydantic
+from botocore.client import BaseClient
+from pydantic.fields import Field
+
+from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.source_common import DatasetSourceConfigMixin
+from datahub.emitter.mce_builder import (
+    make_data_platform_urn,
+    make_dataplatform_instance_urn,
+    make_dataset_urn_with_platform_instance,
+)
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.decorators import (
+    SupportStatus,
+    capability,
+    config_class,
+    platform_name,
+    support_status,
+)
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceCapability
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.schema_inference.object import SchemaDescription
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+    StatefulIngestionConfigBase,
+    StatefulStaleMetadataRemovalConfig,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionSourceBase,
+)
+from datahub.metadata.com.linkedin.pegasus2avro.schema import (
+    ArrayTypeClass,
+    BooleanTypeClass,
+    BytesTypeClass,
+    NullTypeClass,
+    NumberTypeClass,
+    RecordTypeClass,
+    SchemaField,
+    SchemaFieldDataType,
+    SchemalessClass,
+    SchemaMetadata,
+    StringTypeClass,
+    UnionTypeClass,
+)
+from datahub.metadata.schema_classes import (
+    DataPlatformInstanceClass,
+    DatasetPropertiesClass,
+)
+
+# Total number of items requested from the table scan used for schema inference
+# (passed as the paginator's "MaxItems").
+MAX_ITEMS_TO_RETRIEVE = 100
+# Number of items requested per scan page (passed as the paginator's "PageSize").
+PAGE_SIZE = 100
+# Maximum number of fields emitted in a table's schema metadata; larger schemas are truncated.
+MAX_SCHEMA_SIZE = 300
+# Maximum number of user-supplied primary keys looked up per table via `include_table_item`.
+MAX_PRIMARY_KEYS_SIZE = 100
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+class DynamoDBConfig(DatasetSourceConfigMixin, StatefulIngestionConfigBase):
+ # Configuration of the DynamoDB source: static AWS credentials, optional per-table item keys
+ # that must be included in schema inference, a table allow/deny filter and stateful ingestion settings.
+ #
+ # TODO: refactor the config to use AwsConnectionConfig and create a method get_dynamodb_client
+ # in the class to provide optional region name input
+ aws_access_key_id: str = Field(description="AWS Access Key ID.")
+ aws_secret_access_key: pydantic.SecretStr = Field(description="AWS Secret Key.")
+
+ # This option lets the user name specific items of a table that are read when the schema is constructed.
+ # The key of this dict is the table name and the value is the list of item primary keys in DynamoDB format;
+ # if the table uses a composite key, each entry should contain both the partition key and the sort key.
+ include_table_item: Optional[Dict[str, List[Dict]]] = Field(
+ default=None,
+ description="[Advanced] The primary keys of items of a table in dynamodb format the user would like to include in schema. "
+ 'Refer "Advanced Configurations" section for more details',
+ )
+
+ # Tables whose name is rejected by this pattern are skipped by the source.
+ table_pattern: AllowDenyPattern = Field(
+ default=AllowDenyPattern.allow_all(),
+ description="regex patterns for tables to filter in ingestion.",
+ )
+ # Custom Stateful Ingestion settings
+ stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
+
+
+@dataclass
+class DynamoDBSourceReport(StaleEntityRemovalSourceReport):
+    """Source report that additionally records the names of entities dropped by filtering."""
+
+    # Names handed to report_dropped, in call order.
+    # The @dataclass decorator is required: without it `field(default_factory=list)` stays a raw
+    # `dataclasses.Field` class attribute and `self.filtered.append` raises AttributeError.
+    # NOTE(review): assumes StaleEntityRemovalSourceReport is itself a dataclass - confirm.
+    filtered: List[str] = field(default_factory=list)
+
+    def report_dropped(self, name: str) -> None:
+        """Record ``name`` as filtered out of this ingestion run."""
+        self.filtered.append(name)
+
+
+# Map DynamoDB attribute type descriptors to the native type names reported as a field's nativeDataType.
+_attribute_type_to_native_type_mapping: Dict[str, str] = {
+ "N": "Numbers",
+ "B": "Bytes",
+ "S": "String",
+ "M": "Map",
+ "L": "List",
+ "SS": "String List",
+ "NS": "Number List",
+ "BS": "Binary Set",
+ "NULL": "Null",
+ # Note: if the attribute type is NULL the attribute value will be true or false.
+ "BOOL": "Boolean",
+ # "mixed" is not a DynamoDB descriptor: schema inference assigns it when one attribute name
+ # was seen with more than one attribute type.
+ "mixed": "mixed",
+}
+# Map DynamoDB attribute type descriptors to DataHub schema field type classes.
+# NULL is mapped to a boolean type because a NULL attribute's value is true or false.
+_attribute_type_to_field_type_mapping: Dict[str, Type] = {
+ "N": NumberTypeClass,
+ "B": BytesTypeClass,
+ "S": StringTypeClass,
+ "M": RecordTypeClass,
+ "L": ArrayTypeClass,
+ "SS": ArrayTypeClass,
+ "NS": ArrayTypeClass,
+ "BS": ArrayTypeClass,
+ "NULL": BooleanTypeClass,
+ "BOOL": BooleanTypeClass,
+ "mixed": UnionTypeClass,
+}
+
+
+@platform_name("DynamoDB", id="dynamodb")
+@config_class(DynamoDBConfig)
+@support_status(SupportStatus.TESTING)
+@capability(
+ SourceCapability.PLATFORM_INSTANCE,
+ "By default, platform_instance will use the AWS account id",
+)
+@capability(
+ SourceCapability.DELETION_DETECTION,
+ "Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
+ supported=True,
+)
+class DynamoDBSource(StatefulIngestionSourceBase):
+ """
+ This plugin extracts the following:
+
+ AWS DynamoDB table names with their region, and infer schema of attribute names and types by scanning
+ the table
+
+ """
+
+ config: DynamoDBConfig
+ report: DynamoDBSourceReport
+ platform: str
+
+ def __init__(self, ctx: PipelineContext, config: DynamoDBConfig, platform: str):
+ # Initialise the stateful-ingestion base class first, then keep the config, a fresh
+ # report and the platform id on the instance.
+ super().__init__(config, ctx)
+ self.config = config
+ self.report = DynamoDBSourceReport()
+ self.platform = platform
+
+ @classmethod
+ def create(cls, config_dict: dict, ctx: PipelineContext) -> "DynamoDBSource":
+ # Factory: parse the raw config dict into a DynamoDBConfig and build the source
+ # with the fixed platform id "dynamodb".
+ config = DynamoDBConfig.parse_obj(config_dict)
+ return cls(ctx, config, "dynamodb")
+
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        """Return the inherited work unit processors followed by the stale-entity removal processor."""
+        processors: List[Optional[MetadataWorkUnitProcessor]] = list(
+            super().get_workunit_processors()
+        )
+        stale_entity_handler = StaleEntityRemovalHandler.create(
+            self, self.config, self.ctx
+        )
+        processors.append(stale_entity_handler.workunit_processor)
+        return processors
+
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        """
+        Yield schema, dataset-properties and platform-instance work units for every allowed table.
+
+        Every region botocore lists for DynamoDB is visited with the configured credentials.
+        A region whose client creation or table listing fails is logged and skipped.
+        """
+        # This is an offline call to get available region names from the botocore library
+        session = boto3.Session()
+        dynamodb_regions = session.get_available_regions("dynamodb")
+        logger.info(f"region names {dynamodb_regions}")
+
+        for region in dynamodb_regions:
+            try:
+                # create a new dynamodb client for each region: a client only lists the tables
+                # of its own region, and list_tables() takes no region-related argument
+                dynamodb_client = boto3.client(
+                    "dynamodb",
+                    region_name=region,
+                    aws_access_key_id=self.config.aws_access_key_id
+                    if self.config.aws_access_key_id
+                    else None,
+                    aws_secret_access_key=self.config.aws_secret_access_key.get_secret_value()
+                    if self.config.aws_secret_access_key
+                    else None,
+                )
+                # A single list_tables call returns at most 100 names, so use the paginator
+                # to collect every page instead of only the first one.
+                table_names: List[str] = []
+                for page in dynamodb_client.get_paginator("list_tables").paginate():
+                    table_names.extend(page["TableNames"])
+            except Exception as ex:
+                # TODO: If regions is config input then this would be self.report.report_warning,
+                # we can create dynamodb client to take aws region or regions as user input
+                logger.info(f"exception happen in region {region}, skipping: {ex}")
+                continue
+            # traverse tables in sorted order so output is consistent
+            for table_name in sorted(table_names):
+                # Filter before describe_table so no API call is spent on a table that is skipped.
+                # TODO: record skipped tables in the source report.
+                if not self.config.table_pattern.allowed(table_name):
+                    continue
+                table_info = dynamodb_client.describe_table(TableName=table_name)[
+                    "Table"
+                ]
+                # The account id is the fifth colon-separated component of the table ARN.
+                account_id = table_info["TableArn"].split(":")[4]
+                platform_instance = self.config.platform_instance or account_id
+                dataset_name = f"{region}.{table_name}"
+                dataset_urn = make_dataset_urn_with_platform_instance(
+                    platform=self.platform,
+                    platform_instance=platform_instance,
+                    name=dataset_name,
+                )
+                dataset_properties = DatasetPropertiesClass(
+                    tags=[],
+                    customProperties={
+                        "table.arn": table_info["TableArn"],
+                        "table.totalItems": str(table_info["ItemCount"]),
+                    },
+                )
+                primary_key_dict = self.extract_primary_key_from_key_schema(table_info)
+                table_schema = self.construct_schema_from_dynamodb(
+                    dynamodb_client, table_name
+                )
+                # construct_schema_metadata may add downsampling notes to dataset_properties,
+                # so it has to run before the properties aspect is emitted below.
+                schema_metadata = self.construct_schema_metadata(
+                    table_name,
+                    dataset_urn,
+                    dataset_properties,
+                    table_schema,
+                    primary_key_dict,
+                )
+
+                yield MetadataChangeProposalWrapper(
+                    entityUrn=dataset_urn,
+                    aspect=schema_metadata,
+                ).as_workunit()
+
+                yield MetadataChangeProposalWrapper(
+                    entityUrn=dataset_urn,
+                    aspect=dataset_properties,
+                ).as_workunit()
+
+                platform_instance_aspect = DataPlatformInstanceClass(
+                    platform=make_data_platform_urn(self.platform),
+                    instance=make_dataplatform_instance_urn(
+                        self.platform, platform_instance
+                    ),
+                )
+
+                yield MetadataChangeProposalWrapper(
+                    entityUrn=dataset_urn,
+                    aspect=platform_instance_aspect,
+                ).as_workunit()
+
+    def construct_schema_from_dynamodb(
+        self,
+        dynamodb_client: BaseClient,
+        table_name: str,
+    ) -> Dict[str, SchemaDescription]:
+        """
+        Infer the schema of ``table_name`` from its items.
+
+        Items explicitly requested through the ``include_table_item`` config are folded in first,
+        followed by the items returned by a paginated scan of the table. The result maps each
+        attribute name to its SchemaDescription.
+        """
+        scan_paginator = dynamodb_client.get_paginator("scan")
+        inferred_schema: Dict[str, SchemaDescription] = {}
+        # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/dynamodb.html#DynamoDB.Paginator.Scan
+        # Note that the behavior of the pagination does not align with the documentation according to
+        # https://stackoverflow.com/questions/39201093/how-to-use-boto3-pagination
+        #
+        # We create a paginator and let the boto3 library handle the pagination automatically, then
+        # iterate through the pages and retrieve the items from each page.
+        #
+        # MaxItems is the total number of items to return and PageSize is the size of each page; both
+        # are assigned the same value here. If MaxItems is more than PageSize then we expect
+        # MaxItems / PageSize pages to be returned by the iterator.
+        self.include_table_item_to_schema(dynamodb_client, table_name, inferred_schema)
+        scan_pages = scan_paginator.paginate(
+            TableName=table_name,
+            PaginationConfig={
+                "MaxItems": MAX_ITEMS_TO_RETRIEVE,
+                "PageSize": PAGE_SIZE,
+            },
+        )
+        # walk the pages and fold every non-empty batch of items into the schema
+        for scan_page in scan_pages:
+            page_items = scan_page["Items"]
+            if not page_items:
+                continue
+            self.construct_schema_from_items(page_items, inferred_schema)
+
+        return inferred_schema
+
+    def include_table_item_to_schema(
+        self,
+        dynamodb_client: Any,
+        table_name: str,
+        schema: Dict[str, SchemaDescription],
+    ) -> None:
+        """
+        Fold the items the user listed for ``table_name`` in ``include_table_item`` into ``schema``.
+
+        Does nothing when the option is unset, the table has no entry, or the entry is empty.
+        At most MAX_PRIMARY_KEYS_SIZE keys are looked up; any further keys are ignored after
+        an info log. ``schema`` is updated in place.
+        """
+        if self.config.include_table_item is None:
+            return
+        primary_key_list = self.config.include_table_item.get(table_name)
+        # Also covers an empty key list, which would otherwise be sent to batch_get_item.
+        if not primary_key_list:
+            return
+        if len(primary_key_list) > MAX_PRIMARY_KEYS_SIZE:
+            logger.info(
+                f"the provided primary keys list size exceeded the max size for table {table_name}, we'll only process the first {MAX_PRIMARY_KEYS_SIZE} items"
+            )
+            primary_key_list = primary_key_list[0:MAX_PRIMARY_KEYS_SIZE]
+        response = dynamodb_client.batch_get_item(
+            RequestItems={table_name: {"Keys": primary_key_list}}
+        ).get("Responses", None)
+        if response is None:
+            logger.error(
+                f"failed to retrieve item from table {table_name} by the given key {primary_key_list}"
+            )
+            return
+        items = response.get(table_name)
+        if not items:
+            # No entry for this table (or no key matched an item): nothing to add, and
+            # construct_schema_from_items must not be handed None.
+            return
+
+        self.construct_schema_from_items(items, schema)
+
+    def construct_schema_from_items(
+        self, items: List[Dict[str, Dict]], schema: Dict[str, SchemaDescription]
+    ) -> None:
+        """
+        Update ``schema`` in place with the attributes found in ``items``.
+
+        https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/dynamodb.html#DynamoDB.Client.scan
+        Each item is a dict whose key is the attribute name and whose value is a one-entry dict
+        mapping the attribute's data type to its data. ``schema`` is keyed by attribute name; every
+        entry tracks how often the attribute was seen and with which data types. An attribute seen
+        with more than one data type gets the type "mixed".
+        """
+        for document in items:
+            for key, value in document.items():
+                # skip attributes without a type descriptor (None or an empty dict)
+                if not value:
+                    continue
+                # the single key of the value dict is the data type
+                data_type = next(iter(value))
+                if key not in schema:
+                    schema[key] = {
+                        # Count the type name as one element. Counter(data_type) would count the
+                        # characters of the string instead, e.g. "SS" -> {"S": 2}.
+                        "types": Counter({data_type: 1}),
+                        "count": 1,
+                        # It seems we don't have collapsed field name so we are using attribute name here
+                        "delimited_name": key,
+                        "type": data_type,
+                        "nullable": False,
+                    }
+                else:
+                    # update the type count
+                    schema[key]["types"].update({data_type: 1})
+                    schema[key]["count"] += 1
+                    # if we found an attribute name with different attribute type, we consider this attribute type as "mixed"
+                    if len(schema[key]["types"]) > 1:
+                        schema[key]["type"] = "mixed"
+
+    def construct_schema_metadata(
+        self,
+        table_name: str,
+        dataset_urn: str,
+        dataset_properties: DatasetPropertiesClass,
+        schema: Dict[str, SchemaDescription],
+        primary_key_dict: Dict[str, str],
+    ) -> SchemaMetadata:
+        """
+        Build the SchemaMetadata aspect of a table from its inferred schema.
+
+        When the schema has more than MAX_SCHEMA_SIZE attributes, only the MAX_SCHEMA_SIZE most
+        frequently seen attributes are kept, a warning is reported and the downsampling is noted
+        in ``dataset_properties.customProperties`` (mutated in place). The emitted fields are
+        always sorted by attribute name. Attributes named in ``primary_key_dict`` are described as
+        "Partition Key" (key type "HASH") or "Sort Key" and marked non-nullable.
+        """
+
+        canonical_schema: List[SchemaField] = []
+        table_fields = list(schema.values())
+        schema_size = len(table_fields)
+
+        if schema_size > MAX_SCHEMA_SIZE:
+            # downsample the schema, using frequency as the sort key
+            self.report.report_warning(
+                key=dataset_urn,
+                reason=f"Downsampling the table schema because MAX_SCHEMA_SIZE threshold is {MAX_SCHEMA_SIZE}",
+            )
+            # Add this information to the custom properties so user can know they are looking at down sampled schema
+            dataset_properties.customProperties["schema.downsampled"] = "True"
+            dataset_properties.customProperties["schema.totalFields"] = f"{schema_size}"
+            # keep the most frequent attributes; ties are broken by name so the selection is deterministic
+            table_fields = sorted(
+                table_fields,
+                key=lambda x: (-x["count"], x["delimited_name"]),
+            )[0:MAX_SCHEMA_SIZE]
+        # append each schema field (sort so output is consistent)
+        for schema_field in sorted(
+            table_fields,
+            key=lambda x: x["delimited_name"],
+        ):
+            field_path = schema_field["delimited_name"]
+            native_data_type = self.get_native_type(schema_field["type"], table_name)
+            field_type = self.get_field_type(schema_field["type"], table_name)
+            description = None
+            nullable = True
+            if field_path in primary_key_dict:
+                description = (
+                    "Partition Key"
+                    if primary_key_dict.get(field_path) == "HASH"
+                    else "Sort Key"
+                )
+                # primary key should not be nullable
+                nullable = False
+
+            canonical_schema.append(
+                SchemaField(
+                    fieldPath=field_path,
+                    nativeDataType=native_data_type,
+                    type=field_type,
+                    description=description,
+                    nullable=nullable,
+                    recursive=False,
+                )
+            )
+
+        # create schema metadata object for table
+        schema_metadata = SchemaMetadata(
+            schemaName=table_name,
+            platform=f"urn:li:dataPlatform:{self.platform}",
+            version=0,
+            hash="",
+            platformSchema=SchemalessClass(),
+            fields=canonical_schema,
+        )
+        return schema_metadata
+
+    def extract_primary_key_from_key_schema(
+        self, table_info: Dict[str, Any]
+    ) -> Dict[str, str]:
+        """Map each attribute name in the table's "KeySchema" to its key type."""
+        key_schema = table_info.get("KeySchema")
+        assert isinstance(key_schema, List)
+        return {
+            key_entry.get("AttributeName"): key_entry.get("KeyType")
+            for key_entry in key_schema
+        }
+
+    def get_native_type(self, attribute_type: Union[type, str], table_name: str) -> str:
+        """
+        Return the native type name for a DynamoDB attribute type descriptor.
+
+        An unmapped descriptor is reported as a warning against ``table_name`` and returned
+        unchanged, so an unknown type degrades the field instead of aborting the ingestion.
+        """
+        assert isinstance(attribute_type, str)
+        type_string: Optional[str] = _attribute_type_to_native_type_mapping.get(
+            attribute_type
+        )
+        if type_string is None:
+            self.report.report_warning(
+                table_name, f"unable to map type {attribute_type} to native data type"
+            )
+            # Indexing the mapping here would raise KeyError for the very key that was
+            # just found missing; fall back to the raw descriptor instead.
+            return attribute_type
+        return type_string
+
+    def get_field_type(
+        self, attribute_type: Union[type, str], table_name: str
+    ) -> SchemaFieldDataType:
+        """
+        Return the schema field data type for a DynamoDB attribute type descriptor.
+
+        An unmapped descriptor is reported as a warning against ``table_name`` and falls back to
+        NullTypeClass.
+        """
+        assert isinstance(attribute_type, str)
+        mapped_class: Optional[type] = _attribute_type_to_field_type_mapping.get(
+            attribute_type
+        )
+        if mapped_class is not None:
+            return SchemaFieldDataType(type=mapped_class())
+
+        self.report.report_warning(
+            table_name,
+            f"unable to map type {attribute_type} to metadata schema field type",
+        )
+        return SchemaFieldDataType(type=NullTypeClass())
+
+ def get_report(self) -> DynamoDBSourceReport:
+ # Return the report object created in __init__.
+ return self.report
diff --git a/metadata-ingestion/tests/integration/dynamodb/dynamodb_default_platform_instance_mces_golden.json b/metadata-ingestion/tests/integration/dynamodb/dynamodb_default_platform_instance_mces_golden.json
new file mode 100644
index 00000000000000..f3d6c9809f5d2c
--- /dev/null
+++ b/metadata-ingestion/tests/integration/dynamodb/dynamodb_default_platform_instance_mces_golden.json
@@ -0,0 +1,132 @@
+[
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dynamodb,123456789012.us-west-2.Location,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "schemaMetadata",
+ "aspect": {
+ "json": {
+ "schemaName": "Location",
+ "platform": "urn:li:dataPlatform:dynamodb",
+ "version": 0,
+ "created": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ },
+ "lastModified": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ },
+ "hash": "",
+ "platformSchema": {
+ "com.linkedin.schema.Schemaless": {}
+ },
+ "fields": [
+ {
+ "fieldPath": "address",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "String",
+ "recursive": false,
+ "isPartOfKey": false
+ },
+ {
+ "fieldPath": "city",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "String",
+ "recursive": false,
+ "isPartOfKey": false
+ },
+ {
+ "fieldPath": "partitionKey",
+ "nullable": false,
+ "description": "Partition Key",
+ "type": {
+ "type": {
+ "com.linkedin.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "String",
+ "recursive": false,
+ "isPartOfKey": false
+ },
+ {
+ "fieldPath": "zip",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "Numbers",
+ "recursive": false,
+ "isPartOfKey": false
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1693396800000,
+ "runId": "dynamodb-test"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dynamodb,123456789012.us-west-2.Location,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "datasetProperties",
+ "aspect": {
+ "json": {
+ "customProperties": {
+ "table.arn": "arn:aws:dynamodb:us-west-2:123456789012:table/Location",
+ "table.totalItems": "1"
+ },
+ "tags": []
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1693396800000,
+ "runId": "dynamodb-test"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dynamodb,123456789012.us-west-2.Location,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:dynamodb",
+ "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:dynamodb,123456789012)"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1693396800000,
+ "runId": "dynamodb-test"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dynamodb,123456789012.us-west-2.Location,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1693396800000,
+ "runId": "dynamodb-test"
+ }
+}
+]
\ No newline at end of file
diff --git a/metadata-ingestion/tests/integration/dynamodb/dynamodb_platform_instance_mces_golden.json b/metadata-ingestion/tests/integration/dynamodb/dynamodb_platform_instance_mces_golden.json
new file mode 100644
index 00000000000000..b1176b1fd5786d
--- /dev/null
+++ b/metadata-ingestion/tests/integration/dynamodb/dynamodb_platform_instance_mces_golden.json
@@ -0,0 +1,132 @@
+[
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dynamodb,dynamodb_test.us-west-2.Location,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "schemaMetadata",
+ "aspect": {
+ "json": {
+ "schemaName": "Location",
+ "platform": "urn:li:dataPlatform:dynamodb",
+ "version": 0,
+ "created": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ },
+ "lastModified": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ },
+ "hash": "",
+ "platformSchema": {
+ "com.linkedin.schema.Schemaless": {}
+ },
+ "fields": [
+ {
+ "fieldPath": "address",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "String",
+ "recursive": false,
+ "isPartOfKey": false
+ },
+ {
+ "fieldPath": "city",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "String",
+ "recursive": false,
+ "isPartOfKey": false
+ },
+ {
+ "fieldPath": "partitionKey",
+ "nullable": false,
+ "description": "Partition Key",
+ "type": {
+ "type": {
+ "com.linkedin.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "String",
+ "recursive": false,
+ "isPartOfKey": false
+ },
+ {
+ "fieldPath": "zip",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "Numbers",
+ "recursive": false,
+ "isPartOfKey": false
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1693396800000,
+ "runId": "dynamodb-test"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dynamodb,dynamodb_test.us-west-2.Location,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "datasetProperties",
+ "aspect": {
+ "json": {
+ "customProperties": {
+ "table.arn": "arn:aws:dynamodb:us-west-2:123456789012:table/Location",
+ "table.totalItems": "1"
+ },
+ "tags": []
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1693396800000,
+ "runId": "dynamodb-test"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dynamodb,dynamodb_test.us-west-2.Location,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:dynamodb",
+ "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:dynamodb,dynamodb_test)"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1693396800000,
+ "runId": "dynamodb-test"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dynamodb,dynamodb_test.us-west-2.Location,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1693396800000,
+ "runId": "dynamodb-test"
+ }
+}
+]
\ No newline at end of file
diff --git a/metadata-ingestion/tests/integration/dynamodb/test_dynamodb.py b/metadata-ingestion/tests/integration/dynamodb/test_dynamodb.py
new file mode 100644
index 00000000000000..ef2446ddd8d627
--- /dev/null
+++ b/metadata-ingestion/tests/integration/dynamodb/test_dynamodb.py
@@ -0,0 +1,95 @@
+import pathlib
+
+import boto3
+import pytest
+from freezegun import freeze_time
+from moto import mock_dynamodb
+
+from datahub.ingestion.run.pipeline import Pipeline
+from tests.test_helpers import mce_helpers
+
+test_resources_dir = pathlib.Path(__file__).parent
+FROZEN_TIME = "2023-08-30 12:00:00"
+
+
+@freeze_time(FROZEN_TIME)
+@mock_dynamodb
+@pytest.mark.integration
+def test_dynamodb(pytestconfig, tmp_path, mock_time):
+ boto3.setup_default_session()
+ client = boto3.client("dynamodb", region_name="us-west-2")
+ client.create_table(
+ TableName="Location",
+ KeySchema=[
+ {"AttributeName": "partitionKey", "KeyType": "HASH"},
+ ],
+ AttributeDefinitions=[
+ {"AttributeName": "partitionKey", "AttributeType": "S"},
+ ],
+ ProvisionedThroughput={"ReadCapacityUnits": 10, "WriteCapacityUnits": 10},
+ )
+ client.put_item(
+ TableName="Location",
+ Item={
+ "partitionKey": {"S": "1"},
+ "city": {"S": "San Francisco"},
+ "address": {"S": "1st Market st"},
+ "zip": {"N": "94000"},
+ },
+ )
+
+ pipeline_default_platform_instance = Pipeline.create(
+ {
+ "run_id": "dynamodb-test",
+ "source": {
+ "type": "dynamodb",
+ "config": {
+ "aws_access_key_id": "test",
+ "aws_secret_access_key": "test",
+ },
+ },
+ "sink": {
+ "type": "file",
+ "config": {
+ "filename": f"{tmp_path}/dynamodb_default_platform_instance_mces.json",
+ },
+ },
+ }
+ )
+ pipeline_default_platform_instance.run()
+ pipeline_default_platform_instance.raise_from_status()
+ mce_helpers.check_golden_file(
+ pytestconfig,
+ output_path=f"{tmp_path}/dynamodb_default_platform_instance_mces.json",
+ golden_path=test_resources_dir
+ / "dynamodb_default_platform_instance_mces_golden.json",
+ ignore_paths=mce_helpers.IGNORE_PATH_TIMESTAMPS,
+ )
+
+ pipeline_with_platform_instance = Pipeline.create(
+ {
+ "run_id": "dynamodb-test",
+ "source": {
+ "type": "dynamodb",
+ "config": {
+ "platform_instance": "dynamodb_test",
+ "aws_access_key_id": "test",
+ "aws_secret_access_key": "test",
+ },
+ },
+ "sink": {
+ "type": "file",
+ "config": {
+ "filename": f"{tmp_path}/dynamodb_platform_instance_mces.json",
+ },
+ },
+ }
+ )
+ pipeline_with_platform_instance.run()
+ pipeline_with_platform_instance.raise_from_status()
+ mce_helpers.check_golden_file(
+ pytestconfig,
+ output_path=f"{tmp_path}/dynamodb_platform_instance_mces.json",
+ golden_path=test_resources_dir / "dynamodb_platform_instance_mces_golden.json",
+ ignore_paths=mce_helpers.IGNORE_PATH_TIMESTAMPS,
+ )
diff --git a/metadata-service/war/src/main/resources/boot/data_platforms.json b/metadata-service/war/src/main/resources/boot/data_platforms.json
index 2abe81d93236c7..7a7cec60aa25f0 100644
--- a/metadata-service/war/src/main/resources/boot/data_platforms.json
+++ b/metadata-service/war/src/main/resources/boot/data_platforms.json
@@ -544,5 +544,15 @@
"type": "FILE_SYSTEM",
"logoUrl": "/assets/platforms/gcslogo.svg"
}
+ },
+ {
+ "urn": "urn:li:dataPlatform:dynamodb",
+ "aspect": {
+ "datasetNameDelimiter": ".",
+ "name": "dynamodb",
+ "displayName": "DynamoDB",
+ "type": "KEY_VALUE_STORE",
+ "logoUrl": "/assets/platforms/dynamodblogo.png"
+ }
}
]
From 99d7eb756c09a3313a4c1bda6f96a0953004b58c Mon Sep 17 00:00:00 2001
From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com>
Date: Sat, 16 Sep 2023 02:06:04 +0530
Subject: [PATCH 25/65] feat(ingest/bigquery): support bigquery profiling with
sampling (#8794)
---
.../ingestion/source/ge_data_profiler.py | 222 ++++++++++++------
.../ingestion/source/ge_profiling_config.py | 20 +-
2 files changed, 162 insertions(+), 80 deletions(-)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
index 4394d108486be8..01e083d566168d 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
@@ -616,6 +616,9 @@ def generate_dataset_profile( # noqa: C901 (complexity)
logger.debug(f"profiling {self.dataset_name}: flushing stage 1 queries")
self.query_combiner.flush()
+ if self.config.use_sampling and not self.config.limit:
+ self.update_dataset_batch_use_sampling(profile)
+
columns_profiling_queue: List[_SingleColumnSpec] = []
if columns_to_profile:
for column in all_columns:
@@ -737,6 +740,61 @@ def generate_dataset_profile( # noqa: C901 (complexity)
self.query_combiner.flush()
return profile
+ def update_dataset_batch_use_sampling(self, profile: DatasetProfileClass) -> None:
+ if (
+ self.dataset.engine.dialect.name.lower() == BIGQUERY
+ and profile.rowCount
+ and profile.rowCount > self.config.sample_size
+ ):
+ """
+ According to BigQuery Sampling Docs(https://cloud.google.com/bigquery/docs/table-sampling),
+ BigQuery does not cache the results of a query that includes a TABLESAMPLE clause and the
+ query may return different results every time. Calculating different column level metrics
+ on different sampling results is possible however each query execution would incur the cost
+ of reading data from storage. Also, using different table samples may create non-coherent
+ representation of column level metrics, for example, minimum value of a column in one sample
+ can be greater than maximum value of the column in another sample.
+
+ It is observed that for a simple select * query with TABLESAMPLE, results are cached and
+ stored in temporary table. This can be (ab)used and all column level profiling calculations
+ can be performed against it.
+
+ Risks:
+ 1. All the risks mentioned in notes of `create_bigquery_temp_table` are also
+ applicable here.
+ 2. TABLESAMPLE query may read entire table for small tables that are written
+ as single data block. This may incorrectly label datasetProfile's partition as
+ "SAMPLE", although profile is for entire table.
+ 3. Table Sampling in BigQuery is a Pre-GA (Preview) feature.
+ """
+ sample_pc = 100 * self.config.sample_size / profile.rowCount
+ sql = (
+ f"SELECT * FROM {str(self.dataset._table)} "
+ + f"TABLESAMPLE SYSTEM ({sample_pc:.3f} percent)"
+ )
+ temp_table_name = create_bigquery_temp_table(
+ self,
+ sql,
+ self.dataset_name,
+ self.dataset.engine.engine.raw_connection(),
+ )
+ if temp_table_name:
+ self.dataset._table = sa.text(temp_table_name)
+ logger.debug(f"Setting table name to be {self.dataset._table}")
+
+ if (
+ profile.partitionSpec
+ and profile.partitionSpec.type == PartitionTypeClass.FULL_TABLE
+ ):
+ profile.partitionSpec = PartitionSpecClass(
+ type=PartitionTypeClass.QUERY, partition="SAMPLE"
+ )
+ elif (
+ profile.partitionSpec
+ and profile.partitionSpec.type == PartitionTypeClass.PARTITION
+ ):
+ profile.partitionSpec.partition += " SAMPLE"
+
@dataclasses.dataclass
class GEContext:
@@ -961,84 +1019,18 @@ def _generate_single_profile(
if platform == BIGQUERY and (
custom_sql or self.config.limit or self.config.offset
):
- # On BigQuery, we need to bypass GE's mechanism for creating temporary tables because
- # it requires create/delete table permissions.
- import google.cloud.bigquery.job.query
- from google.cloud.bigquery.dbapi.cursor import Cursor as BigQueryCursor
-
- raw_connection = self.base_engine.raw_connection()
- try:
- cursor: "BigQueryCursor" = cast(
- "BigQueryCursor", raw_connection.cursor()
- )
- if custom_sql is not None:
- # Note that limit and offset are not supported for custom SQL.
- # Presence of custom SQL represents that the bigquery table
- # is either partitioned or sharded
- bq_sql = custom_sql
- else:
- bq_sql = f"SELECT * FROM `{table}`"
- if self.config.limit:
- bq_sql += f" LIMIT {self.config.limit}"
- if self.config.offset:
- bq_sql += f" OFFSET {self.config.offset}"
- try:
- cursor.execute(bq_sql)
- except Exception as e:
- if not self.config.catch_exceptions:
- raise e
- logger.exception(
- f"Encountered exception while profiling {pretty_name}"
- )
- self.report.report_warning(
- pretty_name,
- f"Profiling exception {e} when running custom sql {bq_sql}",
- )
- return None
-
- # Great Expectations batch v2 API, which is the one we're using, requires
- # a concrete table name against which profiling is executed. Normally, GE
- # creates a table with an expiry time of 24 hours. However, we don't want the
- # temporary tables to stick around that long, so we'd also have to delete them
- # ourselves. As such, the profiler required create and delete table permissions
- # on BigQuery.
- #
- # It turns out that we can (ab)use the BigQuery cached results feature
- # to avoid creating temporary tables ourselves. For almost all queries, BigQuery
- # will store the results in a temporary, cached results table when an explicit
- # destination table is not provided. These tables are pretty easy to identify
- # because they live in "anonymous datasets" and have a name that looks like
- # "project-id._d60e97aec7f471046a960419adb6d44e98300db7.anon10774d0ea85fd20fe9671456c5c53d5f1b85e1b17bedb232dfce91661a219ee3"
- # These tables are per-user and per-project, so there's no risk of permissions escalation.
- # As per the docs, the cached results tables typically have a lifetime of 24 hours,
- # which should be plenty for our purposes.
- # See https://cloud.google.com/bigquery/docs/cached-results for more details.
- #
- # The code below extracts the name of the cached results table from the query job
- # and points GE to that table for profiling.
- #
- # Risks:
- # 1. If the query results are larger than the maximum response size, BigQuery will
- # not cache the results. According to the docs https://cloud.google.com/bigquery/quotas,
- # the maximum response size is 10 GB compressed.
- # 2. The cache lifetime of 24 hours is "best-effort" and hence not guaranteed.
- # 3. Tables with column-level security may not be cached, and tables with row-level
- # security will not be cached.
- # 4. BigQuery "discourages" using cached results directly, but notes that
- # the current semantics do allow it.
- #
- # The better long-term solution would be to use a subquery avoid this whole
- # temporary table dance. However, that would require either a) upgrading to
- # use GE's batch v3 API or b) bypassing GE altogether.
-
- query_job: Optional[
- "google.cloud.bigquery.job.query.QueryJob"
- ] = cursor._query_job
- assert query_job
- temp_destination_table = query_job.destination
- bigquery_temp_table = f"{temp_destination_table.project}.{temp_destination_table.dataset_id}.{temp_destination_table.table_id}"
- finally:
- raw_connection.close()
+ if custom_sql is not None:
+ # Note that limit and offset are not supported for custom SQL.
+ bq_sql = custom_sql
+ else:
+ bq_sql = f"SELECT * FROM `{table}`"
+ if self.config.limit:
+ bq_sql += f" LIMIT {self.config.limit}"
+ if self.config.offset:
+ bq_sql += f" OFFSET {self.config.offset}"
+ bigquery_temp_table = create_bigquery_temp_table(
+ self, bq_sql, pretty_name, self.base_engine.raw_connection()
+ )
if platform == BIGQUERY:
if bigquery_temp_table:
@@ -1128,6 +1120,7 @@ def _get_ge_dataset(
**batch_kwargs,
},
)
+
if platform == BIGQUERY:
# This is done as GE makes the name as DATASET.TABLE
# but we want it to be PROJECT.DATASET.TABLE instead for multi-project setups
@@ -1153,3 +1146,76 @@ def _get_column_types_to_ignore(dialect_name: str) -> List[str]:
return ["JSON"]
return []
+
+
+def create_bigquery_temp_table(
+ instance: Union[DatahubGEProfiler, _SingleDatasetProfiler],
+ bq_sql: str,
+ table_pretty_name: str,
+ raw_connection: Any,
+) -> Optional[str]:
+ # On BigQuery, we need to bypass GE's mechanism for creating temporary tables because
+ # it requires create/delete table permissions.
+ import google.cloud.bigquery.job.query
+ from google.cloud.bigquery.dbapi.cursor import Cursor as BigQueryCursor
+
+ try:
+ cursor: "BigQueryCursor" = cast("BigQueryCursor", raw_connection.cursor())
+ try:
+ cursor.execute(bq_sql)
+ except Exception as e:
+ if not instance.config.catch_exceptions:
+ raise e
+ logger.exception(
+ f"Encountered exception while profiling {table_pretty_name}"
+ )
+ instance.report.report_warning(
+ table_pretty_name,
+ f"Profiling exception {e} when running custom sql {bq_sql}",
+ )
+ return None
+
+ # Great Expectations batch v2 API, which is the one we're using, requires
+ # a concrete table name against which profiling is executed. Normally, GE
+ # creates a table with an expiry time of 24 hours. However, we don't want the
+ # temporary tables to stick around that long, so we'd also have to delete them
+ # ourselves. As such, the profiler required create and delete table permissions
+ # on BigQuery.
+ #
+ # It turns out that we can (ab)use the BigQuery cached results feature
+ # to avoid creating temporary tables ourselves. For almost all queries, BigQuery
+ # will store the results in a temporary, cached results table when an explicit
+ # destination table is not provided. These tables are pretty easy to identify
+ # because they live in "anonymous datasets" and have a name that looks like
+ # "project-id._d60e97aec7f471046a960419adb6d44e98300db7.anon10774d0ea85fd20fe9671456c5c53d5f1b85e1b17bedb232dfce91661a219ee3"
+ # These tables are per-user and per-project, so there's no risk of permissions escalation.
+ # As per the docs, the cached results tables typically have a lifetime of 24 hours,
+ # which should be plenty for our purposes.
+ # See https://cloud.google.com/bigquery/docs/cached-results for more details.
+ #
+ # The code below extracts the name of the cached results table from the query job
+ # and points GE to that table for profiling.
+ #
+ # Risks:
+ # 1. If the query results are larger than the maximum response size, BigQuery will
+ # not cache the results. According to the docs https://cloud.google.com/bigquery/quotas,
+ # the maximum response size is 10 GB compressed.
+ # 2. The cache lifetime of 24 hours is "best-effort" and hence not guaranteed.
+ # 3. Tables with column-level security may not be cached, and tables with row-level
+ # security will not be cached.
+ # 4. BigQuery "discourages" using cached results directly, but notes that
+ # the current semantics do allow it.
+ #
+ # The better long-term solution would be to use a subquery avoid this whole
+ # temporary table dance. However, that would require either a) upgrading to
+ # use GE's batch v3 API or b) bypassing GE altogether.
+
+ query_job: Optional[
+ "google.cloud.bigquery.job.query.QueryJob"
+ ] = cursor._query_job
+ assert query_job
+ temp_destination_table = query_job.destination
+ bigquery_temp_table = f"{temp_destination_table.project}.{temp_destination_table.dataset_id}.{temp_destination_table.table_id}"
+ return bigquery_temp_table
+ finally:
+ raw_connection.close()
diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py
index 1488b55062b684..77761c529ba0b1 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py
@@ -145,10 +145,26 @@ class GEProfilingConfig(ConfigModel):
# Hidden option - used for debugging purposes.
catch_exceptions: bool = Field(default=True, description="")
- partition_profiling_enabled: bool = Field(default=True, description="")
+ partition_profiling_enabled: bool = Field(
+ default=True,
+ description="Whether to profile partitioned tables. Only BigQuery supports this. "
+ "If enabled, latest partition data is used for profiling.",
+ )
partition_datetime: Optional[datetime.datetime] = Field(
default=None,
- description="For partitioned datasets profile only the partition which matches the datetime or profile the latest one if not set. Only Bigquery supports this.",
+ description="If specified, profile only the partition which matches this datetime. "
+ "If not specified, profile the latest partition. Only Bigquery supports this.",
+ )
+ use_sampling: bool = Field(
+ default=True,
+ description="Whether to profile column level stats on sample of table. Only BigQuery supports this. "
+ "If enabled, profiling is done on rows sampled from table. Sampling is not done for smaller tables. ",
+ )
+
+ sample_size: int = Field(
+ default=1000,
+ description="Number of rows to be sampled from table for column level profiling."
+ "Applicable only if `use_sampling` is set to True.",
)
@pydantic.root_validator(pre=True)
From 5882fe407535b2362dcfcda7c1e123e6067d7e89 Mon Sep 17 00:00:00 2001
From: Kos Korchak <97058061+kkorchak@users.noreply.github.com>
Date: Mon, 18 Sep 2023 16:14:02 -0400
Subject: [PATCH 26/65] Fix for edit_documentation and glossary_navigation
cypress tests (#8838)
---
.../cypress/e2e/glossary/glossary_navigation.js | 6 ++----
.../cypress/e2e/mutations/edit_documentation.js | 13 +++++++------
2 files changed, 9 insertions(+), 10 deletions(-)
diff --git a/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js b/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js
index cd5622d0cd903a..de9fa7ecda1f0b 100644
--- a/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js
+++ b/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js
@@ -28,8 +28,6 @@ describe("glossary sidebar navigation test", () => {
//ensure the new term is under the parent term group in the navigation sidebar
cy.get('*[class^="GlossaryBrowser"]').contains(glossaryTermGroup).click();
cy.get('*[class^="GlossaryEntitiesList"]').contains(glossaryTerm).should("be.visible");
- cy.get('*[class^="GlossaryBrowser"] [aria-label="down"]').click().wait(1000);
- cy.get('*[class^="GlossaryBrowser"]').contains(glossaryTerm).should("not.exist");
//move a term group from the root level to be under a parent term group
cy.goToGlossaryList();
cy.clickOptionWithText(glossaryTermGroup);
@@ -41,8 +39,8 @@ describe("glossary sidebar navigation test", () => {
cy.get("button").contains("Move").click();
cy.waitTextVisible("Moved Term Group!");
//ensure it is no longer on the sidebar navigator at the top level but shows up under the new parent
- cy.get('*[class^="GlossaryBrowser"] [aria-label="down"]').click().wait(1000);
- cy.get('*[class^="GlossaryBrowser"]').contains(glossaryTermGroup).should("not.exist");
+ cy.get('*[class^="GlossaryBrowser"]').contains(glossaryParentGroup).click();
+ cy.get('*[class^="GlossaryEntitiesList"]').contains(glossaryTermGroup).should("be.visible");
//delete a term group
cy.goToGlossaryList();
cy.clickOptionWithText(glossaryParentGroup);
diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js b/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js
index e4e5a39ce1100d..83b66e2cb2549d 100644
--- a/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js
+++ b/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js
@@ -37,8 +37,8 @@ describe("edit documentation and link to dataset", () => {
cy.contains("Sample doc").trigger("mouseover", { force: true });
cy.get('[data-icon="delete"]').click();
cy.waitTextVisible("Link Removed");
- cy.get("button").contains("Add Link").click();
- cy.get("#addLinkForm_url").type(wrong_url);
+ cy.get("button").contains("Add Link").click().wait(1000);
+ cy.get('[role="dialog"] #addLinkForm_url').type(wrong_url);
cy.waitTextVisible("This field must be a valid url.");
cy.focused().clear();
cy.waitTextVisible("A URL is required.");
@@ -54,9 +54,9 @@ describe("edit documentation and link to dataset", () => {
it("open test domain page, remove and add dataset link", () => {
cy.loginWithCredentials();
cy.visit("/domain/urn:li:domain:marketing/Entities");
- cy.get("[role='tab']").contains("Documentation").click();
- cy.get("button").contains("Add Link").click();
- cy.get("#addLinkForm_url").type(wrong_url);
+ cy.waitTextVisible("SampleCypressKafkaDataset");
+ cy.get("button").contains("Add Link").click().wait(1000);
+ cy.get('[role="dialog"] #addLinkForm_url').type(wrong_url);
cy.waitTextVisible("This field must be a valid url.");
cy.focused().clear();
cy.waitTextVisible("A URL is required.");
@@ -66,6 +66,7 @@ describe("edit documentation and link to dataset", () => {
cy.get('[role="dialog"] button').contains("Add").click();
cy.waitTextVisible("Link Added");
cy.get("[role='tab']").contains("Documentation").click();
+ cy.waitTextVisible("Edit");
cy.get(`[href='${correct_url}']`).should("be.visible");
cy.contains("Sample doc").trigger("mouseover", { force: true });
cy.get('[data-icon="delete"]').click();
@@ -94,4 +95,4 @@ describe("edit documentation and link to dataset", () => {
cy.waitTextVisible("Foo field description has changed");
cy.waitTextVisible("(edited)");
});
-});
+});
\ No newline at end of file
From 85fa5a1c4fdf2b4c4439558fa3a4cbbfd3491fbf Mon Sep 17 00:00:00 2001
From: Chris Collins
Date: Mon, 18 Sep 2023 16:14:33 -0400
Subject: [PATCH 27/65] feat(ui/java) Update domains to be nested (#8841)
Allow the ability to now nest domains underneath other domains. This should work much like the business glossary where you can add domains underneath other domains, move domains underneath other domains or at the root, and navigate domains using a nice new navigator.
---
.../datahub/graphql/GmsGraphQLEngine.java | 15 +-
.../exception/DataHubGraphQLErrorCode.java | 1 +
.../graphql/featureflags/FeatureFlags.java | 1 +
.../resolvers/config/AppConfigResolver.java | 1 +
.../domain/CreateDomainResolver.java | 29 ++-
.../domain/DeleteDomainResolver.java | 6 +
.../domain/DomainEntitiesResolver.java | 12 +-
.../resolvers/domain/ListDomainsResolver.java | 16 +-
.../domain/ParentDomainsResolver.java | 59 +++++
.../resolvers/mutate/MoveDomainResolver.java | 89 +++++++
.../resolvers/mutate/UpdateNameResolver.java | 14 ++
.../resolvers/mutate/util/DomainUtils.java | 222 ++++++++++++++++++
.../src/main/resources/app.graphql | 7 +
.../src/main/resources/entity.graphql | 50 +++-
.../domain/CreateDomainResolverTest.java | 177 +++++++++++++-
.../domain/DeleteDomainResolverTest.java | 27 +++
.../domain/ListDomainsResolverTest.java | 48 +++-
.../domain/MoveDomainResolverTest.java | 140 +++++++++++
.../domain/ParentDomainsResolverTest.java | 95 ++++++++
.../glossary/UpdateNameResolverTest.java | 12 +
datahub-web-react/src/app/SearchRoutes.tsx | 14 +-
datahub-web-react/src/app/analytics/event.ts | 9 +
.../src/app/domain/CreateDomainModal.tsx | 97 ++++++--
.../src/app/domain/DomainIcon.tsx | 11 +
.../src/app/domain/DomainRoutes.tsx | 39 +++
.../src/app/domain/DomainSearch.tsx | 143 +++++++++++
.../src/app/domain/DomainsContext.tsx | 21 ++
.../src/app/domain/DomainsList.tsx | 12 +-
.../src/app/domain/ManageDomainsPage.tsx | 31 ++-
.../nestedDomains/DomainsSidebarHeader.tsx | 58 +++++
.../app/domain/nestedDomains/DomainsTitle.tsx | 18 ++
.../nestedDomains/ManageDomainsPageV2.tsx | 60 +++++
.../nestedDomains/ManageDomainsSidebar.tsx | 28 +++
.../app/domain/nestedDomains/RootDomains.tsx | 31 +++
.../domainNavigator/DomainNavigator.tsx | 37 +++
.../domainNavigator/DomainNode.tsx | 137 +++++++++++
.../domainNavigator/useHasDomainChildren.ts | 29 +++
.../src/app/domain/useListDomains.tsx | 27 +++
datahub-web-react/src/app/domain/utils.ts | 72 +++++-
.../src/app/entity/EntityRegistry.tsx | 6 +
.../src/app/entity/domain/DomainEntity.tsx | 22 +-
.../domain/preview/DomainEntitiesSnippet.tsx | 45 ++++
.../src/app/entity/domain/preview/Preview.tsx | 21 +-
.../entity/glossaryNode/preview/Preview.tsx | 2 +-
.../entity/glossaryTerm/preview/Preview.tsx | 2 +-
.../EntityDropdown/DomainParentSelect.tsx | 108 +++++++++
.../shared/EntityDropdown/EntityDropdown.tsx | 35 +--
.../shared/EntityDropdown/MoveDomainModal.tsx | 102 ++++++++
.../EntityDropdown/NodeParentSelect.tsx | 79 ++-----
.../shared/EntityDropdown/useDeleteEntity.tsx | 7 +
.../EntityDropdown/useHandleDeleteDomain.ts | 27 +++
.../useHandleMoveDomainComplete.ts | 40 ++++
.../EntityDropdown/useParentSelector.ts | 76 ++++++
.../app/entity/shared/EntityDropdown/utils.ts | 50 +++-
.../src/app/entity/shared/constants.ts | 1 +
.../containers/profile/EntityProfile.tsx | 2 +
.../containers/profile/header/EntityName.tsx | 28 ++-
.../PlatformContentContainer.tsx | 1 +
.../PlatformContent/PlatformContentView.tsx | 13 +-
.../profile/sidebar/Domain/SetDomainModal.tsx | 78 +++---
.../src/app/entity/shared/types.ts | 3 +
.../src/app/glossary/BusinessGlossaryPage.tsx | 6 -
.../src/app/glossary/GlossarySidebar.tsx | 12 +-
.../policy/PolicyPrivilegeForm.tsx | 88 ++++---
.../src/app/preview/DefaultPreviewCard.tsx | 8 +-
.../renderer/component/DomainSearchList.tsx | 58 ++++-
.../renderer/component/HoverEntityTooltip.tsx | 6 +-
.../src/app/search/SearchResultList.tsx | 3 +-
.../src/app/search/SearchResults.tsx | 3 +-
.../autoComplete/AutoCompleteEntity.tsx | 6 +-
.../src/app/search/filters/FilterOption.tsx | 21 +-
.../{ParentNodes.tsx => ParentEntities.tsx} | 53 +++--
.../src/app/search/filters/utils.tsx | 15 ++
.../src/app/search/sidebar/BrowseSidebar.tsx | 3 +-
.../src/app/search/sidebar/ExpandableNode.tsx | 30 +--
.../src/app/shared/LogoCountCard.tsx | 26 +-
.../src/app/shared/admin/HeaderLinks.tsx | 9 +-
.../src/app/shared/components.tsx | 49 ++++
.../src/app/shared/deleteUtils.ts | 4 +-
.../src/app/shared/sidebar/components.tsx | 23 ++
.../src/app/shared/styleUtils.ts | 7 +
.../src/app/shared/tags/AddTagsTermsModal.tsx | 6 +-
.../src/app/shared/tags/DomainLink.tsx | 9 +-
datahub-web-react/src/app/shared/useToggle.ts | 24 +-
datahub-web-react/src/app/useAppConfig.ts | 5 +
datahub-web-react/src/appConfigContext.tsx | 1 +
datahub-web-react/src/conf/Global.ts | 1 +
datahub-web-react/src/graphql/app.graphql | 1 +
datahub-web-react/src/graphql/domain.graphql | 30 ++-
.../src/graphql/fragments.graphql | 32 +++
datahub-web-react/src/graphql/preview.graphql | 5 +
datahub-web-react/src/graphql/search.graphql | 10 +
.../authorization/ResolvedResourceSpec.java | 32 ---
.../com/linkedin/domain/DomainProperties.pdl | 15 ++
.../DomainFieldResolverProvider.java | 68 +++++-
.../authorization/DataHubAuthorizerTest.java | 145 ++++++++++--
.../src/main/resources/application.yml | 1 +
.../datahubusage/DataHubUsageEventType.java | 1 +
node_modules/.yarn-integrity | 12 +
.../cypress/cypress/e2e/mutations/domains.js | 23 +-
yarn.lock | 4 +
101 files changed, 3083 insertions(+), 415 deletions(-)
create mode 100644 datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/ParentDomainsResolver.java
create mode 100644 datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/MoveDomainResolver.java
create mode 100644 datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/MoveDomainResolverTest.java
create mode 100644 datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/ParentDomainsResolverTest.java
create mode 100644 datahub-web-react/src/app/domain/DomainIcon.tsx
create mode 100644 datahub-web-react/src/app/domain/DomainRoutes.tsx
create mode 100644 datahub-web-react/src/app/domain/DomainSearch.tsx
create mode 100644 datahub-web-react/src/app/domain/DomainsContext.tsx
create mode 100644 datahub-web-react/src/app/domain/nestedDomains/DomainsSidebarHeader.tsx
create mode 100644 datahub-web-react/src/app/domain/nestedDomains/DomainsTitle.tsx
create mode 100644 datahub-web-react/src/app/domain/nestedDomains/ManageDomainsPageV2.tsx
create mode 100644 datahub-web-react/src/app/domain/nestedDomains/ManageDomainsSidebar.tsx
create mode 100644 datahub-web-react/src/app/domain/nestedDomains/RootDomains.tsx
create mode 100644 datahub-web-react/src/app/domain/nestedDomains/domainNavigator/DomainNavigator.tsx
create mode 100644 datahub-web-react/src/app/domain/nestedDomains/domainNavigator/DomainNode.tsx
create mode 100644 datahub-web-react/src/app/domain/nestedDomains/domainNavigator/useHasDomainChildren.ts
create mode 100644 datahub-web-react/src/app/domain/useListDomains.tsx
create mode 100644 datahub-web-react/src/app/entity/domain/preview/DomainEntitiesSnippet.tsx
create mode 100644 datahub-web-react/src/app/entity/shared/EntityDropdown/DomainParentSelect.tsx
create mode 100644 datahub-web-react/src/app/entity/shared/EntityDropdown/MoveDomainModal.tsx
create mode 100644 datahub-web-react/src/app/entity/shared/EntityDropdown/useHandleDeleteDomain.ts
create mode 100644 datahub-web-react/src/app/entity/shared/EntityDropdown/useHandleMoveDomainComplete.ts
create mode 100644 datahub-web-react/src/app/entity/shared/EntityDropdown/useParentSelector.ts
rename datahub-web-react/src/app/search/filters/{ParentNodes.tsx => ParentEntities.tsx} (54%)
create mode 100644 datahub-web-react/src/app/shared/components.tsx
create mode 100644 datahub-web-react/src/app/shared/sidebar/components.tsx
create mode 100644 datahub-web-react/src/app/shared/styleUtils.ts
create mode 100644 node_modules/.yarn-integrity
create mode 100644 yarn.lock
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java
index 682710ad5d539d..d86234cf593062 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java
@@ -81,6 +81,7 @@
import com.linkedin.datahub.graphql.generated.Notebook;
import com.linkedin.datahub.graphql.generated.Owner;
import com.linkedin.datahub.graphql.generated.OwnershipTypeEntity;
+import com.linkedin.datahub.graphql.generated.ParentDomainsResult;
import com.linkedin.datahub.graphql.generated.PolicyMatchCriterionValue;
import com.linkedin.datahub.graphql.generated.QueryEntity;
import com.linkedin.datahub.graphql.generated.QuerySubject;
@@ -124,6 +125,7 @@
import com.linkedin.datahub.graphql.resolvers.domain.DeleteDomainResolver;
import com.linkedin.datahub.graphql.resolvers.domain.DomainEntitiesResolver;
import com.linkedin.datahub.graphql.resolvers.domain.ListDomainsResolver;
+import com.linkedin.datahub.graphql.resolvers.domain.ParentDomainsResolver;
import com.linkedin.datahub.graphql.resolvers.domain.SetDomainResolver;
import com.linkedin.datahub.graphql.resolvers.domain.UnsetDomainResolver;
import com.linkedin.datahub.graphql.resolvers.embed.UpdateEmbedResolver;
@@ -186,6 +188,7 @@
import com.linkedin.datahub.graphql.resolvers.mutate.BatchSetDomainResolver;
import com.linkedin.datahub.graphql.resolvers.mutate.BatchUpdateDeprecationResolver;
import com.linkedin.datahub.graphql.resolvers.mutate.BatchUpdateSoftDeletedResolver;
+import com.linkedin.datahub.graphql.resolvers.mutate.MoveDomainResolver;
import com.linkedin.datahub.graphql.resolvers.mutate.MutableTypeBatchResolver;
import com.linkedin.datahub.graphql.resolvers.mutate.MutableTypeResolver;
import com.linkedin.datahub.graphql.resolvers.mutate.RemoveLinkResolver;
@@ -944,6 +947,7 @@ private void configureMutationResolvers(final RuntimeWiring.Builder builder) {
.dataFetcher("removeGroup", new RemoveGroupResolver(this.entityClient))
.dataFetcher("updateUserStatus", new UpdateUserStatusResolver(this.entityClient))
.dataFetcher("createDomain", new CreateDomainResolver(this.entityClient, this.entityService))
+ .dataFetcher("moveDomain", new MoveDomainResolver(this.entityService, this.entityClient))
.dataFetcher("deleteDomain", new DeleteDomainResolver(entityClient))
.dataFetcher("setDomain", new SetDomainResolver(this.entityClient, this.entityService))
.dataFetcher("batchSetDomain", new BatchSetDomainResolver(this.entityService))
@@ -1029,6 +1033,13 @@ private void configureGenericEntityResolvers(final RuntimeWiring.Builder builder
.dataFetcher("entities", new EntityTypeBatchResolver(entityTypes,
(env) -> ((BrowseResults) env.getSource()).getEntities()))
)
+ .type("ParentDomainsResult", typeWiring -> typeWiring
+ .dataFetcher("domains", new EntityTypeBatchResolver(entityTypes,
+ (env) -> {
+ final ParentDomainsResult result = env.getSource();
+ return result != null ? result.getDomains() : null;
+ }))
+ )
.type("EntityRelationshipLegacy", typeWiring -> typeWiring
.dataFetcher("entity", new EntityTypeResolver(entityTypes,
(env) -> ((EntityRelationshipLegacy) env.getSource()).getEntity()))
@@ -1675,8 +1686,8 @@ private void configureGlossaryRelationshipResolvers(final RuntimeWiring.Builder
private void configureDomainResolvers(final RuntimeWiring.Builder builder) {
builder.type("Domain", typeWiring -> typeWiring
.dataFetcher("entities", new DomainEntitiesResolver(this.entityClient))
- .dataFetcher("relationships", new EntityRelationshipsResultResolver(graphClient)
- )
+ .dataFetcher("parentDomains", new ParentDomainsResolver(this.entityClient))
+ .dataFetcher("relationships", new EntityRelationshipsResultResolver(graphClient))
);
builder.type("DomainAssociation", typeWiring -> typeWiring
.dataFetcher("domain",
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/exception/DataHubGraphQLErrorCode.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/exception/DataHubGraphQLErrorCode.java
index db3e1dd03e4198..44695c334855fc 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/exception/DataHubGraphQLErrorCode.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/exception/DataHubGraphQLErrorCode.java
@@ -4,6 +4,7 @@ public enum DataHubGraphQLErrorCode {
BAD_REQUEST(400),
UNAUTHORIZED(403),
NOT_FOUND(404),
+ CONFLICT(409),
SERVER_ERROR(500);
private final int _code;
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/featureflags/FeatureFlags.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/featureflags/FeatureFlags.java
index de3c217db01ec9..4d6133f18df050 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/featureflags/FeatureFlags.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/featureflags/FeatureFlags.java
@@ -16,4 +16,5 @@ public class FeatureFlags {
private PreProcessHooks preProcessHooks;
private boolean showAcrylInfo = false;
private boolean showAccessManagement = false;
+ private boolean nestedDomainsEnabled = false;
}
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java
index 09df985b19cf5c..f6bc68caa0821c 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java
@@ -172,6 +172,7 @@ public CompletableFuture get(final DataFetchingEnvironment environmen
.setShowBrowseV2(_featureFlags.isShowBrowseV2())
.setShowAcrylInfo(_featureFlags.isShowAcrylInfo())
.setShowAccessManagement(_featureFlags.isShowAccessManagement())
+ .setNestedDomainsEnabled(_featureFlags.isNestedDomainsEnabled())
.build();
appConfig.setFeatureFlags(featureFlagsConfig);
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/CreateDomainResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/CreateDomainResolver.java
index 39aa1ea28da205..1930cdc1f86676 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/CreateDomainResolver.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/CreateDomainResolver.java
@@ -1,14 +1,18 @@
package com.linkedin.datahub.graphql.resolvers.domain;
import com.linkedin.common.AuditStamp;
+import com.linkedin.common.urn.Urn;
import com.linkedin.common.urn.UrnUtils;
import com.linkedin.data.template.SetMode;
import com.linkedin.datahub.graphql.QueryContext;
import com.linkedin.datahub.graphql.authorization.AuthorizationUtils;
import com.linkedin.datahub.graphql.exception.AuthorizationException;
+import com.linkedin.datahub.graphql.exception.DataHubGraphQLErrorCode;
+import com.linkedin.datahub.graphql.exception.DataHubGraphQLException;
import com.linkedin.datahub.graphql.generated.CreateDomainInput;
import com.linkedin.datahub.graphql.generated.OwnerEntityType;
import com.linkedin.datahub.graphql.generated.OwnershipType;
+import com.linkedin.datahub.graphql.resolvers.mutate.util.DomainUtils;
import com.linkedin.datahub.graphql.resolvers.mutate.util.OwnerUtils;
import com.linkedin.domain.DomainProperties;
import com.linkedin.entity.client.EntityClient;
@@ -19,8 +23,11 @@
import com.linkedin.mxe.MetadataChangeProposal;
import graphql.schema.DataFetcher;
import graphql.schema.DataFetchingEnvironment;
+
+import java.net.URISyntaxException;
import java.util.UUID;
import java.util.concurrent.CompletableFuture;
+
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@@ -45,9 +52,9 @@ public CompletableFuture get(DataFetchingEnvironment environment) throws
final QueryContext context = environment.getContext();
final CreateDomainInput input = bindArgument(environment.getArgument("input"), CreateDomainInput.class);
+ final Urn parentDomain = input.getParentDomain() != null ? UrnUtils.getUrn(input.getParentDomain()) : null;
return CompletableFuture.supplyAsync(() -> {
-
if (!AuthorizationUtils.canCreateDomains(context)) {
throw new AuthorizationException("Unauthorized to perform this action. Please contact your DataHub administrator.");
}
@@ -64,6 +71,17 @@ public CompletableFuture get(DataFetchingEnvironment environment) throws
throw new IllegalArgumentException("This Domain already exists!");
}
+ if (parentDomain != null && !_entityClient.exists(parentDomain, context.getAuthentication())) {
+ throw new IllegalArgumentException("Parent Domain does not exist!");
+ }
+
+ if (DomainUtils.hasNameConflict(input.getName(), parentDomain, context, _entityClient)) {
+ throw new DataHubGraphQLException(
+ String.format("\"%s\" already exists in this domain. Please pick a unique name.", input.getName()),
+ DataHubGraphQLErrorCode.CONFLICT
+ );
+ }
+
// Create the MCP
final MetadataChangeProposal proposal = buildMetadataChangeProposalWithKey(key, DOMAIN_ENTITY_NAME,
DOMAIN_PROPERTIES_ASPECT_NAME, mapDomainProperties(input, context));
@@ -77,6 +95,8 @@ public CompletableFuture get(DataFetchingEnvironment environment) throws
}
OwnerUtils.addCreatorAsOwner(context, domainUrn, OwnerEntityType.CORP_USER, ownershipType, _entityService);
return domainUrn;
+ } catch (DataHubGraphQLException e) {
+ throw e;
} catch (Exception e) {
log.error("Failed to create Domain with id: {}, name: {}: {}", input.getId(), input.getName(), e.getMessage());
throw new RuntimeException(String.format("Failed to create Domain with id: %s, name: %s", input.getId(), input.getName()), e);
@@ -89,6 +109,13 @@ private DomainProperties mapDomainProperties(final CreateDomainInput input, fina
result.setName(input.getName());
result.setDescription(input.getDescription(), SetMode.IGNORE_NULL);
result.setCreated(new AuditStamp().setActor(UrnUtils.getUrn(context.getActorUrn())).setTime(System.currentTimeMillis()));
+ if (input.getParentDomain() != null) {
+ try {
+ result.setParentDomain(Urn.createFromString(input.getParentDomain()));
+ } catch (URISyntaxException e) {
+ throw new RuntimeException(String.format("Failed to create Domain Urn from string: %s", input.getParentDomain()), e);
+ }
+ }
return result;
}
}
\ No newline at end of file
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/DeleteDomainResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/DeleteDomainResolver.java
index 60a03fcddcc4dc..9ab90e8b4ff72c 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/DeleteDomainResolver.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/DeleteDomainResolver.java
@@ -4,6 +4,7 @@
import com.linkedin.datahub.graphql.QueryContext;
import com.linkedin.datahub.graphql.authorization.AuthorizationUtils;
import com.linkedin.datahub.graphql.exception.AuthorizationException;
+import com.linkedin.datahub.graphql.resolvers.mutate.util.DomainUtils;
import com.linkedin.entity.client.EntityClient;
import graphql.schema.DataFetcher;
import graphql.schema.DataFetchingEnvironment;
@@ -32,6 +33,11 @@ public CompletableFuture get(final DataFetchingEnvironment environment)
if (AuthorizationUtils.canManageDomains(context) || AuthorizationUtils.canDeleteEntity(urn, context)) {
try {
+ // Make sure there are no child domains
+ if (DomainUtils.hasChildDomains(urn, context, _entityClient)) {
+ throw new RuntimeException(String.format("Cannot delete domain %s which has child domains", domainUrn));
+ }
+
_entityClient.deleteEntity(urn, context.getAuthentication());
log.info(String.format("I've successfully deleted the entity %s with urn", domainUrn));
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/DomainEntitiesResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/DomainEntitiesResolver.java
index 06bfa36fc3c146..0bf551c4683e61 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/DomainEntitiesResolver.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/DomainEntitiesResolver.java
@@ -1,6 +1,5 @@
package com.linkedin.datahub.graphql.resolvers.domain;
-import com.google.common.collect.ImmutableList;
import com.linkedin.datahub.graphql.QueryContext;
import com.linkedin.datahub.graphql.generated.Domain;
import com.linkedin.datahub.graphql.generated.DomainEntitiesInput;
@@ -67,17 +66,22 @@ public CompletableFuture get(final DataFetchingEnvironment enviro
try {
+ final CriterionArray criteria = new CriterionArray();
final Criterion filterCriterion = new Criterion()
.setField(DOMAINS_FIELD_NAME + ".keyword")
.setCondition(Condition.EQUAL)
.setValue(urn);
+ criteria.add(filterCriterion);
+ if (input.getFilters() != null) {
+ input.getFilters().forEach(filter -> {
+ criteria.add(new Criterion().setField(filter.getField()).setValue(filter.getValue()));
+ });
+ }
return UrnSearchResultsMapper.map(_entityClient.searchAcrossEntities(
SEARCHABLE_ENTITY_TYPES.stream().map(EntityTypeMapper::getName).collect(Collectors.toList()),
query,
- new Filter().setOr(new ConjunctiveCriterionArray(
- new ConjunctiveCriterion().setAnd(new CriterionArray(ImmutableList.of(filterCriterion)))
- )),
+ new Filter().setOr(new ConjunctiveCriterionArray(new ConjunctiveCriterion().setAnd(criteria))),
start,
count,
null,
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/ListDomainsResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/ListDomainsResolver.java
index 6ed8639592d6e2..3a751e502eb10a 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/ListDomainsResolver.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/ListDomainsResolver.java
@@ -1,22 +1,24 @@
package com.linkedin.datahub.graphql.resolvers.domain;
import com.linkedin.common.urn.Urn;
+import com.linkedin.common.urn.UrnUtils;
import com.linkedin.datahub.graphql.QueryContext;
-import com.linkedin.datahub.graphql.authorization.AuthorizationUtils;
-import com.linkedin.datahub.graphql.exception.AuthorizationException;
import com.linkedin.datahub.graphql.generated.Domain;
import com.linkedin.datahub.graphql.generated.EntityType;
import com.linkedin.datahub.graphql.generated.ListDomainsInput;
import com.linkedin.datahub.graphql.generated.ListDomainsResult;
+import com.linkedin.datahub.graphql.resolvers.mutate.util.DomainUtils;
import com.linkedin.entity.client.EntityClient;
import com.linkedin.metadata.Constants;
import com.linkedin.metadata.query.SearchFlags;
+import com.linkedin.metadata.query.filter.Filter;
import com.linkedin.metadata.query.filter.SortCriterion;
import com.linkedin.metadata.query.filter.SortOrder;
import com.linkedin.metadata.search.SearchEntity;
import com.linkedin.metadata.search.SearchResult;
import graphql.schema.DataFetcher;
import graphql.schema.DataFetchingEnvironment;
+
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CompletableFuture;
@@ -30,7 +32,6 @@
* Resolver used for listing all Domains defined within DataHub. Requires the MANAGE_DOMAINS platform privilege.
*/
public class ListDomainsResolver implements DataFetcher> {
-
private static final Integer DEFAULT_START = 0;
private static final Integer DEFAULT_COUNT = 20;
private static final String DEFAULT_QUERY = "";
@@ -48,18 +49,19 @@ public CompletableFuture get(final DataFetchingEnvironment en
return CompletableFuture.supplyAsync(() -> {
- if (AuthorizationUtils.canCreateDomains(context)) {
final ListDomainsInput input = bindArgument(environment.getArgument("input"), ListDomainsInput.class);
final Integer start = input.getStart() == null ? DEFAULT_START : input.getStart();
final Integer count = input.getCount() == null ? DEFAULT_COUNT : input.getCount();
final String query = input.getQuery() == null ? DEFAULT_QUERY : input.getQuery();
+ final Urn parentDomainUrn = input.getParentDomain() != null ? UrnUtils.getUrn(input.getParentDomain()) : null;
+ final Filter filter = DomainUtils.buildParentDomainFilter(parentDomainUrn);
try {
- // First, get all group Urns.
+ // First, get all domain Urns.
final SearchResult gmsResult = _entityClient.search(
Constants.DOMAIN_ENTITY_NAME,
query,
- null,
+ filter,
new SortCriterion().setField(DOMAIN_CREATED_TIME_INDEX_FIELD_NAME).setOrder(SortOrder.DESCENDING),
start,
count,
@@ -78,8 +80,6 @@ public CompletableFuture get(final DataFetchingEnvironment en
} catch (Exception e) {
throw new RuntimeException("Failed to list domains", e);
}
- }
- throw new AuthorizationException("Unauthorized to perform this action. Please contact your DataHub administrator.");
});
}
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/ParentDomainsResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/ParentDomainsResolver.java
new file mode 100644
index 00000000000000..dcaa7d61ed90cd
--- /dev/null
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/ParentDomainsResolver.java
@@ -0,0 +1,59 @@
+package com.linkedin.datahub.graphql.resolvers.domain;
+
+import com.linkedin.common.urn.Urn;
+import com.linkedin.common.urn.UrnUtils;
+import com.linkedin.datahub.graphql.QueryContext;
+import com.linkedin.datahub.graphql.generated.Entity;
+import com.linkedin.datahub.graphql.generated.ParentDomainsResult;
+import com.linkedin.datahub.graphql.resolvers.mutate.util.DomainUtils;
+import com.linkedin.entity.client.EntityClient;
+import graphql.schema.DataFetcher;
+import graphql.schema.DataFetchingEnvironment;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.concurrent.CompletableFuture;
+
+import static com.linkedin.metadata.Constants.DOMAIN_ENTITY_NAME;
+
+public class ParentDomainsResolver implements DataFetcher> {
+
+ private final EntityClient _entityClient;
+
+ public ParentDomainsResolver(final EntityClient entityClient) {
+ _entityClient = entityClient;
+ }
+
+ @Override
+ public CompletableFuture get(DataFetchingEnvironment environment) {
+ final QueryContext context = environment.getContext();
+ final Urn urn = UrnUtils.getUrn(((Entity) environment.getSource()).getUrn());
+ final List parentDomains = new ArrayList<>();
+ final Set visitedParentUrns = new HashSet<>();
+
+ if (!DOMAIN_ENTITY_NAME.equals(urn.getEntityType())) {
+ throw new IllegalArgumentException(String.format("Failed to resolve parents for entity type %s", urn));
+ }
+
+ return CompletableFuture.supplyAsync(() -> {
+ try {
+ Entity parentDomain = DomainUtils.getParentDomain(urn, context, _entityClient);
+
+ while (parentDomain != null && !visitedParentUrns.contains(parentDomain.getUrn())) {
+ parentDomains.add(parentDomain);
+ visitedParentUrns.add(parentDomain.getUrn());
+ parentDomain = DomainUtils.getParentDomain(Urn.createFromString(parentDomain.getUrn()), context, _entityClient);
+ }
+
+ final ParentDomainsResult result = new ParentDomainsResult();
+ result.setCount(parentDomains.size());
+ result.setDomains(parentDomains);
+ return result;
+ } catch (Exception e) {
+ throw new RuntimeException(String.format("Failed to load parent domains for entity %s", urn), e);
+ }
+ });
+ }
+}
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/MoveDomainResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/MoveDomainResolver.java
new file mode 100644
index 00000000000000..e5e3a5a0ee42e3
--- /dev/null
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/MoveDomainResolver.java
@@ -0,0 +1,89 @@
+package com.linkedin.datahub.graphql.resolvers.mutate;
+
+import com.linkedin.common.urn.CorpuserUrn;
+import com.linkedin.common.urn.Urn;
+import com.linkedin.common.urn.UrnUtils;
+import com.linkedin.data.template.SetMode;
+import com.linkedin.datahub.graphql.QueryContext;
+import com.linkedin.datahub.graphql.authorization.AuthorizationUtils;
+import com.linkedin.datahub.graphql.exception.AuthorizationException;
+import com.linkedin.datahub.graphql.exception.DataHubGraphQLErrorCode;
+import com.linkedin.datahub.graphql.exception.DataHubGraphQLException;
+import com.linkedin.datahub.graphql.generated.MoveDomainInput;
+import com.linkedin.datahub.graphql.resolvers.ResolverUtils;
+import com.linkedin.datahub.graphql.resolvers.mutate.util.DomainUtils;
+import com.linkedin.domain.DomainProperties;
+import com.linkedin.entity.client.EntityClient;
+import com.linkedin.metadata.Constants;
+import com.linkedin.metadata.entity.EntityService;
+import com.linkedin.metadata.entity.EntityUtils;
+import graphql.schema.DataFetcher;
+import graphql.schema.DataFetchingEnvironment;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+
+import java.util.concurrent.CompletableFuture;
+
+@Slf4j
+@RequiredArgsConstructor
+public class MoveDomainResolver implements DataFetcher> {
+
+ private final EntityService _entityService;
+ private final EntityClient _entityClient;
+
+ @Override
+ public CompletableFuture get(DataFetchingEnvironment environment) throws Exception {
+ final MoveDomainInput input = ResolverUtils.bindArgument(environment.getArgument("input"), MoveDomainInput.class);
+ final QueryContext context = environment.getContext();
+ final Urn resourceUrn = UrnUtils.getUrn(input.getResourceUrn());
+ final Urn newParentDomainUrn = input.getParentDomain() != null ? UrnUtils.getUrn(input.getParentDomain()) : null;
+
+ return CompletableFuture.supplyAsync(() -> {
+ if (!AuthorizationUtils.canManageDomains(context)) {
+ throw new AuthorizationException("Unauthorized to perform this action. Please contact your DataHub administrator.");
+ }
+
+ try {
+ if (!resourceUrn.getEntityType().equals(Constants.DOMAIN_ENTITY_NAME)) {
+ throw new IllegalArgumentException("Resource is not a domain.");
+ }
+
+ DomainProperties properties = (DomainProperties) EntityUtils.getAspectFromEntity(
+ resourceUrn.toString(),
+ Constants.DOMAIN_PROPERTIES_ASPECT_NAME, _entityService,
+ null
+ );
+
+ if (properties == null) {
+ throw new IllegalArgumentException("Domain properties do not exist.");
+ }
+
+ if (newParentDomainUrn != null) {
+ if (!newParentDomainUrn.getEntityType().equals(Constants.DOMAIN_ENTITY_NAME)) {
+ throw new IllegalArgumentException("Parent entity is not a domain.");
+ }
+ if (!_entityService.exists(newParentDomainUrn)) {
+ throw new IllegalArgumentException("Parent entity does not exist.");
+ }
+ }
+
+ if (DomainUtils.hasNameConflict(properties.getName(), newParentDomainUrn, context, _entityClient)) {
+ throw new DataHubGraphQLException(
+ String.format("\"%s\" already exists in the destination domain. Please pick a unique name.", properties.getName()),
+ DataHubGraphQLErrorCode.CONFLICT
+ );
+ }
+
+ properties.setParentDomain(newParentDomainUrn, SetMode.REMOVE_IF_NULL);
+ Urn actor = CorpuserUrn.createFromString(context.getActorUrn());
+ MutationUtils.persistAspect(resourceUrn, Constants.DOMAIN_PROPERTIES_ASPECT_NAME, properties, actor, _entityService);
+ return true;
+ } catch (DataHubGraphQLException e) {
+ throw e;
+ } catch (Exception e) {
+ log.error("Failed to move domain {} to parent {} : {}", input.getResourceUrn(), input.getParentDomain(), e.getMessage());
+ throw new RuntimeException(String.format("Failed to move domain %s to %s", input.getResourceUrn(), input.getParentDomain()), e);
+ }
+ });
+ }
+}
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/UpdateNameResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/UpdateNameResolver.java
index 225bee54142c4c..0e316ac1296ee0 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/UpdateNameResolver.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/UpdateNameResolver.java
@@ -6,8 +6,11 @@
import com.linkedin.datahub.graphql.QueryContext;
import com.linkedin.datahub.graphql.authorization.AuthorizationUtils;
import com.linkedin.datahub.graphql.exception.AuthorizationException;
+import com.linkedin.datahub.graphql.exception.DataHubGraphQLErrorCode;
+import com.linkedin.datahub.graphql.exception.DataHubGraphQLException;
import com.linkedin.datahub.graphql.generated.UpdateNameInput;
import com.linkedin.datahub.graphql.resolvers.dataproduct.DataProductAuthorizationUtils;
+import com.linkedin.datahub.graphql.resolvers.mutate.util.DomainUtils;
import com.linkedin.datahub.graphql.resolvers.mutate.util.GlossaryUtils;
import com.linkedin.dataproduct.DataProductProperties;
import com.linkedin.domain.DomainProperties;
@@ -124,14 +127,25 @@ private Boolean updateDomainName(
try {
DomainProperties domainProperties = (DomainProperties) EntityUtils.getAspectFromEntity(
targetUrn.toString(), Constants.DOMAIN_PROPERTIES_ASPECT_NAME, _entityService, null);
+
if (domainProperties == null) {
throw new IllegalArgumentException("Domain does not exist");
}
+
+ if (DomainUtils.hasNameConflict(input.getName(), DomainUtils.getParentDomainSafely(domainProperties), context, _entityClient)) {
+ throw new DataHubGraphQLException(
+ String.format("\"%s\" already exists in this domain. Please pick a unique name.", input.getName()),
+ DataHubGraphQLErrorCode.CONFLICT
+ );
+ }
+
domainProperties.setName(input.getName());
Urn actor = CorpuserUrn.createFromString(context.getActorUrn());
persistAspect(targetUrn, Constants.DOMAIN_PROPERTIES_ASPECT_NAME, domainProperties, actor, _entityService);
return true;
+ } catch (DataHubGraphQLException e) {
+ throw e;
} catch (Exception e) {
throw new RuntimeException(String.format("Failed to perform update against input %s", input), e);
}
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/util/DomainUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/util/DomainUtils.java
index b57160be09d326..585fbdf53a2ba4 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/util/DomainUtils.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/util/DomainUtils.java
@@ -5,29 +5,55 @@
import com.linkedin.common.UrnArray;
import com.linkedin.common.urn.Urn;
import com.linkedin.common.urn.UrnUtils;
+import com.linkedin.data.DataMap;
import com.linkedin.datahub.graphql.QueryContext;
import com.linkedin.datahub.graphql.authorization.AuthorizationUtils;
import com.datahub.authorization.ConjunctivePrivilegeGroup;
import com.datahub.authorization.DisjunctivePrivilegeGroup;
+import com.linkedin.datahub.graphql.generated.Entity;
import com.linkedin.datahub.graphql.generated.ResourceRefInput;
+import com.linkedin.datahub.graphql.types.common.mappers.UrnToEntityMapper;
+import com.linkedin.domain.DomainProperties;
import com.linkedin.domain.Domains;
+import com.linkedin.entity.EntityResponse;
+import com.linkedin.entity.client.EntityClient;
import com.linkedin.metadata.Constants;
import com.linkedin.metadata.authorization.PoliciesConfig;
import com.linkedin.metadata.entity.EntityService;
import com.linkedin.metadata.entity.EntityUtils;
+import com.linkedin.metadata.query.filter.Condition;
+import com.linkedin.metadata.query.filter.ConjunctiveCriterion;
+import com.linkedin.metadata.query.filter.ConjunctiveCriterionArray;
+import com.linkedin.metadata.query.filter.Criterion;
+import com.linkedin.metadata.query.filter.CriterionArray;
+import com.linkedin.metadata.query.filter.Filter;
+import com.linkedin.metadata.search.SearchEntity;
+import com.linkedin.metadata.search.SearchResult;
import com.linkedin.mxe.MetadataChangeProposal;
+
+import com.linkedin.r2.RemoteInvocationException;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.stream.Collectors;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
+
import lombok.extern.slf4j.Slf4j;
import static com.linkedin.datahub.graphql.resolvers.mutate.MutationUtils.*;
+import static com.linkedin.metadata.Constants.*;
// TODO: Move to consuming from DomainService.
@Slf4j
public class DomainUtils {
+ private static final String PARENT_DOMAIN_INDEX_FIELD_NAME = "parentDomain.keyword";
+ private static final String HAS_PARENT_DOMAIN_INDEX_FIELD_NAME = "hasParentDomain";
+ private static final String NAME_INDEX_FIELD_NAME = "name";
+
private static final ConjunctivePrivilegeGroup ALL_PRIVILEGES_GROUP = new ConjunctivePrivilegeGroup(ImmutableList.of(
PoliciesConfig.EDIT_ENTITY_PRIVILEGE.getType()
));
@@ -85,4 +111,200 @@ public static void validateDomain(Urn domainUrn, EntityService entityService) {
throw new IllegalArgumentException(String.format("Failed to validate Domain with urn %s. Urn does not exist.", domainUrn));
}
}
+
+ private static List<Criterion> buildRootDomainCriteria() {
+ final List<Criterion> criteria = new ArrayList<>();
+
+ criteria.add(
+ new Criterion()
+ .setField(HAS_PARENT_DOMAIN_INDEX_FIELD_NAME)
+ .setValue("false")
+ .setCondition(Condition.EQUAL)
+ );
+ criteria.add(
+ new Criterion()
+ .setField(HAS_PARENT_DOMAIN_INDEX_FIELD_NAME)
+ .setValue("")
+ .setCondition(Condition.IS_NULL)
+ );
+
+ return criteria;
+ }
+
+ private static List<Criterion> buildParentDomainCriteria(@Nonnull final Urn parentDomainUrn) {
+ final List<Criterion> criteria = new ArrayList<>();
+
+ criteria.add(
+ new Criterion()
+ .setField(HAS_PARENT_DOMAIN_INDEX_FIELD_NAME)
+ .setValue("true")
+ .setCondition(Condition.EQUAL)
+ );
+ criteria.add(
+ new Criterion()
+ .setField(PARENT_DOMAIN_INDEX_FIELD_NAME)
+ .setValue(parentDomainUrn.toString())
+ .setCondition(Condition.EQUAL)
+ );
+
+ return criteria;
+ }
+
+ private static Criterion buildNameCriterion(@Nonnull final String name) {
+ return new Criterion()
+ .setField(NAME_INDEX_FIELD_NAME)
+ .setValue(name)
+ .setCondition(Condition.EQUAL);
+ }
+
+ /**
+ * Builds a filter that ORs together the root domain criteria, or ANDs together the parent domain criteria.
+ * The OR is needed for root domains because elastic can store either a null or a false hasParentDomain value for a root domain in the index.
+ * @param name an optional name to AND in to each condition of the filter
+ * @param parentDomainUrn the parent domain (null means root).
+ * @return the Filter
+ */
+ public static Filter buildNameAndParentDomainFilter(@Nullable final String name, @Nullable final Urn parentDomainUrn) {
+ if (parentDomainUrn == null) {
+ return new Filter().setOr(
+ new ConjunctiveCriterionArray(
+ buildRootDomainCriteria().stream().map(parentCriterion -> {
+ final CriterionArray array = new CriterionArray(parentCriterion);
+ if (name != null) {
+ array.add(buildNameCriterion(name));
+ }
+ return new ConjunctiveCriterion().setAnd(array);
+ }).collect(Collectors.toList())
+ )
+ );
+ }
+
+ final CriterionArray andArray = new CriterionArray(buildParentDomainCriteria(parentDomainUrn));
+ if (name != null) {
+ andArray.add(buildNameCriterion(name));
+ }
+ return new Filter().setOr(
+ new ConjunctiveCriterionArray(
+ new ConjunctiveCriterion().setAnd(andArray)
+ )
+ );
+ }
+
+ public static Filter buildParentDomainFilter(@Nullable final Urn parentDomainUrn) {
+ return buildNameAndParentDomainFilter(null, parentDomainUrn);
+ }
+
+ /**
+ * Check if a domain has any child domains
+ * @param domainUrn the URN of the domain to check
+ * @param context query context (includes authorization context to authorize the request)
+ * @param entityClient client used to perform the check
+ * @return true if the domain has any child domains, false if it does not
+ */
+ public static boolean hasChildDomains(
+ @Nonnull final Urn domainUrn,
+ @Nonnull final QueryContext context,
+ @Nonnull final EntityClient entityClient
+ ) throws RemoteInvocationException {
+ Filter parentDomainFilter = buildParentDomainFilter(domainUrn);
+ // Search for entities matching parent domain
+ // Limit count to 1 for existence check
+ final SearchResult searchResult = entityClient.filter(
+ DOMAIN_ENTITY_NAME,
+ parentDomainFilter,
+ null,
+ 0,
+ 1,
+ context.getAuthentication());
+ return (searchResult.getNumEntities() > 0);
+ }
+
+ private static Map<Urn, EntityResponse> getDomainsByNameAndParent(
+ @Nonnull final String name,
+ @Nullable final Urn parentDomainUrn,
+ @Nonnull final QueryContext context,
+ @Nonnull final EntityClient entityClient
+ ) {
+ try {
+ final Filter filter = buildNameAndParentDomainFilter(name, parentDomainUrn);
+
+ final SearchResult searchResult = entityClient.filter(
+ DOMAIN_ENTITY_NAME,
+ filter,
+ null,
+ 0,
+ 1000,
+ context.getAuthentication());
+
+ final Set<Urn> domainUrns = searchResult.getEntities()
+ .stream()
+ .map(SearchEntity::getEntity)
+ .collect(Collectors.toSet());
+
+ return entityClient.batchGetV2(
+ DOMAIN_ENTITY_NAME,
+ domainUrns,
+ Collections.singleton(DOMAIN_PROPERTIES_ASPECT_NAME),
+ context.getAuthentication());
+ } catch (Exception e) {
+ throw new RuntimeException("Failed fetching Domains by name and parent", e);
+ }
+ }
+
+ public static boolean hasNameConflict(
+ @Nonnull final String name,
+ @Nullable final Urn parentDomainUrn,
+ @Nonnull final QueryContext context,
+ @Nonnull final EntityClient entityClient
+ ) {
+ final Map<Urn, EntityResponse> entities = getDomainsByNameAndParent(name, parentDomainUrn, context, entityClient);
+
+ // Even though we searched by name, do one more pass to check the name is unique
+ return entities.values().stream().anyMatch(entityResponse -> {
+ if (entityResponse.getAspects().containsKey(DOMAIN_PROPERTIES_ASPECT_NAME)) {
+ DataMap dataMap = entityResponse.getAspects().get(DOMAIN_PROPERTIES_ASPECT_NAME).getValue().data();
+ DomainProperties domainProperties = new DomainProperties(dataMap);
+ return (domainProperties.hasName() && domainProperties.getName().equals(name));
+ }
+ return false;
+ });
+ }
+
+ @Nullable
+ public static Entity getParentDomain(
+ @Nonnull final Urn urn,
+ @Nonnull final QueryContext context,
+ @Nonnull final EntityClient entityClient
+ ) {
+ try {
+ final EntityResponse entityResponse = entityClient.getV2(
+ DOMAIN_ENTITY_NAME,
+ urn,
+ Collections.singleton(DOMAIN_PROPERTIES_ASPECT_NAME),
+ context.getAuthentication()
+ );
+
+ if (entityResponse != null && entityResponse.getAspects().containsKey(DOMAIN_PROPERTIES_ASPECT_NAME)) {
+ final DomainProperties properties = new DomainProperties(entityResponse.getAspects().get(DOMAIN_PROPERTIES_ASPECT_NAME).getValue().data());
+ final Urn parentDomainUrn = getParentDomainSafely(properties);
+ return parentDomainUrn != null ? UrnToEntityMapper.map(parentDomainUrn) : null;
+ }
+ } catch (Exception e) {
+ throw new RuntimeException(String.format("Failed to retrieve parent domain for entity %s", urn), e);
+ }
+
+ return null;
+ }
+
+ /**
+ * Get a parent domain only if hasParentDomain was set. There is strange elastic behavior where moving a domain
+ * to the root leaves the parentDomain field set but makes hasParentDomain false. This helper makes sure that callers
+ * only get the parentDomain when hasParentDomain is true, and get null otherwise.
+ * @param properties the domain properties aspect
+ * @return the parentDomain or null
+ */
+ @Nullable
+ public static Urn getParentDomainSafely(@Nonnull final DomainProperties properties) {
+ return properties.hasParentDomain() ? properties.getParentDomain() : null;
+ }
}
\ No newline at end of file
diff --git a/datahub-graphql-core/src/main/resources/app.graphql b/datahub-graphql-core/src/main/resources/app.graphql
index a5057bcf644daf..075a3b0fac43bc 100644
--- a/datahub-graphql-core/src/main/resources/app.graphql
+++ b/datahub-graphql-core/src/main/resources/app.graphql
@@ -441,10 +441,17 @@ type FeatureFlagsConfig {
Whether we should show CTAs in the UI related to moving to Managed DataHub by Acryl.
"""
showAcrylInfo: Boolean!
+
"""
Whether we should show AccessManagement tab in the datahub UI.
"""
showAccessManagement: Boolean!
+
+ """
+ Enables the nested Domains feature that allows users to have sub-Domains.
+ If this is off, Domains appear "flat" again.
+ """
+ nestedDomainsEnabled: Boolean!
}
"""
diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql
index 044c405942a3c7..39f86948c77c40 100644
--- a/datahub-graphql-core/src/main/resources/entity.graphql
+++ b/datahub-graphql-core/src/main/resources/entity.graphql
@@ -434,6 +434,11 @@ type Mutation {
"""
createDomain(input: CreateDomainInput!): String
+ """
+ Moves a domain to be parented under another domain.
+ """
+ moveDomain(input: MoveDomainInput!): Boolean
+
"""
Delete a Domain
"""
@@ -7735,6 +7740,21 @@ input UpdateParentNodeInput {
resourceUrn: String!
}
+"""
+Input for updating the parent domain of a domain.
+"""
+input MoveDomainInput {
+ """
+ The new parent domain urn. If parentDomain is null, this will remove the parent from this entity
+ """
+ parentDomain: String
+
+ """
+ The primary key of the resource to update the parent domain for
+ """
+ resourceUrn: String!
+}
+
"""
Input for updating the name of an entity
"""
@@ -9584,15 +9604,31 @@ type Domain implements Entity {
"""
entities(input: DomainEntitiesInput): SearchResults
+ """
+ Recursively get the lineage of parent domains for this entity
+ """
+ parentDomains: ParentDomainsResult
+
"""
Edges extending from this entity
"""
relationships(input: RelationshipsInput!): EntityRelationshipsResult
}
+"""
+All of the parent domains starting from a single Domain through all of its ancestors
+"""
+type ParentDomainsResult {
+ """
+ The number of parent domains bubbling up for this entity
+ """
+ count: Int!
-
-
+ """
+ A list of parent domains in order from direct parent, to parent's parent etc. If there are no parents, return an empty list
+ """
+ domains: [Entity!]!
+}
"""
Properties about a domain
@@ -9652,6 +9688,11 @@ input CreateDomainInput {
Optional description for the Domain
"""
description: String
+
+ """
+ Optional parent domain urn for the domain
+ """
+ parentDomain: String
}
"""
@@ -9672,6 +9713,11 @@ input ListDomainsInput {
Optional search query
"""
query: String
+
+ """
+ Optional parent domain
+ """
+ parentDomain: String
}
"""
diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/CreateDomainResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/CreateDomainResolverTest.java
index 8c19f1dc3eb341..560a3865ce9e1a 100644
--- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/CreateDomainResolverTest.java
+++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/CreateDomainResolverTest.java
@@ -6,35 +6,57 @@
import com.linkedin.common.urn.UrnUtils;
import com.linkedin.datahub.graphql.QueryContext;
import com.linkedin.datahub.graphql.generated.CreateDomainInput;
+import com.linkedin.datahub.graphql.resolvers.mutate.util.DomainUtils;
import com.linkedin.domain.DomainProperties;
+import com.linkedin.entity.Aspect;
+import com.linkedin.entity.EntityResponse;
+import com.linkedin.entity.EnvelopedAspect;
+import com.linkedin.entity.EnvelopedAspectMap;
import com.linkedin.entity.client.EntityClient;
import com.linkedin.events.metadata.ChangeType;
import com.linkedin.metadata.Constants;
import com.linkedin.metadata.key.DomainKey;
+import com.linkedin.metadata.search.SearchEntity;
+import com.linkedin.metadata.search.SearchEntityArray;
+import com.linkedin.metadata.search.SearchResult;
import com.linkedin.metadata.utils.GenericRecordUtils;
import com.linkedin.metadata.entity.EntityService;
import com.linkedin.mxe.MetadataChangeProposal;
import com.linkedin.r2.RemoteInvocationException;
import graphql.schema.DataFetchingEnvironment;
+
+import java.util.HashMap;
+import java.util.Map;
import java.util.concurrent.CompletionException;
import org.mockito.Mockito;
import org.testng.annotations.Test;
import static com.linkedin.datahub.graphql.TestUtils.*;
+import static com.linkedin.metadata.Constants.DOMAIN_PROPERTIES_ASPECT_NAME;
import static org.testng.Assert.*;
public class CreateDomainResolverTest {
+ private static final Urn TEST_DOMAIN_URN = Urn.createFromTuple("domain", "test-id");
+ private static final Urn TEST_PARENT_DOMAIN_URN = Urn.createFromTuple("domain", "test-parent-id");
+
private static final CreateDomainInput TEST_INPUT = new CreateDomainInput(
"test-id",
"test-name",
- "test-description"
+ "test-description",
+ TEST_PARENT_DOMAIN_URN.toString()
+ );
+
+ private static final CreateDomainInput TEST_INPUT_NO_PARENT_DOMAIN = new CreateDomainInput(
+ "test-id",
+ "test-name",
+ "test-description",
+ null
);
+
private static final Urn TEST_ACTOR_URN = UrnUtils.getUrn("urn:li:corpuser:test");
- private static final String TEST_ENTITY_URN = "urn:li:dataset:(urn:li:dataPlatform:mysql,my-test,PROD)";
- private static final String TEST_TAG_1_URN = "urn:li:tag:test-id-1";
- private static final String TEST_TAG_2_URN = "urn:li:tag:test-id-2";
+
@Test
public void testGetSuccess() throws Exception {
@@ -43,12 +65,31 @@ public void testGetSuccess() throws Exception {
EntityService mockService = getMockEntityService();
CreateDomainResolver resolver = new CreateDomainResolver(mockClient, mockService);
+ Mockito.when(mockClient.exists(
+ Mockito.eq(TEST_DOMAIN_URN),
+ Mockito.any(Authentication.class)
+ )).thenReturn(false);
+
+ Mockito.when(mockClient.exists(
+ Mockito.eq(TEST_PARENT_DOMAIN_URN),
+ Mockito.any(Authentication.class)
+ )).thenReturn(true);
+
// Execute resolver
QueryContext mockContext = getMockAllowContext();
DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class);
Mockito.when(mockEnv.getArgument(Mockito.eq("input"))).thenReturn(TEST_INPUT);
Mockito.when(mockEnv.getContext()).thenReturn(mockContext);
+ Mockito.when(mockClient.filter(
+ Mockito.eq(Constants.DOMAIN_ENTITY_NAME),
+ Mockito.eq(DomainUtils.buildNameAndParentDomainFilter(TEST_INPUT.getName(), TEST_PARENT_DOMAIN_URN)),
+ Mockito.eq(null),
+ Mockito.any(Integer.class),
+ Mockito.any(Integer.class),
+ Mockito.any(Authentication.class)
+ )).thenReturn(new SearchResult().setEntities(new SearchEntityArray()));
+
resolver.get(mockEnv).get();
final DomainKey key = new DomainKey();
@@ -60,6 +101,7 @@ public void testGetSuccess() throws Exception {
props.setDescription("test-description");
props.setName("test-name");
props.setCreated(new AuditStamp().setActor(TEST_ACTOR_URN).setTime(0L));
+ props.setParentDomain(TEST_PARENT_DOMAIN_URN);
proposal.setAspectName(Constants.DOMAIN_PROPERTIES_ASPECT_NAME);
proposal.setAspect(GenericRecordUtils.serializeAspect(props));
proposal.setChangeType(ChangeType.UPSERT);
@@ -72,6 +114,133 @@ public void testGetSuccess() throws Exception {
);
}
+ @Test
+ public void testGetSuccessNoParentDomain() throws Exception {
+ EntityClient mockClient = Mockito.mock(EntityClient.class);
+ EntityService mockService = Mockito.mock(EntityService.class);
+ CreateDomainResolver resolver = new CreateDomainResolver(mockClient, mockService);
+
+ Mockito.when(mockClient.exists(
+ Mockito.eq(TEST_DOMAIN_URN),
+ Mockito.any(Authentication.class)
+ )).thenReturn(false);
+
+ QueryContext mockContext = getMockAllowContext();
+ DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class);
+ Mockito.when(mockEnv.getArgument(Mockito.eq("input"))).thenReturn(TEST_INPUT_NO_PARENT_DOMAIN);
+ Mockito.when(mockEnv.getContext()).thenReturn(mockContext);
+
+ Mockito.when(mockClient.filter(
+ Mockito.eq(Constants.DOMAIN_ENTITY_NAME),
+ Mockito.eq(DomainUtils.buildNameAndParentDomainFilter(TEST_INPUT.getName(), null)),
+ Mockito.eq(null),
+ Mockito.any(Integer.class),
+ Mockito.any(Integer.class),
+ Mockito.any(Authentication.class)
+ )).thenReturn(new SearchResult().setEntities(new SearchEntityArray()));
+
+ resolver.get(mockEnv).get();
+
+ final DomainKey key = new DomainKey();
+ key.setId("test-id");
+ final MetadataChangeProposal proposal = new MetadataChangeProposal();
+ proposal.setEntityKeyAspect(GenericRecordUtils.serializeAspect(key));
+ proposal.setEntityType(Constants.DOMAIN_ENTITY_NAME);
+ DomainProperties props = new DomainProperties();
+ props.setDescription("test-description");
+ props.setName("test-name");
+ props.setCreated(new AuditStamp().setActor(TEST_ACTOR_URN).setTime(0L));
+ proposal.setAspectName(Constants.DOMAIN_PROPERTIES_ASPECT_NAME);
+ proposal.setAspect(GenericRecordUtils.serializeAspect(props));
+ proposal.setChangeType(ChangeType.UPSERT);
+
+ Mockito.verify(mockClient, Mockito.times(1)).ingestProposal(
+ Mockito.argThat(new CreateDomainProposalMatcher(proposal)),
+ Mockito.any(Authentication.class),
+ Mockito.eq(false)
+ );
+ }
+
+ @Test
+ public void testGetInvalidParent() throws Exception {
+ EntityClient mockClient = Mockito.mock(EntityClient.class);
+ EntityService mockService = Mockito.mock(EntityService.class);
+ CreateDomainResolver resolver = new CreateDomainResolver(mockClient, mockService);
+
+ Mockito.when(mockClient.exists(
+ Mockito.eq(TEST_DOMAIN_URN),
+ Mockito.any(Authentication.class)
+ )).thenReturn(false);
+
+ Mockito.when(mockClient.exists(
+ Mockito.eq(TEST_PARENT_DOMAIN_URN),
+ Mockito.any(Authentication.class)
+ )).thenReturn(false);
+
+ QueryContext mockContext = getMockAllowContext();
+ DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class);
+ Mockito.when(mockEnv.getArgument(Mockito.eq("input"))).thenReturn(TEST_INPUT);
+ Mockito.when(mockEnv.getContext()).thenReturn(mockContext);
+
+ assertThrows(CompletionException.class, () -> resolver.get(mockEnv).join());
+ }
+
+ @Test
+ public void testGetNameConflict() throws Exception {
+ EntityClient mockClient = Mockito.mock(EntityClient.class);
+ EntityService mockService = Mockito.mock(EntityService.class);
+ CreateDomainResolver resolver = new CreateDomainResolver(mockClient, mockService);
+
+ Mockito.when(mockClient.exists(
+ Mockito.eq(TEST_DOMAIN_URN),
+ Mockito.any(Authentication.class)
+ )).thenReturn(false);
+
+ Mockito.when(mockClient.exists(
+ Mockito.eq(TEST_PARENT_DOMAIN_URN),
+ Mockito.any(Authentication.class)
+ )).thenReturn(true);
+
+ QueryContext mockContext = getMockAllowContext();
+ DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class);
+ Mockito.when(mockEnv.getArgument(Mockito.eq("input"))).thenReturn(TEST_INPUT);
+ Mockito.when(mockEnv.getContext()).thenReturn(mockContext);
+
+ Mockito.when(mockClient.filter(
+ Mockito.eq(Constants.DOMAIN_ENTITY_NAME),
+ Mockito.eq(DomainUtils.buildNameAndParentDomainFilter(TEST_INPUT.getName(), TEST_PARENT_DOMAIN_URN)),
+ Mockito.eq(null),
+ Mockito.any(Integer.class),
+ Mockito.any(Integer.class),
+ Mockito.any(Authentication.class)
+ )).thenReturn(new SearchResult().setEntities(
+ new SearchEntityArray(new SearchEntity().setEntity(TEST_DOMAIN_URN))
+ ));
+
+ DomainProperties domainProperties = new DomainProperties();
+ domainProperties.setDescription(TEST_INPUT.getDescription());
+ domainProperties.setName(TEST_INPUT.getName());
+ domainProperties.setCreated(new AuditStamp().setActor(TEST_ACTOR_URN).setTime(0L));
+ domainProperties.setParentDomain(TEST_PARENT_DOMAIN_URN);
+
+ EntityResponse entityResponse = new EntityResponse();
+ EnvelopedAspectMap envelopedAspectMap = new EnvelopedAspectMap();
+ envelopedAspectMap.put(DOMAIN_PROPERTIES_ASPECT_NAME, new EnvelopedAspect().setValue(new Aspect(domainProperties.data())));
+ entityResponse.setAspects(envelopedAspectMap);
+
+ Map<Urn, EntityResponse> entityResponseMap = new HashMap<>();
+ entityResponseMap.put(TEST_DOMAIN_URN, entityResponse);
+
+ Mockito.when(mockClient.batchGetV2(
+ Mockito.eq(Constants.DOMAIN_ENTITY_NAME),
+ Mockito.any(),
+ Mockito.any(),
+ Mockito.any(Authentication.class)
+ )).thenReturn(entityResponseMap);
+
+ assertThrows(CompletionException.class, () -> resolver.get(mockEnv).join());
+ }
+
@Test
public void testGetUnauthorized() throws Exception {
// Create resolver
diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/DeleteDomainResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/DeleteDomainResolverTest.java
index 1c450b0e85424d..9bcdbe6d2a0e0a 100644
--- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/DeleteDomainResolverTest.java
+++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/DeleteDomainResolverTest.java
@@ -4,6 +4,7 @@
import com.linkedin.common.urn.Urn;
import com.linkedin.datahub.graphql.QueryContext;
import com.linkedin.entity.client.EntityClient;
+import com.linkedin.metadata.search.SearchResult;
import graphql.schema.DataFetchingEnvironment;
import java.util.concurrent.CompletionException;
import org.mockito.Mockito;
@@ -28,6 +29,10 @@ public void testGetSuccess() throws Exception {
Mockito.when(mockEnv.getArgument(Mockito.eq("urn"))).thenReturn(TEST_URN);
Mockito.when(mockEnv.getContext()).thenReturn(mockContext);
+ // Domain has 0 child domains
+ Mockito.when(mockClient.filter(Mockito.eq("domain"), Mockito.any(), Mockito.any(), Mockito.eq(0), Mockito.eq(1), Mockito.any()))
+ .thenReturn(new SearchResult().setNumEntities(0));
+
assertTrue(resolver.get(mockEnv).get());
Mockito.verify(mockClient, Mockito.times(1)).deleteEntity(
@@ -36,6 +41,28 @@ public void testGetSuccess() throws Exception {
);
}
+ @Test
+ public void testDeleteWithChildDomains() throws Exception {
+ EntityClient mockClient = Mockito.mock(EntityClient.class);
+ DeleteDomainResolver resolver = new DeleteDomainResolver(mockClient);
+
+ // Execute resolver
+ QueryContext mockContext = getMockAllowContext();
+ DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class);
+ Mockito.when(mockEnv.getArgument(Mockito.eq("urn"))).thenReturn(TEST_URN);
+ Mockito.when(mockEnv.getContext()).thenReturn(mockContext);
+
+ // Domain has child domains
+ Mockito.when(mockClient.filter(Mockito.eq("domain"), Mockito.any(), Mockito.any(), Mockito.eq(0), Mockito.eq(1), Mockito.any()))
+ .thenReturn(new SearchResult().setNumEntities(1));
+
+ assertThrows(CompletionException.class, () -> resolver.get(mockEnv).join());
+
+ Mockito.verify(mockClient, Mockito.times(0)).deleteEntity(
+ Mockito.any(),
+ Mockito.any(Authentication.class));
+ }
+
@Test
public void testGetUnauthorized() throws Exception {
// Create resolver
diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/ListDomainsResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/ListDomainsResolverTest.java
index c143f3480fcff1..bd8a8f98de4974 100644
--- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/ListDomainsResolverTest.java
+++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/ListDomainsResolverTest.java
@@ -5,6 +5,7 @@
import com.linkedin.common.urn.Urn;
import com.linkedin.datahub.graphql.QueryContext;
import com.linkedin.datahub.graphql.generated.ListDomainsInput;
+import com.linkedin.datahub.graphql.resolvers.mutate.util.DomainUtils;
import com.linkedin.entity.client.EntityClient;
import com.linkedin.metadata.Constants;
import com.linkedin.metadata.query.SearchFlags;
@@ -28,9 +29,14 @@
public class ListDomainsResolverTest {
private static final Urn TEST_DOMAIN_URN = Urn.createFromTuple("domain", "test-id");
+ private static final Urn TEST_PARENT_DOMAIN_URN = Urn.createFromTuple("domain", "test-parent-id");
private static final ListDomainsInput TEST_INPUT = new ListDomainsInput(
- 0, 20, null
+ 0, 20, null, TEST_PARENT_DOMAIN_URN.toString()
+ );
+
+ private static final ListDomainsInput TEST_INPUT_NO_PARENT_DOMAIN = new ListDomainsInput(
+ 0, 20, null, null
);
@Test
@@ -41,7 +47,7 @@ public void testGetSuccess() throws Exception {
Mockito.when(mockClient.search(
Mockito.eq(Constants.DOMAIN_ENTITY_NAME),
Mockito.eq(""),
- Mockito.eq(null),
+ Mockito.eq(DomainUtils.buildParentDomainFilter(TEST_PARENT_DOMAIN_URN)),
Mockito.eq(new SortCriterion().setField(DOMAIN_CREATED_TIME_INDEX_FIELD_NAME).setOrder(SortOrder.DESCENDING)),
Mockito.eq(0),
Mockito.eq(20),
@@ -71,6 +77,44 @@ public void testGetSuccess() throws Exception {
assertEquals(resolver.get(mockEnv).get().getDomains().get(0).getUrn(), TEST_DOMAIN_URN.toString());
}
+ @Test
+ public void testGetSuccessNoParentDomain() throws Exception {
+ // Create resolver
+ EntityClient mockClient = Mockito.mock(EntityClient.class);
+
+ Mockito.when(mockClient.search(
+ Mockito.eq(Constants.DOMAIN_ENTITY_NAME),
+ Mockito.eq(""),
+ Mockito.eq(DomainUtils.buildParentDomainFilter(null)),
+ Mockito.eq(new SortCriterion().setField(DOMAIN_CREATED_TIME_INDEX_FIELD_NAME).setOrder(SortOrder.DESCENDING)),
+ Mockito.eq(0),
+ Mockito.eq(20),
+ Mockito.any(Authentication.class),
+ Mockito.eq(new SearchFlags().setFulltext(true))
+ )).thenReturn(
+ new SearchResult()
+ .setFrom(0)
+ .setPageSize(1)
+ .setNumEntities(1)
+ .setEntities(new SearchEntityArray(ImmutableSet.of(new SearchEntity().setEntity(TEST_DOMAIN_URN))))
+ );
+
+ ListDomainsResolver resolver = new ListDomainsResolver(mockClient);
+
+ // Execute resolver
+ QueryContext mockContext = getMockAllowContext();
+ DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class);
+ Mockito.when(mockEnv.getArgument(Mockito.eq("input"))).thenReturn(TEST_INPUT_NO_PARENT_DOMAIN);
+ Mockito.when(mockEnv.getContext()).thenReturn(mockContext);
+
+ // Data Assertions
+ assertEquals((int) resolver.get(mockEnv).get().getStart(), 0);
+ assertEquals((int) resolver.get(mockEnv).get().getCount(), 1);
+ assertEquals((int) resolver.get(mockEnv).get().getTotal(), 1);
+ assertEquals(resolver.get(mockEnv).get().getDomains().size(), 1);
+ assertEquals(resolver.get(mockEnv).get().getDomains().get(0).getUrn(), TEST_DOMAIN_URN.toString());
+ }
+
@Test
public void testGetUnauthorized() throws Exception {
// Create resolver
diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/MoveDomainResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/MoveDomainResolverTest.java
new file mode 100644
index 00000000000000..4059c180b0eb03
--- /dev/null
+++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/MoveDomainResolverTest.java
@@ -0,0 +1,140 @@
+package com.linkedin.datahub.graphql.resolvers.domain;
+
+import com.datahub.authentication.Authentication;
+import com.linkedin.common.AuditStamp;
+import com.linkedin.common.urn.CorpuserUrn;
+import com.linkedin.common.urn.Urn;
+import com.linkedin.datahub.graphql.QueryContext;
+import com.linkedin.datahub.graphql.generated.MoveDomainInput;
+import com.linkedin.datahub.graphql.resolvers.mutate.MoveDomainResolver;
+import com.linkedin.datahub.graphql.resolvers.mutate.MutationUtils;
+import com.linkedin.datahub.graphql.resolvers.mutate.util.DomainUtils;
+import com.linkedin.domain.DomainProperties;
+import com.linkedin.entity.client.EntityClient;
+import com.linkedin.metadata.Constants;
+import com.linkedin.metadata.entity.EntityService;
+import com.linkedin.metadata.search.SearchEntityArray;
+import com.linkedin.metadata.search.SearchResult;
+import com.linkedin.mxe.MetadataChangeProposal;
+import graphql.schema.DataFetchingEnvironment;
+import org.mockito.Mockito;
+import org.testng.annotations.Test;
+
+import java.util.concurrent.CompletionException;
+
+import static com.linkedin.datahub.graphql.TestUtils.*;
+import static com.linkedin.metadata.Constants.*;
+import static org.testng.Assert.assertThrows;
+import static org.testng.Assert.assertTrue;
+
+public class MoveDomainResolverTest {
+
+ private static final String CONTAINER_URN = "urn:li:container:00005397daf94708a8822b8106cfd451";
+ private static final String PARENT_DOMAIN_URN = "urn:li:domain:00005397daf94708a8822b8106cfd451";
+ private static final String DOMAIN_URN = "urn:li:domain:11115397daf94708a8822b8106cfd451";
+ private static final MoveDomainInput INPUT = new MoveDomainInput(PARENT_DOMAIN_URN, DOMAIN_URN);
+ private static final MoveDomainInput INVALID_INPUT = new MoveDomainInput(CONTAINER_URN, DOMAIN_URN);
+ private static final CorpuserUrn TEST_ACTOR_URN = new CorpuserUrn("test");
+
+ private MetadataChangeProposal setupTests(DataFetchingEnvironment mockEnv, EntityService mockService, EntityClient mockClient) throws Exception {
+ QueryContext mockContext = getMockAllowContext();
+ Mockito.when(mockContext.getAuthentication()).thenReturn(Mockito.mock(Authentication.class));
+ Mockito.when(mockContext.getActorUrn()).thenReturn(TEST_ACTOR_URN.toString());
+ Mockito.when(mockEnv.getContext()).thenReturn(mockContext);
+
+ final String name = "test name";
+ Mockito.when(mockService.getAspect(
+ Urn.createFromString(DOMAIN_URN),
+ Constants.DOMAIN_PROPERTIES_ASPECT_NAME,
+ 0))
+ .thenReturn(new DomainProperties().setName(name));
+
+ Mockito.when(mockClient.filter(
+ Mockito.eq(Constants.DOMAIN_ENTITY_NAME),
+ Mockito.eq(DomainUtils.buildNameAndParentDomainFilter(name, Urn.createFromString(PARENT_DOMAIN_URN))),
+ Mockito.eq(null),
+ Mockito.any(Integer.class),
+ Mockito.any(Integer.class),
+ Mockito.any(Authentication.class)
+ )).thenReturn(new SearchResult().setEntities(new SearchEntityArray()));
+
+ DomainProperties properties = new DomainProperties();
+ properties.setName(name);
+ properties.setParentDomain(Urn.createFromString(PARENT_DOMAIN_URN));
+ return MutationUtils.buildMetadataChangeProposalWithUrn(Urn.createFromString(DOMAIN_URN),
+ DOMAIN_PROPERTIES_ASPECT_NAME, properties);
+ }
+
+ @Test
+ public void testGetSuccess() throws Exception {
+ EntityService mockService = Mockito.mock(EntityService.class);
+ EntityClient mockClient = Mockito.mock(EntityClient.class);
+ Mockito.when(mockService.exists(Urn.createFromString(PARENT_DOMAIN_URN))).thenReturn(true);
+ DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class);
+ Mockito.when(mockEnv.getArgument("input")).thenReturn(INPUT);
+
+ MoveDomainResolver resolver = new MoveDomainResolver(mockService, mockClient);
+ setupTests(mockEnv, mockService, mockClient);
+
+ assertTrue(resolver.get(mockEnv).get());
+ Mockito.verify(mockService, Mockito.times(1)).ingestProposal(
+ Mockito.any(MetadataChangeProposal.class),
+ Mockito.any(AuditStamp.class),
+ Mockito.eq(false)
+ );
+ }
+
+ @Test
+ public void testGetFailureEntityDoesNotExist() throws Exception {
+ EntityService mockService = Mockito.mock(EntityService.class);
+ EntityClient mockClient = Mockito.mock(EntityClient.class);
+ Mockito.when(mockService.exists(Urn.createFromString(PARENT_DOMAIN_URN))).thenReturn(true);
+ DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class);
+ Mockito.when(mockEnv.getArgument("input")).thenReturn(INPUT);
+
+ QueryContext mockContext = getMockAllowContext();
+ Mockito.when(mockContext.getAuthentication()).thenReturn(Mockito.mock(Authentication.class));
+ Mockito.when(mockContext.getActorUrn()).thenReturn(TEST_ACTOR_URN.toString());
+ Mockito.when(mockEnv.getContext()).thenReturn(mockContext);
+
+ Mockito.when(mockService.getAspect(
+ Urn.createFromString(DOMAIN_URN),
+ DOMAIN_PROPERTIES_ASPECT_NAME,
+ 0))
+ .thenReturn(null);
+
+ MoveDomainResolver resolver = new MoveDomainResolver(mockService, mockClient);
+ assertThrows(CompletionException.class, () -> resolver.get(mockEnv).join());
+ verifyNoIngestProposal(mockService);
+ }
+
+ @Test
+ public void testGetFailureParentDoesNotExist() throws Exception {
+ EntityService mockService = Mockito.mock(EntityService.class);
+ EntityClient mockClient = Mockito.mock(EntityClient.class);
+ Mockito.when(mockService.exists(Urn.createFromString(PARENT_DOMAIN_URN))).thenReturn(false);
+ DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class);
+ Mockito.when(mockEnv.getArgument("input")).thenReturn(INPUT);
+
+ MoveDomainResolver resolver = new MoveDomainResolver(mockService, mockClient);
+ setupTests(mockEnv, mockService, mockClient);
+
+ assertThrows(CompletionException.class, () -> resolver.get(mockEnv).join());
+ verifyNoIngestProposal(mockService);
+ }
+
+ @Test
+ public void testGetFailureParentIsNotDomain() throws Exception {
+ EntityService mockService = Mockito.mock(EntityService.class);
+ EntityClient mockClient = Mockito.mock(EntityClient.class);
+ Mockito.when(mockService.exists(Urn.createFromString(PARENT_DOMAIN_URN))).thenReturn(true);
+ DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class);
+ Mockito.when(mockEnv.getArgument("input")).thenReturn(INVALID_INPUT);
+
+ MoveDomainResolver resolver = new MoveDomainResolver(mockService, mockClient);
+ setupTests(mockEnv, mockService, mockClient);
+
+ assertThrows(CompletionException.class, () -> resolver.get(mockEnv).join());
+ verifyNoIngestProposal(mockService);
+ }
+}
diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/ParentDomainsResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/ParentDomainsResolverTest.java
new file mode 100644
index 00000000000000..7bd7c3afac001c
--- /dev/null
+++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/ParentDomainsResolverTest.java
@@ -0,0 +1,95 @@
+package com.linkedin.datahub.graphql.resolvers.domain;
+
+import com.datahub.authentication.Authentication;
+import com.linkedin.common.urn.Urn;
+import com.linkedin.datahub.graphql.QueryContext;
+import com.linkedin.datahub.graphql.generated.Domain;
+import com.linkedin.datahub.graphql.generated.EntityType;
+import com.linkedin.datahub.graphql.generated.ParentDomainsResult;
+import com.linkedin.domain.DomainProperties;
+import com.linkedin.entity.Aspect;
+import com.linkedin.entity.EntityResponse;
+import com.linkedin.entity.EnvelopedAspect;
+import com.linkedin.entity.EnvelopedAspectMap;
+import com.linkedin.entity.client.EntityClient;
+import graphql.schema.DataFetchingEnvironment;
+import org.mockito.Mockito;
+import org.testng.annotations.Test;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+import static com.linkedin.metadata.Constants.*;
+import static org.testng.Assert.assertEquals;
+
+/**
+ * Unit test for {@link ParentDomainsResolver}, run against a mocked {@link EntityClient}.
+ */
+public class ParentDomainsResolverTest {
+ /**
+ * Resolves the ancestors of a domain whose parent chain is two levels deep
+ * (source domain -> parent 1 -> parent 2, where parent 2 has no parent) and checks that
+ * both ancestors are returned, nearest first, after exactly three {@code getV2} lookups.
+ */
+ @Test
+ public void testGetSuccessForDomain() throws Exception {
+ EntityClient mockClient = Mockito.mock(EntityClient.class);
+ QueryContext mockContext = Mockito.mock(QueryContext.class);
+ Mockito.when(mockContext.getAuthentication()).thenReturn(Mockito.mock(Authentication.class));
+ DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class);
+ Mockito.when(mockEnv.getContext()).thenReturn(mockContext);
+
+ // The source entity the resolver is invoked on.
+ Urn domainUrn = Urn.createFromString("urn:li:domain:00005397daf94708a8822b8106cfd451");
+ Domain domainEntity = new Domain();
+ domainEntity.setUrn(domainUrn.toString());
+ domainEntity.setType(EntityType.DOMAIN);
+ Mockito.when(mockEnv.getSource()).thenReturn(domainEntity);
+
+ // These properties objects mainly carry the URNs of the two ancestors;
+ // parentDomain1 is also used below as the source domain's own properties aspect.
+ final DomainProperties parentDomain1 = new DomainProperties().setParentDomain(Urn.createFromString(
+ "urn:li:domain:11115397daf94708a8822b8106cfd451")
+ ).setName("test def");
+ final DomainProperties parentDomain2 = new DomainProperties().setParentDomain(Urn.createFromString(
+ "urn:li:domain:22225397daf94708a8822b8106cfd451")
+ ).setName("test def 2");
+
+ // Source domain: its properties point at parent 1.
+ Map<String, EnvelopedAspect> domainAspects = new HashMap<>();
+ domainAspects.put(DOMAIN_PROPERTIES_ASPECT_NAME, new EnvelopedAspect().setValue(new Aspect(parentDomain1.data())));
+
+ // Parent 1: its properties point at parent 2.
+ Map<String, EnvelopedAspect> parentDomain1Aspects = new HashMap<>();
+ parentDomain1Aspects.put(DOMAIN_PROPERTIES_ASPECT_NAME, new EnvelopedAspect().setValue(new Aspect(
+ new DomainProperties().setName("domain parent 1").setParentDomain(parentDomain2.getParentDomain()).data()
+ )));
+
+ // Parent 2: no parent set, so the chain ends here.
+ Map<String, EnvelopedAspect> parentDomain2Aspects = new HashMap<>();
+ parentDomain2Aspects.put(DOMAIN_PROPERTIES_ASPECT_NAME, new EnvelopedAspect().setValue(new Aspect(
+ new DomainProperties().setName("domain parent 2").data()
+ )));
+
+ Mockito.when(mockClient.getV2(
+ Mockito.eq(domainUrn.getEntityType()),
+ Mockito.eq(domainUrn),
+ Mockito.eq(Collections.singleton(DOMAIN_PROPERTIES_ASPECT_NAME)),
+ Mockito.any(Authentication.class)
+ )).thenReturn(new EntityResponse().setAspects(new EnvelopedAspectMap(domainAspects)));
+
+ Mockito.when(mockClient.getV2(
+ Mockito.eq(parentDomain1.getParentDomain().getEntityType()),
+ Mockito.eq(parentDomain1.getParentDomain()),
+ Mockito.eq(Collections.singleton(DOMAIN_PROPERTIES_ASPECT_NAME)),
+ Mockito.any(Authentication.class)
+ )).thenReturn(new EntityResponse().setAspects(new EnvelopedAspectMap(parentDomain1Aspects)));
+
+ Mockito.when(mockClient.getV2(
+ Mockito.eq(parentDomain2.getParentDomain().getEntityType()),
+ Mockito.eq(parentDomain2.getParentDomain()),
+ Mockito.eq(Collections.singleton(DOMAIN_PROPERTIES_ASPECT_NAME)),
+ Mockito.any(Authentication.class)
+ )).thenReturn(new EntityResponse().setAspects(new EnvelopedAspectMap(parentDomain2Aspects)));
+
+ ParentDomainsResolver resolver = new ParentDomainsResolver(mockClient);
+ ParentDomainsResult result = resolver.get(mockEnv).get();
+
+ // One lookup for the source domain plus one per ancestor.
+ Mockito.verify(mockClient, Mockito.times(3)).getV2(
+ Mockito.any(),
+ Mockito.any(),
+ Mockito.any(),
+ Mockito.any()
+ );
+ assertEquals(result.getCount(), 2);
+ assertEquals(result.getDomains().get(0).getUrn(), parentDomain1.getParentDomain().toString());
+ assertEquals(result.getDomains().get(1).getUrn(), parentDomain2.getParentDomain().toString());
+ }
+}
\ No newline at end of file
diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/glossary/UpdateNameResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/glossary/UpdateNameResolverTest.java
index 064e2dd3bd59b4..eee9cfbae8fcb2 100644
--- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/glossary/UpdateNameResolverTest.java
+++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/glossary/UpdateNameResolverTest.java
@@ -8,12 +8,15 @@
import com.linkedin.datahub.graphql.generated.UpdateNameInput;
import com.linkedin.datahub.graphql.resolvers.mutate.MutationUtils;
import com.linkedin.datahub.graphql.resolvers.mutate.UpdateNameResolver;
+import com.linkedin.datahub.graphql.resolvers.mutate.util.DomainUtils;
import com.linkedin.domain.DomainProperties;
import com.linkedin.entity.client.EntityClient;
import com.linkedin.glossary.GlossaryNodeInfo;
import com.linkedin.glossary.GlossaryTermInfo;
import com.linkedin.metadata.Constants;
import com.linkedin.metadata.entity.EntityService;
+import com.linkedin.metadata.search.SearchEntityArray;
+import com.linkedin.metadata.search.SearchResult;
import com.linkedin.mxe.MetadataChangeProposal;
import graphql.schema.DataFetchingEnvironment;
import org.mockito.Mockito;
@@ -121,6 +124,15 @@ public void testGetSuccessForDomain() throws Exception {
0))
.thenReturn(new DomainProperties().setName(name));
+ Mockito.when(mockClient.filter(
+ Mockito.eq(Constants.DOMAIN_ENTITY_NAME),
+ Mockito.eq(DomainUtils.buildNameAndParentDomainFilter(INPUT_FOR_DOMAIN.getName(), null)),
+ Mockito.eq(null),
+ Mockito.any(Integer.class),
+ Mockito.any(Integer.class),
+ Mockito.any(Authentication.class)
+ )).thenReturn(new SearchResult().setEntities(new SearchEntityArray()));
+
DomainProperties properties = new DomainProperties();
properties.setName(NEW_NAME);
final MetadataChangeProposal proposal = MutationUtils.buildMetadataChangeProposalWithUrn(Urn.createFromString(DOMAIN_URN),
diff --git a/datahub-web-react/src/app/SearchRoutes.tsx b/datahub-web-react/src/app/SearchRoutes.tsx
index 82606befd2663f..d2ad4ab6f4db19 100644
--- a/datahub-web-react/src/app/SearchRoutes.tsx
+++ b/datahub-web-react/src/app/SearchRoutes.tsx
@@ -8,20 +8,27 @@ import { EntityPage } from './entity/EntityPage';
import { BrowseResultsPage } from './browse/BrowseResultsPage';
import { SearchPage } from './search/SearchPage';
import { AnalyticsPage } from './analyticsDashboard/components/AnalyticsPage';
-import { ManageDomainsPage } from './domain/ManageDomainsPage';
import { ManageIngestionPage } from './ingest/ManageIngestionPage';
import GlossaryRoutes from './glossary/GlossaryRoutes';
import { SettingsPage } from './settings/SettingsPage';
+import DomainRoutes from './domain/DomainRoutes';
+import { useIsNestedDomainsEnabled } from './useAppConfig';
+import { ManageDomainsPage } from './domain/ManageDomainsPage';
/**
* Container for all searchable page routes
*/
export const SearchRoutes = (): JSX.Element => {
const entityRegistry = useEntityRegistry();
+ const isNestedDomainsEnabled = useIsNestedDomainsEnabled();
+ const entities = isNestedDomainsEnabled
+ ? entityRegistry.getEntitiesForSearchRoutes()
+ : entityRegistry.getNonGlossaryEntities();
+
return (
- {entityRegistry.getNonGlossaryEntities().map((entity) => (
+ {entities.map((entity) => (
{
/>
} />
} />
- } />
+ {isNestedDomainsEnabled && } />}
+ {!isNestedDomainsEnabled && } />}
} />
} />
} />
diff --git a/datahub-web-react/src/app/analytics/event.ts b/datahub-web-react/src/app/analytics/event.ts
index 84173b522fb072..28cd61ff3171a9 100644
--- a/datahub-web-react/src/app/analytics/event.ts
+++ b/datahub-web-react/src/app/analytics/event.ts
@@ -55,6 +55,7 @@ export enum EventType {
ShowStandardHomepageEvent,
CreateGlossaryEntityEvent,
CreateDomainEvent,
+ MoveDomainEvent,
CreateIngestionSourceEvent,
UpdateIngestionSourceEvent,
DeleteIngestionSourceEvent,
@@ -454,6 +455,13 @@ export interface CreateGlossaryEntityEvent extends BaseEvent {
export interface CreateDomainEvent extends BaseEvent {
type: EventType.CreateDomainEvent;
+ parentDomainUrn?: string;
+}
+
+/**
+ * Analytics event tagged with EventType.MoveDomainEvent. Both URN fields are optional.
+ * NOTE(review): presumably oldParentDomainUrn / parentDomainUrn are the domain's parent before
+ * and after the move -- confirm against the code that emits this event.
+ */
+export interface MoveDomainEvent extends BaseEvent {
+ type: EventType.MoveDomainEvent;
+ oldParentDomainUrn?: string;
+ parentDomainUrn?: string;
}
// Managed Ingestion Events
@@ -653,6 +661,7 @@ export type Event =
| ShowStandardHomepageEvent
| CreateGlossaryEntityEvent
| CreateDomainEvent
+ | MoveDomainEvent
| CreateIngestionSourceEvent
| UpdateIngestionSourceEvent
| DeleteIngestionSourceEvent
diff --git a/datahub-web-react/src/app/domain/CreateDomainModal.tsx b/datahub-web-react/src/app/domain/CreateDomainModal.tsx
index 9fd24b551c0afa..ca1bc305960035 100644
--- a/datahub-web-react/src/app/domain/CreateDomainModal.tsx
+++ b/datahub-web-react/src/app/domain/CreateDomainModal.tsx
@@ -5,9 +5,12 @@ import { useCreateDomainMutation } from '../../graphql/domain.generated';
import { useEnterKeyListener } from '../shared/useEnterKeyListener';
import { validateCustomUrnId } from '../shared/textUtil';
import analytics, { EventType } from '../analytics';
+import DomainParentSelect from '../entity/shared/EntityDropdown/DomainParentSelect';
+import { useIsNestedDomainsEnabled } from '../useAppConfig';
+import { useDomainsContext } from './DomainsContext';
const SuggestedNamesGroup = styled.div`
- margin-top: 12px;
+ margin-top: 8px;
`;
const ClickableTag = styled(Tag)`
@@ -16,9 +19,38 @@ const ClickableTag = styled(Tag)`
}
`;
+/** Ant Design Form.Item with the label's bottom padding reduced to 2px. */
+const FormItem = styled(Form.Item)`
+ .ant-form-item-label {
+ padding-bottom: 2px;
+ }
+`;
+
+/** FormItem variant with a 16px bottom margin. */
+const FormItemWithMargin = styled(FormItem)`
+ margin-bottom: 16px;
+`;
+
+/** FormItem variant with no bottom margin. */
+const FormItemNoMargin = styled(FormItem)`
+ margin-bottom: 0;
+`;
+
+/** Semi-bold (600) dark text, styled from Typography.Text. */
+const FormItemLabel = styled(Typography.Text)`
+ font-weight: 600;
+ color: #373d44;
+`;
+
+/** Same colour as FormItemLabel but without the heavier font weight. */
+const AdvancedLabel = styled(Typography.Text)`
+ color: #373d44;
+`;
+
type Props = {
onClose: () => void;
- onCreate: (urn: string, id: string | undefined, name: string, description: string | undefined) => void;
+ onCreate: (
+ urn: string,
+ id: string | undefined,
+ name: string,
+ description: string | undefined,
+ parentDomain?: string,
+ ) => void;
};
const SUGGESTED_DOMAIN_NAMES = ['Engineering', 'Marketing', 'Sales', 'Product'];
@@ -28,7 +60,12 @@ const NAME_FIELD_NAME = 'name';
const DESCRIPTION_FIELD_NAME = 'description';
export default function CreateDomainModal({ onClose, onCreate }: Props) {
+ const isNestedDomainsEnabled = useIsNestedDomainsEnabled();
const [createDomainMutation] = useCreateDomainMutation();
+ const { entityData } = useDomainsContext();
+ const [selectedParentUrn, setSelectedParentUrn] = useState(
+ (isNestedDomainsEnabled && entityData?.urn) || '',
+ );
const [createButtonEnabled, setCreateButtonEnabled] = useState(false);
const [form] = Form.useForm();
@@ -39,6 +76,7 @@ export default function CreateDomainModal({ onClose, onCreate }: Props) {
id: form.getFieldValue(ID_FIELD_NAME),
name: form.getFieldValue(NAME_FIELD_NAME),
description: form.getFieldValue(DESCRIPTION_FIELD_NAME),
+ parentDomain: selectedParentUrn || undefined,
},
},
})
@@ -46,6 +84,7 @@ export default function CreateDomainModal({ onClose, onCreate }: Props) {
if (!errors) {
analytics.event({
type: EventType.CreateDomainEvent,
+ parentDomainUrn: selectedParentUrn || undefined,
});
message.success({
content: `Created domain!`,
@@ -56,6 +95,7 @@ export default function CreateDomainModal({ onClose, onCreate }: Props) {
form.getFieldValue(ID_FIELD_NAME),
form.getFieldValue(NAME_FIELD_NAME),
form.getFieldValue(DESCRIPTION_FIELD_NAME),
+ selectedParentUrn || undefined,
);
form.resetFields();
}
@@ -74,7 +114,7 @@ export default function CreateDomainModal({ onClose, onCreate }: Props) {
return (
field.errors.length > 0));
}}
>
- Name}>
- Give your new Domain a name.
- Parent (optional)}>
+
+
+ )}
+ Name}>
+
-
+
{SUGGESTED_DOMAIN_NAMES.map((name) => {
return (
@@ -134,29 +181,29 @@ export default function CreateDomainModal({ onClose, onCreate }: Props) {
);
})}
-
- Description}>
-
- An optional description for your new domain. You can change this later.
-
-
+ Description}
+ help="You can always change the description later."
+ >
+
-
-
+
+
- Advanced} key="1">
- Domain Id}>
-
- By default, a random UUID will be generated to uniquely identify this domain. If
- you'd like to provide a custom id instead to more easily keep track of this domain,
+ Advanced Options} key="1">
+ Domain Id}
+ help="By default, a random UUID will be generated to uniquely identify this domain. If
+ you'd like to provide a custom id instead to more easily keep track of this domain,
you may provide it here. Be careful, you cannot easily change the domain id after
- creation.
-
-
+ ({
@@ -170,8 +217,8 @@ export default function CreateDomainModal({ onClose, onCreate }: Props) {
]}
>
-
-
+
+
diff --git a/datahub-web-react/src/app/domain/DomainIcon.tsx b/datahub-web-react/src/app/domain/DomainIcon.tsx
new file mode 100644
index 00000000000000..0fe9892f0c281f
--- /dev/null
+++ b/datahub-web-react/src/app/domain/DomainIcon.tsx
@@ -0,0 +1,11 @@
+import Icon from '@ant-design/icons/lib/components/Icon';
+import React from 'react';
+import { ReactComponent as DomainsIcon } from '../../images/domain.svg';
+
+type Props = {
+ style?: React.CSSProperties;
+};
+
+/**
+ * Domain icon component; accepts an optional inline `style`.
+ * NOTE(review): the returned JSX is not visible here (`return ;`). It presumably renders the
+ * imported domain.svg (DomainsIcon) through the Ant Design Icon component with `style` applied --
+ * confirm against the original file.
+ */
+export default function DomainIcon({ style }: Props) {
+ return ;
+}
diff --git a/datahub-web-react/src/app/domain/DomainRoutes.tsx b/datahub-web-react/src/app/domain/DomainRoutes.tsx
new file mode 100644
index 00000000000000..56811ddc48c0c6
--- /dev/null
+++ b/datahub-web-react/src/app/domain/DomainRoutes.tsx
@@ -0,0 +1,39 @@
+import React, { useState } from 'react';
+import styled from 'styled-components/macro';
+import { Switch, Route } from 'react-router-dom';
+import { PageRoutes } from '../../conf/Global';
+import { EntityPage } from '../entity/EntityPage';
+import { useEntityRegistry } from '../useEntityRegistry';
+import ManageDomainsPageV2 from './nestedDomains/ManageDomainsPageV2';
+import { EntityType } from '../../types.generated';
+import ManageDomainsSidebar from './nestedDomains/ManageDomainsSidebar';
+import { DomainsContext } from './DomainsContext';
+import { GenericEntityProperties } from '../entity/shared/types';
+
+/** Flex container that grows to fill its flex parent (flex: 1) and hides overflowing content. */
+const ContentWrapper = styled.div`
+ display: flex;
+ flex: 1;
+ overflow: hidden;
+`;
+
+/**
+ * Route container for the domains pages. Owns two pieces of state: `entityData` (initially null)
+ * and `parentDomainsToUpdate` (initially an empty array).
+ * NOTE(review): the JSX markup in the return statement is not visible here. Judging by this
+ * file's imports it presumably provides the state through DomainsContext and renders
+ * ManageDomainsSidebar next to a Switch routing to EntityPage / ManageDomainsPageV2 -- confirm
+ * against the original file.
+ */
+export default function DomainRoutes() {
+ const entityRegistry = useEntityRegistry();
+ const [entityData, setEntityData] = useState(null);
+ const [parentDomainsToUpdate, setParentDomainsToUpdate] = useState([]);
+
+ return (
+
+
+
+
+ }
+ />
+ } />
+
+
+
+ );
+}
diff --git a/datahub-web-react/src/app/domain/DomainSearch.tsx b/datahub-web-react/src/app/domain/DomainSearch.tsx
new file mode 100644
index 00000000000000..e82dae9c2c9e69
--- /dev/null
+++ b/datahub-web-react/src/app/domain/DomainSearch.tsx
@@ -0,0 +1,143 @@
+import React, { CSSProperties, useRef, useState } from 'react';
+import { Link } from 'react-router-dom';
+import styled from 'styled-components/macro';
+import Highlight from 'react-highlighter';
+import { useGetSearchResultsForMultipleQuery } from '../../graphql/search.generated';
+import { EntityType } from '../../types.generated';
+import { IconStyleType } from '../entity/Entity';
+import { ANTD_GRAY } from '../entity/shared/constants';
+import { SearchBar } from '../search/SearchBar';
+import ClickOutside from '../shared/ClickOutside';
+import { useEntityRegistry } from '../useEntityRegistry';
+import DomainIcon from './DomainIcon';
+import ParentEntities from '../search/filters/ParentEntities';
+import { getParentDomains } from './utils';
+
+/**
+ * Relatively positioned wrapper.
+ * NOTE(review): presumably the positioning context for the absolutely positioned results
+ * panel -- the nesting is not visible here; confirm.
+ */
+const DomainSearchWrapper = styled.div`
+ position: relative;
+`;
+
+/**
+ * Absolutely positioned, scrollable panel for search results: white background, rounded
+ * corners and a drop shadow, inset 12px from the left with matching width, capped at 210px tall.
+ * Each property is declared exactly once so the effective height limit is unambiguous.
+ */
+const ResultsWrapper = styled.div`
+ background-color: white;
+ border-radius: 5px;
+ box-shadow: 0 3px 6px -4px rgb(0 0 0 / 12%), 0 6px 16px 0 rgb(0 0 0 / 8%), 0 9px 28px 8px rgb(0 0 0 / 5%);
+ padding: 8px;
+ position: absolute;
+ max-height: 210px;
+ overflow: auto;
+ width: calc(100% - 24px);
+ left: 12px;
+ top: 45px;
+ z-index: 1;
+`;
+
+/**
+ * A router Link styled as a full-width flex row (items vertically centred, 8px gap).
+ * On hover the background changes to ANTD_GRAY[3] while the text colour stays #262626.
+ */
+const SearchResult = styled(Link)`
+ color: #262626;
+ display: flex;
+ align-items: center;
+ gap: 8px;
+ height: 100%;
+ padding: 6px 8px;
+ width: 100%;
+ &:hover {
+ background-color: ${ANTD_GRAY[3]};
+ color: #262626;
+ }
+`;
+
+/** Unstyled span; a named wrapper with no CSS of its own. */
+const IconWrapper = styled.span``;
+
+/**
+ * Inline style that renders text bold with no background and no padding.
+ * NOTE(review): presumably passed to react-highlighter's Highlight as the style for matched
+ * text -- the usage is not visible here; confirm.
+ */
+const highlightMatchStyle: CSSProperties = {
+ fontWeight: 'bold',
+ background: 'none',
+ padding: 0,
+};
+
+function DomainSearch() {
+ const [query, setQuery] = useState('');
+ const [isSearchBarFocused, setIsSearchBarFocused] = useState(false);
+ const entityRegistry = useEntityRegistry();
+
+ const { data } = useGetSearchResultsForMultipleQuery({
+ variables: {
+ input: {
+ types: [EntityType.Domain],
+ query,
+ start: 0,
+ count: 50,
+ },
+ },
+ skip: !query,
+ });
+
+ const searchResults = data?.searchAcrossEntities?.searchResults;
+ const timerRef = useRef(-1);
+ const handleQueryChange = (q: string) => {
+ window.clearTimeout(timerRef.current);
+ timerRef.current = window.setTimeout(() => {
+ setQuery(q);
+ }, 250);
+ };
+
+ return (
+
+ setIsSearchBarFocused(false)}>
+ null}
+ onQueryChange={(q) => handleQueryChange(q)}
+ entityRegistry={entityRegistry}
+ onFocus={() => setIsSearchBarFocused(true)}
+ />
+ {isSearchBarFocused && searchResults && !!searchResults.length && (
+
+ {searchResults.map((result) => {
+ return (
+ setIsSearchBarFocused(false)}
+ >
+
+ {result.entity.type === EntityType.Domain ? (
+
+ ) : (
+ entityRegistry.getIcon(result.entity.type, 12, IconStyleType.ACCENT)
+ )}
+
+