From 405a223b58b30209bc01e89c3a87094efe89117b Mon Sep 17 00:00:00 2001 From: Christian Kasim Loan Date: Thu, 24 Oct 2024 00:04:39 +0700 Subject: [PATCH] Release/550 (#1567) * spark-nlp-jsl==5.5.0 requirements (#1544) * bump versions * implemented spark-nlp-jsl==550 requirements --------- Co-authored-by: C-K-Loan * bump pydantic support to > 2.X, various refactors and deprecation fixes (#1560) * 550 BugFix (#1563) * fix listener config if nlp=False * update conditions * fix deprecated @validator decerator (#1565) * Pydantic2 migration fix env var model parsing (#1566) * fix deprecated @validator decerator * fix env var pydantic model creation bug * bump verisons --------- Co-authored-by: Mehmet Butgul <109360261+mehmetbutgul@users.noreply.github.com> --- docs/en/jsl/jsl_release_notes.md | 21 +++++++ johnsnowlabs/auto_install/jsl_home.py | 27 +++++--- johnsnowlabs/finance.py | 2 + johnsnowlabs/legal.py | 2 + johnsnowlabs/medical.py | 3 + johnsnowlabs/py_models/install_info.py | 26 ++++++-- johnsnowlabs/py_models/jsl_secrets.py | 80 ++++++++++++++++-------- johnsnowlabs/py_models/lib_version.py | 3 + johnsnowlabs/settings.py | 10 +-- johnsnowlabs/utils/sparksession_utils.py | 7 ++- setup_johnsnowlabs.py | 2 +- setup_johnsnowlabs_for_databricks.py | 2 +- 12 files changed, 138 insertions(+), 47 deletions(-) diff --git a/docs/en/jsl/jsl_release_notes.md b/docs/en/jsl/jsl_release_notes.md index 83c2e70a6e..54a297e4c1 100644 --- a/docs/en/jsl/jsl_release_notes.md +++ b/docs/en/jsl/jsl_release_notes.md @@ -16,6 +16,27 @@ sidebar: See [Github Releases](https://github.com/JohnSnowLabs/johnsnowlabs/releases) for detailed information on Release History and Features + +## 5.5.0 +Release date: 10-23-2024 + +The John Snow Labs 5.5.0 Library released with the following pre-installed and recommended dependencies + +{:.table-model-big} +| Library | Version | +|-----------------------------------------------------------------------------------------|------------| +| [Visual NLP](https://nlp.johnsnowlabs.com/docs/en/spark_ocr_versions/ocr_release_notes) | `5.4.1` | +| [Enterprise NLP](https://nlp.johnsnowlabs.com/docs/en/licensed_annotators) | `5.5.0` | +| [Finance NLP](https://nlp.johnsnowlabs.com/docs/en/financial_release_notes) | `1.X.X` | +| [Legal NLP](https://nlp.johnsnowlabs.com/docs/en/legal_release_notes) | `1.X.X` | +| [NLU](https://github.com/JohnSnowLabs/nlu/releases) | `5.4.1` | +| [Spark-NLP-Display](https://sparknlp.org/docs/en/display) | `5.0` | +| [Spark-NLP](https://github.com/JohnSnowLabs/spark-nlp/releases/) | `5.5.0` | +| [Pyspark](https://spark.apache.org/docs/latest/api/python/) | `3.4.0` | + + + + ## 5.4.5 Release date: 9-27-2024 diff --git a/johnsnowlabs/auto_install/jsl_home.py b/johnsnowlabs/auto_install/jsl_home.py index b33a8584a3..88fe5a48e9 100644 --- a/johnsnowlabs/auto_install/jsl_home.py +++ b/johnsnowlabs/auto_install/jsl_home.py @@ -48,13 +48,15 @@ def download_deps_and_create_info( overwrite=False, ): """Download a list of deps to given lib_dir folder and creates info_file at info_file_path.""" - info, old_info = {}, {} + info, old_info = {}, None if os.path.exists(info_file_path): # keep old infos, we assume they are up-to-date and compatible - old_info = InstallFolder.parse_file(info_file_path) + if os.path.join("java_installs","info.json") in info_file_path: + old_info = InstallFolder.java_folder_from_home() + elif os.path.join("py_installs","info.json") in info_file_path: + old_info = InstallFolder.py_folder_from_home() for p in deps: - # print_prefix = Software.for_name(p.product_name).logo print_prefix = ProductLogo.from_name(p.product_name.name).value if p.dependency_type in JvmHardwareTarget: @@ -88,12 +90,19 @@ def download_deps_and_create_info( install_type=p.dependency_type.value, product_version=p.dependency_version.as_str(), ) + info[p.file_name].compatible_spark_version = p.spark_version.value.as_str() + info[p.file_name].product_version = p.dependency_version.as_str() + if info: info = InstallFolder(**{"infos": info}) if old_info: info.infos.update(old_info.infos) - info.write(info_file_path, indent=4) + with open(info_file_path, "w") as f: + for k, v in info.infos.items(): + v.product_version = str(v.product_version) + v.compatible_spark_version = str(v.compatible_spark_version) + f.write(info.model_dump_json()) def setup_jsl_home( secrets: Optional[JslSecrets] = None, @@ -183,9 +192,10 @@ def setup_jsl_home( java_deps, settings.java_dir, settings.java_info_file, overwrite ) - RootInfo(version=settings.raw_version_jsl_lib, run_from=sys.executable).write( - settings.root_info_file, indent=4 - ) + root_info = RootInfo(version=settings.raw_version_jsl_lib, run_from=sys.executable) + root_info.version = root_info.version.as_str() + with open(settings.root_info_file, "w") as f: + f.write(root_info.model_dump_json()) print(f"🙆 JSL Home setup in {settings.root_dir}") return @@ -258,9 +268,8 @@ def get_install_suite_from_jsl_home( if os.path.exists(settings.py_info_file): py_folder = InstallFolder.py_folder_from_home() - info = RootInfo.parse_file(settings.root_info_file) + info = RootInfo.get_from_jsl_home() # Read all dependencies from local ~/.johnsnowlabs folder - suite = InstallSuite( nlp=LocalPy4JLib( java_lib=java_folder.get_product_entry(ProductName.nlp, jvm_hardware_target) diff --git a/johnsnowlabs/finance.py b/johnsnowlabs/finance.py index 1849a33681..e8499774bc 100644 --- a/johnsnowlabs/finance.py +++ b/johnsnowlabs/finance.py @@ -123,6 +123,8 @@ LargeFewShotClassifierModel, Mapper2Chunk, DocumentFiltererByNER, + REChunkMerger, + ContextualEntityFilterer, ) from sparknlp_jsl.modelTracer import ModelTracer diff --git a/johnsnowlabs/legal.py b/johnsnowlabs/legal.py index f7844cc335..e2ee4c8cbd 100644 --- a/johnsnowlabs/legal.py +++ b/johnsnowlabs/legal.py @@ -121,6 +121,8 @@ LargeFewShotClassifierModel, Mapper2Chunk, DocumentFiltererByNER, + REChunkMerger, + ContextualEntityFilterer, ) from sparknlp_jsl.modelTracer import ModelTracer from sparknlp_jsl.pipeline_tracer import PipelineTracer diff --git a/johnsnowlabs/medical.py b/johnsnowlabs/medical.py index f0258511be..be87cb800d 100644 --- a/johnsnowlabs/medical.py +++ b/johnsnowlabs/medical.py @@ -98,6 +98,8 @@ LargeFewShotClassifierModel, Mapper2Chunk, DocumentFiltererByNER, + REChunkMerger, + ContextualEntityFilterer, ) from sparknlp_jsl.structured_deidentification import StructuredDeidentification from sparknlp_jsl.text_to_documents_columns import TextToDocumentsColumns @@ -130,6 +132,7 @@ TextMatcherInternalModel as TextMatcherModel, RegexMatcherInternal as RegexMatcher, RegexMatcherInternalModel as RegexMatcherModel, + MedicalLLM as AutoGGUFModel, ) from sparknlp_jsl.compatibility import Compatibility from sparknlp_jsl.pretrained import InternalResourceDownloader diff --git a/johnsnowlabs/py_models/install_info.py b/johnsnowlabs/py_models/install_info.py index 1f5273314c..49e59d3c9e 100644 --- a/johnsnowlabs/py_models/install_info.py +++ b/johnsnowlabs/py_models/install_info.py @@ -9,7 +9,7 @@ from johnsnowlabs.py_models.lib_version import LibVersion from johnsnowlabs.utils.enums import JvmHardwareTarget, ProductName, PyInstallTypes - +import json class InstallFileInfoBase(WritableBaseModel): file_name: str product: ProductName @@ -71,7 +71,12 @@ def __init__(self, *args, **kwargs): @staticmethod def get_from_jsl_home(): - return RootInfo.parse_file(settings.root_info_file) + import json + if os.path.exists(settings.root_info_file): + with open(settings.root_info_file, "r") as f: + json_data = json.loads(f.read()) + return RootInfo(run_from=json_data["run_from"], + version=json_data["version"]) class InstallFolder(WritableBaseModel): @@ -93,13 +98,26 @@ def get_product_entry( @staticmethod def java_folder_from_home(): if os.path.exists(settings.java_info_file): - return InstallFolder.parse_file(settings.java_info_file) + with open(settings.java_info_file, "r") as f: + json_data = json.loads(f.read()) + infos = {} + for k,v in json_data['infos'].items(): + if k.endswith(".jar"): + infos[k] = JvmInstallInfo(**v) + return InstallFolder(infos=infos) return False @staticmethod def py_folder_from_home(): if os.path.exists(settings.py_info_file): - return InstallFolder.parse_file(settings.py_info_file) + with open(settings.py_info_file, "r") as f: + json_data = json.loads(f.read()) + infos = {} + for k,v in json_data['infos'].items(): + if k.endswith(".whl") or k.endswith(".tar.gz"): + infos[k] = PyInstallInfo(**v) + + return InstallFolder(infos=infos) return False diff --git a/johnsnowlabs/py_models/jsl_secrets.py b/johnsnowlabs/py_models/jsl_secrets.py index b8ca0f9264..083e859a88 100644 --- a/johnsnowlabs/py_models/jsl_secrets.py +++ b/johnsnowlabs/py_models/jsl_secrets.py @@ -4,7 +4,7 @@ from pathlib import Path from typing import Dict, List, Optional, Union -from pydantic import validator +from pydantic import field_validator from johnsnowlabs import settings from johnsnowlabs.abstract_base.pydantic_model import WritableBaseModel @@ -62,17 +62,17 @@ class JslSecrets(WritableBaseModel): methods for reading/storing found_secrets and managing .jslhome folder """ - HC_SECRET: Secret = None - HC_LICENSE: Secret = None + HC_SECRET: Optional[str] = None + HC_LICENSE: Optional[str] = None HC_VERSION: Optional[LibVersionIdentifier] = None - OCR_SECRET: Secret = None - OCR_LICENSE: Secret = None + OCR_SECRET: Optional[str] = None + OCR_LICENSE: Optional[str] = None OCR_VERSION: Optional[LibVersionIdentifier] = None - AWS_ACCESS_KEY_ID: Secret = None - AWS_SECRET_ACCESS_KEY: Secret = None + AWS_ACCESS_KEY_ID: Optional[str] = None + AWS_SECRET_ACCESS_KEY: Optional[str] = None NLP_VERSION: Optional[LibVersionIdentifier] = None - JSL_LEGAL_LICENSE: Secret = None - JSL_FINANCE_LICENSE: Secret = None + JSL_LEGAL_LICENSE: Optional[str] = None + JSL_FINANCE_LICENSE: Optional[str] = None @staticmethod def raise_invalid_version(): @@ -82,7 +82,7 @@ def raise_invalid_version(): ) raise ValueError("Invalid secrets") - @validator("HC_SECRET") + @field_validator("HC_SECRET") def hc_version_check(cls, HC_SECRET): global hc_validation_logged try: @@ -114,7 +114,7 @@ def is_ocr_secret_correct_version(ocr_secret: Optional[str]) -> bool: def is_hc_secret_correct_version(hc_secret: Optional[str]) -> bool: return hc_secret and hc_secret.split("-")[0] == settings.raw_version_medical - @validator("OCR_SECRET") + @field_validator("OCR_SECRET") def ocr_version_check(cls, OCR_SECRET): global ocr_validation_logged try: @@ -123,6 +123,8 @@ def ocr_version_check(cls, OCR_SECRET): and not ocr_validation_logged ): ocr_validation_logged = True + if not OCR_SECRET: + return OCR_SECRET print( f"🚨 Outdated OCR Secrets in license file. Version={(OCR_SECRET.split('-')[0] if OCR_SECRET else None)} but should be Version={settings.raw_version_ocr}" ) @@ -424,6 +426,13 @@ def search_env_vars() -> Union["JslSecrets", bool]: ] ): print("👌 License detected in Environment Variables") + if isinstance(hc_version,str): + hc_version = LibVersionIdentifier(hc_version) + if isinstance(ocr_version,str): + ocr_version = LibVersionIdentifier(ocr_version) + if isinstance(nlp_version,str): + nlp_version = LibVersionIdentifier(nlp_version) + return JslSecrets( HC_SECRET=hc_secret, HC_LICENSE=hc_license, @@ -631,6 +640,13 @@ def from_json_dict(secrets, secrets_metadata: Optional = None) -> "JslSecrets": secrets["JSL_FINANCE_LICENSE"] if "JSL_FINANCE_LICENSE" in secrets else None ) + if isinstance(hc_version,str): + hc_version = LibVersionIdentifier(hc_version) + if isinstance(ocr_version,str): + ocr_version = LibVersionIdentifier(ocr_version) + if isinstance(nlp_version,str): + nlp_version = LibVersionIdentifier(nlp_version) + return JslSecrets( HC_SECRET=hc_secret, HC_LICENSE=hc_license, @@ -659,8 +675,9 @@ def from_jsl_home( return False try: + # Try/Catch incase we get validation errors from outdated files - license_infos = LicenseInfos.parse_file(settings.creds_info_file) + license_infos = LicenseInfos.from_home() if log and not already_logged: already_logged = True print( @@ -692,7 +709,7 @@ def update_outdated_lib_secrets( for license in os.listdir(settings.license_dir): if license == "info.json": continue - secrets = JslSecrets.parse_file(os.path.join(settings.license_dir, license)) + secrets = JslSecrets.from_json_file_path(os.path.join(settings.license_dir, license)) if ( secrets.HC_SECRET and hc_secrets @@ -768,7 +785,7 @@ def are_credentials_known(found_secrets: "JslSecrets") -> bool: # Return True, if secrets are already stored in JSL-Home, otherwise False Path(settings.py_dir).mkdir(parents=True, exist_ok=True) if os.path.exists(settings.creds_info_file): - license_infos = LicenseInfos.parse_file(settings.creds_info_file) + license_infos = LicenseInfos.from_home() else: # If license dir did not exist yet, secrets are certainly new return False @@ -786,13 +803,13 @@ def are_lib_secrets_an_upgrade(found_secrets: "JslSecrets") -> bool: # Return True, if lib are newer than existing ones, if yes upgrade locally stored secrets Path(settings.py_dir).mkdir(parents=True, exist_ok=True) if os.path.exists(settings.creds_info_file): - license_infos = LicenseInfos.parse_file(settings.creds_info_file) + license_infos = LicenseInfos.from_home() else: # If license dir did not exist yet, secrets are certainly new return False # if any stored secrets equal to found_secrets, then we already know them - # check OCR secrets + if found_secrets.HC_SECRET: if any( map( @@ -837,7 +854,7 @@ def store_in_jsl_home_if_new(secrets: "JslSecrets") -> None: file_name = file_name + "_".join(products) + f".json" if os.path.exists(settings.creds_info_file): - license_infos = LicenseInfos.parse_file(settings.creds_info_file) + license_infos = LicenseInfos.from_home() file_name = file_name.format(number=str(len(license_infos.infos))) license_info = LicenseInfo( jsl_secrets=secrets, products=products, id=str(len(license_infos.infos)) @@ -848,13 +865,16 @@ def store_in_jsl_home_if_new(secrets: "JslSecrets") -> None: secrets.write(out_dir) print(f"📋 Stored new John Snow Labs License in {out_dir}") else: - file_name = file_name.format(number="0") license_info = LicenseInfo(jsl_secrets=secrets, products=products, id="0") - LicenseInfos(infos={file_name: license_info}).write( - settings.creds_info_file - ) + license_infos = LicenseInfos(infos={file_name: license_info}) + with open(settings.creds_info_file, "w") as f: + f.write(license_infos.model_dump_json()) + + file_name = file_name.format(number="0") out_dir = os.path.join(settings.license_dir, file_name) - secrets.write(out_dir) + with open(out_dir, "w") as f: + f.write(secrets.model_dump_json()) + #secrets.write(out_dir) print(f"📋 Stored John Snow Labs License in {out_dir}") # We might load again JSL-Secrets from local already_logged = True @@ -877,6 +897,7 @@ class LicenseInfo(WritableBaseModel): products: List[ProductName] + class LicenseInfos(WritableBaseModel): """Representation of a LicenseInfo in ~/.johnsnowlabs/licenses/info.json Maps file_name to LicenseInfo @@ -886,6 +907,15 @@ class LicenseInfos(WritableBaseModel): @staticmethod def from_home() -> Optional["LicenseInfos"]: - if os.path.exists(settings.creds_info_file): - return LicenseInfos.parse_file(settings.creds_info_file) - return None + if not os.path.exists(settings.creds_info_file): + return None + data = json.load(open(settings.creds_info_file)) + infos = {} + for info in data['infos']: + secret = JslSecrets.from_json_dict(data['infos'][info]['jsl_secrets']) + i = LicenseInfo(id=info, jsl_secrets=secret, + products=data['infos'][info]['products'], + ) + infos[info] = i + license_infos = LicenseInfos(infos=infos) + return license_infos diff --git a/johnsnowlabs/py_models/lib_version.py b/johnsnowlabs/py_models/lib_version.py index 05430da1f3..277b114cdb 100644 --- a/johnsnowlabs/py_models/lib_version.py +++ b/johnsnowlabs/py_models/lib_version.py @@ -133,3 +133,6 @@ def as_str(self) -> str: """Return LibVersion object as canonical str representation""" # We filter out all values != None soo version checks match up return ".".join(filter(lambda x: x, [self.major, self.minor, self.patch])) + + def __str__(self): + return self.as_str() \ No newline at end of file diff --git a/johnsnowlabs/settings.py b/johnsnowlabs/settings.py index 0e4de17465..1c288e7d67 100644 --- a/johnsnowlabs/settings.py +++ b/johnsnowlabs/settings.py @@ -10,9 +10,9 @@ # These versions are used for auto-installs and version checks -raw_version_jsl_lib = "5.4.5" +raw_version_jsl_lib = "5.5.0" -raw_version_nlp = "5.4.1" +raw_version_nlp = "5.5.0" raw_version_nlu = "5.4.1" @@ -20,13 +20,13 @@ raw_version_pyspark = "3.4.0" raw_version_nlp_display = "5.0" -raw_version_medical = "5.4.1" -raw_version_secret_medical = "5.4.1" +raw_version_medical = "5.5.0" +raw_version_secret_medical = "5.5.0" raw_version_secret_ocr = "5.4.1" raw_version_ocr = "5.4.1" -raw_version_pydantic = "1.10.11" +raw_version_pydantic = "2" pypi_page = "https://pypi.org/project/johnsnowlabs" json_indent = 4 diff --git a/johnsnowlabs/utils/sparksession_utils.py b/johnsnowlabs/utils/sparksession_utils.py index 53f97b52e4..1e3c6b4f6e 100644 --- a/johnsnowlabs/utils/sparksession_utils.py +++ b/johnsnowlabs/utils/sparksession_utils.py @@ -186,15 +186,18 @@ def start( "spark.kryoserializer.buffer.max": "2000M", "spark.driver.maxResultSize": "2000M", "spark.jars": ",".join(jars), - 'spark.extraListeners': 'com.johnsnowlabs.license.LicenseLifeCycleManager', } - if suite.ocr and suite.ocr.get_java_path(): + if suite.hc and suite.hc.get_java_path() and nlp and Software.spark_hc.check_installed(None): + default_conf["spark.extraListeners"] = "com.johnsnowlabs.license.LicenseLifeCycleManager" + + if suite.ocr and suite.ocr.get_java_path() and visual and Software.spark_ocr.check_installed(None): # is_spark_version_env('32') default_conf["spark.sql.optimizer.expression.nestedPruning.enabled"] = "false" default_conf["spark.sql.optimizer.nestedSchemaPruning.enabled"] = "false" default_conf["spark.sql.legacy.allowUntypedScalaUDF"] = "true" default_conf["spark.sql.repl.eagerEval.enabled"] = "true" + default_conf["spark.extraListeners"] = "com.johnsnowlabs.license.LicenseLifeCycleManager" for k, v in default_conf.items(): builder.config(str(k), str(v)) diff --git a/setup_johnsnowlabs.py b/setup_johnsnowlabs.py index 156ccca355..554c0293c1 100644 --- a/setup_johnsnowlabs.py +++ b/setup_johnsnowlabs.py @@ -19,7 +19,7 @@ "dataclasses", "requests", "databricks-api", - f"pydantic=={johnsnowlabs.settings.raw_version_pydantic}", + f"pydantic>={johnsnowlabs.settings.raw_version_pydantic}", "colorama", "boto3", ] diff --git a/setup_johnsnowlabs_for_databricks.py b/setup_johnsnowlabs_for_databricks.py index eb10a32770..28becc0ac5 100644 --- a/setup_johnsnowlabs_for_databricks.py +++ b/setup_johnsnowlabs_for_databricks.py @@ -18,7 +18,7 @@ "dataclasses", "requests", "databricks-api", - f"pydantic=={johnsnowlabs.settings.raw_version_pydantic}", + f"pydantic>={johnsnowlabs.settings.raw_version_pydantic}", "colorama", "boto3", ]