custom pyspark version support and removed deprecated db-install from… #820

Open · wants to merge 1 commit into base: main
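This PR adds a `spark_version` parameter to `nlp.install()` so callers can pin a specific PySpark version instead of the library's recommended default, and it drops the deprecated Databricks install path from `install()`. A minimal usage sketch of the new parameter (the version string is illustrative, and license/credential arguments are omitted):

```python
from johnsnowlabs import nlp

# Pin a custom PySpark version for a local install.
# "3.4.1" is only an example value, not a default introduced by this PR.
nlp.install(spark_version="3.4.1")
```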
95 changes: 3 additions & 92 deletions johnsnowlabs/auto_install/install_flow.py
@@ -47,12 +47,6 @@ def install(
leg_license: Optional[str] = None,
aws_access_key: Optional[str] = None,
aws_key_id: Optional[str] = None,
# -- Databricks auth flows & Install Target --
databricks_cluster_id: Optional[str] = None,
databricks_token: Optional[str] = None,
databricks_host: Optional[str] = None,
databricks_password: Optional[str] = None,
databricks_email: Optional[str] = None,
# -- Install Params --
# Install Target
python_exec_path: str = sys.executable,
@@ -77,38 +71,11 @@ def install(
py_install_type: str = PyInstallTypes.wheel.value,
only_refresh_credentials: bool = False,
refresh_install: bool = False,
# -- Databricks Cluster Creation Params --
block_till_cluster_ready=True,
num_workers=1,
cluster_name=settings.db_cluster_name,
node_type_id=settings.db_node_type_id,
driver_node_type_id=settings.db_driver_node_type,
spark_env_vars=None,
autotermination_minutes=60,
spark_version=settings.db_spark_version,
spark_conf=None,
auto_scale=None,
aws_attributes=None,
ssh_public_keys=None,
custom_tags=None,
cluster_log_conf=None,
enable_elastic_disk=None,
cluster_source=None,
instance_pool_id=None,
headers=None,
clean_cluster=True,
write_db_credentials=True,
extra_pip_installs: Optional[List[str]] = None,
spark_version=settings.raw_version_pyspark,
):
if refresh_install and os.path.exists(settings.root_dir):
print("🧹 Cleaning up old JSL Home in ", settings.root_dir)
shutil.rmtree(settings.root_dir)
if clean_cluster and databricks_host:
dbfs_rm(
get_db_client_for_token(databricks_host, databricks_token),
settings.dbfs_home_dir,
recursive=True,
)

# Input Validation
py_install_type = PyInstallTypes.from_str(py_install_type)
@@ -159,65 +126,8 @@ def install(
spark_nlp=spark_nlp,
)

# Databricks Install
if databricks_host and databricks_token and not offline:
print(
"nlp.install() for databricks will be deprecated next release, instead use nlp.install_to_databricks()"
)
suite = get_install_suite_from_jsl_home(
jvm_hardware_target=hardware_platform,
visual=visual,
nlp=nlp,
spark_nlp=spark_nlp,
)
if databricks_cluster_id:
install_jsl_suite_to_cluster(
db=get_db_client_for_token(databricks_host, databricks_token),
install_suite=suite,
cluster_id=databricks_cluster_id,
medical_nlp=nlp,
spark_nlp=spark_nlp,
visual=visual,
)
if extra_pip_installs:
install_list_of_pypi_ref_to_cluster(
db=get_db_client_for_token(databricks_host, databricks_token),
cluster_id=databricks_cluster_id,
pip_installs=extra_pip_installs,
)

else:
return create_cluster(
medical_nlp=nlp,
spark_nlp=spark_nlp,
visual=visual,
databricks_host=databricks_host,
databricks_token=databricks_token,
install_suite=suite,
block_till_cluster_ready=block_till_cluster_ready,
num_workers=num_workers,
cluster_name=cluster_name,
node_type_id=node_type_id,
driver_node_type_id=driver_node_type_id,
spark_env_vars=spark_env_vars,
autotermination_minutes=autotermination_minutes,
spark_version=spark_version,
spark_conf=spark_conf,
auto_scale=auto_scale,
aws_attributes=aws_attributes,
ssh_public_keys=ssh_public_keys,
custom_tags=custom_tags,
cluster_log_conf=cluster_log_conf,
enable_elastic_disk=enable_elastic_disk,
cluster_source=cluster_source,
instance_pool_id=instance_pool_id,
headers=headers,
write_db_credentials=write_db_credentials,
extra_pip_installs=extra_pip_installs,
)

# Local Py-Install
elif not slim_install:
if not slim_install:
check_and_install_dependencies(
product=product,
secrets=secrets,
@@ -230,6 +140,7 @@ def install(
visual=visual,
spark_nlp=spark_nlp,
enterpise_nlp=nlp,
spark_version=spark_version,
)


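With the Databricks branch removed from `install()`, Databricks installs go through the dedicated entry point that the deleted deprecation message already pointed to. A hedged sketch, assuming `nlp.install_to_databricks()` takes host/token arguments similar to the parameters removed above (its exact signature is not shown in this diff):

```python
from johnsnowlabs import nlp

# Hypothetical call shape based on the parameters dropped from install();
# check the library docs for the actual install_to_databricks() signature.
nlp.install_to_databricks(
    databricks_host="https://<your-workspace>.cloud.databricks.com",  # placeholder
    databricks_token="dapi...",                                       # placeholder
)
```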
13 changes: 11 additions & 2 deletions johnsnowlabs/auto_install/install_software.py
@@ -4,6 +4,8 @@
from pathlib import Path
from typing import Dict, Set

from johnsnowlabs.py_models.lib_version import LibVersion

from johnsnowlabs import settings
from johnsnowlabs.abstract_base.software_product import AbstractSoftwareProduct
from johnsnowlabs.auto_install.softwares import Software
@@ -26,6 +28,7 @@ def check_and_install_dependencies(
visual: bool = False,
spark_nlp: bool = True,
enterpise_nlp: bool = True,
spark_version: str = None,
):
"""
Iterates the dependency DAG in DFS order for the input product and downloads and installs all dependencies
@@ -68,6 +71,12 @@ def check_and_install_dependencies(
:param install_licensed: if True, install licensed products when the license permits it; otherwise skip them
sparknlp_to_latest: for some releases we might not want to go to the latest spark release
"""
if spark_version is not None:
# Ignore the recommended PySpark pin and use the user-supplied version
spark_version = LibVersion(spark_version)
LatestCompatibleProductVersion.pyspark.value.minor = spark_version.minor
LatestCompatibleProductVersion.pyspark.value.major = spark_version.major
LatestCompatibleProductVersion.pyspark.value.patch = spark_version.patch
import site
from importlib import reload

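To make the override above concrete: the user-supplied string is wrapped in `LibVersion` and its major/minor/patch components replace the recommended PySpark pin. A standalone sketch of the same idea, using a hypothetical stand-in class rather than the real `johnsnowlabs.py_models.lib_version.LibVersion`:

```python
# Hypothetical stand-in showing how a "major.minor.patch" string
# can overwrite the components of a pinned version object.
class PinnedVersion:
    def __init__(self, version_string: str):
        parts = version_string.split(".")
        parts += ["0"] * (3 - len(parts))  # treat "3.4" like "3.4.0"
        self.major, self.minor, self.patch = parts[:3]

recommended = PinnedVersion("3.2.1")  # illustrative recommended pin
requested = PinnedVersion("3.4.1")    # illustrative user-supplied version

# Mirror the field-by-field assignment used in check_and_install_dependencies()
recommended.major = requested.major
recommended.minor = requested.minor
recommended.patch = requested.patch
print(recommended.major, recommended.minor, recommended.patch)  # 3 4 1
```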
@@ -180,10 +189,11 @@ def check_and_install_dependencies(
download_folder=offline_py_dir,
include_dependencies=include_dependencies,
)

if not get_pip_lib_version("pyspark", py_exec=python_exec_path).equals(
LatestCompatibleProductVersion.pyspark.value
):
# Re-install NLP incase some other library up/downgraded it while we installed it
# Re-install PySpark in case some other library up/downgraded it while we installed it
install_results[Software.spark_nlp] = Software.pyspark.install(
re_install=True,
version=LatestCompatibleProductVersion.pyspark.value,
@@ -222,7 +232,6 @@ def check_and_install_dependencies(
to_reload.append(PIL)
for mod in to_reload:
reload(mod)

else:
print(f"👌 Everything is already installed, no changes made")

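Stepping back to the PySpark guard earlier in this file: after all products are installed, PySpark is force-reinstalled if some other dependency up- or downgraded it along the way. A hedged, self-contained sketch of that pattern, using `importlib.metadata` and `pip` directly instead of the library's `get_pip_lib_version` helper (the pinned version string is illustrative):

```python
import subprocess
import sys
from importlib.metadata import PackageNotFoundError, version


def ensure_pyspark(pinned: str) -> None:
    """Force-reinstall pyspark if another install step changed its version."""
    try:
        installed = version("pyspark")
    except PackageNotFoundError:
        installed = None
    if installed != pinned:
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "--force-reinstall", f"pyspark=={pinned}"]
        )


# ensure_pyspark("3.4.1")  # example pin
```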