From c9b66ed15302624437f07292aa4a9d4f41ae968c Mon Sep 17 00:00:00 2001 From: vladd-bit Date: Wed, 28 Feb 2024 00:59:08 +0000 Subject: [PATCH 1/7] Services: Jhub updates. --- services/jupyter-hub/Dockerfile_singleuser | 9 ++++++++- services/jupyter-hub/Dockerfile_singleuser_gpu | 2 +- services/jupyter-hub/config/jupyterhub_config.py | 3 ++- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/services/jupyter-hub/Dockerfile_singleuser b/services/jupyter-hub/Dockerfile_singleuser index 968c60d7..69ebacdb 100644 --- a/services/jupyter-hub/Dockerfile_singleuser +++ b/services/jupyter-hub/Dockerfile_singleuser @@ -89,6 +89,13 @@ RUN apt-get update && apt-get upgrade -y && \ libxcursor1 libxcomposite1 libasound2 libxi6 libxtst6 \ r-base +# python 3.11 +RUN apt-get install python3.11-full python3.11-dev + +# be careful, this conflicts +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 + + RUN apt-get update && apt-get upgrade -y # Microsoft repos @@ -132,7 +139,7 @@ RUN pip3 install --no-cache-dir --upgrade pip RUN pip3 install --no-cache-dir setuptools wheel virtualenv # jupyterhub stuff -RUN pip3 install --no-cache-dir ipywidgets jupyter jupyterhub jupyterlab jupyterlab-git dask-labextension importlib_metadata +RUN pip3 install --no-cache-dir ipywidgets jupyter jupyterhub jupyterlab jupyterlab-git importlib_metadata bitstream RUN pip3 install --no-cache-dir jupyterlab_widgets jupyter_contrib_core jupyter_contrib_nbextensions jupyter-server-proxy fastbook RUN pip3 install --no-cache-dir docker docker-compose dockerspawner jupyterhub-firstuseauthenticator jupyterhub-systemdspawner jupyterhub-jwtauthenticator jupyterhub-client jupyterhub-kerberosauthenticator RUN pip3 install --no-cache-dir jupyterhub-nanowireauthenticator jupyterhub-ldapauthenticator jupyterhub-kubespawner jupyterhub-nativeauthenticator diff --git a/services/jupyter-hub/Dockerfile_singleuser_gpu b/services/jupyter-hub/Dockerfile_singleuser_gpu index 086cad5c..b1bf2b5e 100644 --- a/services/jupyter-hub/Dockerfile_singleuser_gpu +++ b/services/jupyter-hub/Dockerfile_singleuser_gpu @@ -154,7 +154,7 @@ RUN pip3 install --no-cache-dir --upgrade pip RUN pip3 install --no-cache-dir setuptools wheel virtualenv # jupyterhub stuff -RUN pip3 install --no-cache-dir ipywidgets jupyter jupyterhub jupyterlab jupyterlab-git dask-labextension importlib_metadata +RUN pip3 install --no-cache-dir ipywidgets jupyter jupyterhub jupyterlab jupyterlab-git importlib_metadata bitstream RUN pip3 install --no-cache-dir jupyterlab_widgets jupyter_contrib_core jupyter_contrib_nbextensions jupyter-server-proxy fastbook RUN pip3 install --no-cache-dir docker docker-compose dockerspawner jupyterhub-firstuseauthenticator jupyterhub-systemdspawner jupyterhub-jwtauthenticator jupyterhub-client jupyterhub-kerberosauthenticator RUN pip3 install --no-cache-dir jupyterhub-nanowireauthenticator jupyterhub-ldapauthenticator jupyterhub-kubespawner jupyterhub-nativeauthenticator diff --git a/services/jupyter-hub/config/jupyterhub_config.py b/services/jupyter-hub/config/jupyterhub_config.py index 69b223bc..532c913a 100644 --- a/services/jupyter-hub/config/jupyterhub_config.py +++ b/services/jupyter-hub/config/jupyterhub_config.py @@ -148,7 +148,8 @@ def start(self): if self.user.name not in whitelist: whitelist.add(self.user.name) with open(userlist_path , "a") as f: - f.write(self.user.name + "\n") + f.write("\n") + f.write(self.user.name) if self.user.name in list(team_map.keys()): for team in team_map[self.user.name]: From 2719caf263ddb711457deb253cdbf3cbc2d8ee0f Mon Sep 17 00:00:00 2001 From: vladd-bit Date: Thu, 7 Mar 2024 14:11:31 +0000 Subject: [PATCH 2/7] CU:8693yammy | NiFi scripts: Added Cerner blob decompression util. --- nifi/user-scripts/utils/cerner_blob.py | 124 +++++++++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 nifi/user-scripts/utils/cerner_blob.py diff --git a/nifi/user-scripts/utils/cerner_blob.py b/nifi/user-scripts/utils/cerner_blob.py new file mode 100644 index 00000000..d2586790 --- /dev/null +++ b/nifi/user-scripts/utils/cerner_blob.py @@ -0,0 +1,124 @@ +from typing import List + + +class LzwItem: + def __init__(self, _prefix: int = 0, _suffix: int = 0) -> None: + self.prefix = _prefix + self.suffix = _suffix + + +class DecompressLzwCernerBlob: + def __init__(self) -> None: + self.MAX_CODES: int = 8192 + self.tmp_decompression_buffer: List[int] = [0] * self.MAX_CODES + self.lzw_lookup_table: List[LzwItem] = [LzwItem()] * self.MAX_CODES + self.tmp_buffer_index: int = 0 + self.current_byte_buffer_index: int = 0 + + # starts after 256, since 256 is the ASCII alphabet + self.code_count: int = 257 + self.output_stream = bytearray() + + def save_to_lookup_table(self, compressed_code: int): + self.tmp_buffer_index = -1 + while compressed_code >= 258: + self.tmp_decompression_buffer[self.tmp_buffer_index] = \ + self.lzw_lookup_table[compressed_code].suffix + compressed_code = self.lzw_lookup_table[compressed_code].prefix + + self.tmp_buffer_index += 1 + self.tmp_decompression_buffer[self.tmp_buffer_index] = compressed_code + + for i in reversed(list(range(self.tmp_buffer_index + 1))): + self.output_stream.append(self.tmp_decompression_buffer[i]) + + def decompress(self, input_stream=bytearray()): + + byte_buffer_index: int = 0 + + # used for bit shifts + shift: int = 1 + current_shift: int = 1 + + previous_code: int = 0 + middle_code: int = 0 + lookup_index: int = 0 + + skip_flag: bool = False + + first_code = input_stream[byte_buffer_index] + + while True: + if current_shift >= 9: + + current_shift -= 8 + + if first_code != 0: + byte_buffer_index += 1 + middle_code = input_stream[byte_buffer_index] + + first_code = (first_code << current_shift + + 8) | (middle_code << current_shift) + + byte_buffer_index += 1 + middle_code = input_stream[byte_buffer_index] + + tmp_code = middle_code >> (8 - current_shift) + lookup_index = first_code | tmp_code + + skip_flag = True + else: + byte_buffer_index += 1 + first_code = input_stream[byte_buffer_index] + byte_buffer_index += 1 + middle_code = input_stream[byte_buffer_index] + else: + byte_buffer_index += 1 + middle_code = input_stream[byte_buffer_index] + + if not skip_flag: + lookup_index = (first_code << current_shift) | ( + middle_code >> 8 - current_shift) + + if lookup_index == 256: + shift = 1 + current_shift += 1 + first_code = input_stream[byte_buffer_index] + + self.tmp_decompression_buffer = [0] * self.MAX_CODES + self.tmp_buffer_index = 0 + + self.lzw_lookup_table = [LzwItem()] * self.MAX_CODES + self.code_count = 257 + continue + + elif lookup_index == 257: # EOF marker + return self.output_stream + + skip_flag = False + + # skipit part + if previous_code == 0: + self.tmp_decompression_buffer[0] = lookup_index + if lookup_index < self.code_count: + self.save_to_lookup_table(lookup_index) + if self.code_count < self.MAX_CODES: + self.lzw_lookup_table[self.code_count] = LzwItem( + previous_code, + self.tmp_decompression_buffer[self.tmp_buffer_index]) + self.code_count += 1 + else: + self.lzw_lookup_table[self.code_count] = LzwItem( + previous_code, + self.tmp_decompression_buffer[self.tmp_buffer_index]) + self.code_count += 1 + self.save_to_lookup_table(lookup_index) + # end of skipit + + first_code = (middle_code & (0xff >> current_shift)) + current_shift += shift + + if self.code_count in [511, 1023, 2047, 4095]: + shift += 1 + current_shift += 1 + previous_code = lookup_index From 5e804d3784cb53968a025582c11f7d5b7f774666 Mon Sep 17 00:00:00 2001 From: Git bot Date: Thu, 7 Mar 2024 14:11:54 +0000 Subject: [PATCH 3/7] Auto updated submodule references --- services/jupyter-hub/notebooks/working_with_cogstack | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/jupyter-hub/notebooks/working_with_cogstack b/services/jupyter-hub/notebooks/working_with_cogstack index 00a11154..404a2468 160000 --- a/services/jupyter-hub/notebooks/working_with_cogstack +++ b/services/jupyter-hub/notebooks/working_with_cogstack @@ -1 +1 @@ -Subproject commit 00a11154ca42134948cc6558c8fc638983198797 +Subproject commit 404a24687f9a80b534ccd67fb8c0dcbc50997d49 From 703b0e928fcc98987ce929c7fbc57523ab3e7a29 Mon Sep 17 00:00:00 2001 From: vladd-bit Date: Thu, 7 Mar 2024 14:22:38 +0000 Subject: [PATCH 4/7] CU: 8693yammy | NiFi scripts: added parser script for cerner blobs. --- nifi/user-scripts/parse-cerner-blob.py | 27 ++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 nifi/user-scripts/parse-cerner-blob.py diff --git a/nifi/user-scripts/parse-cerner-blob.py b/nifi/user-scripts/parse-cerner-blob.py new file mode 100644 index 00000000..a7edceaa --- /dev/null +++ b/nifi/user-scripts/parse-cerner-blob.py @@ -0,0 +1,27 @@ +import sys +from .utils.cerner_blob import DecompressLzwCernerBlob + +# This needs to be investigated, records might have different charsets, +# currently only tested with "iso-8859-1" +# other frequently used encodings: "utf-16le", "utf-16be" +# In some cases you will need to figure this out yourself, depending on +# the data source +INPUT_CHARSET = "iso-8859-1" + +# expected (optional) +OUTPUT_CHARSET = "windows-1252" + +input_cerner_blob = bytearray(sys.stdin.read(), encoding=INPUT_CHARSET) + +for arg in sys.argv: + _arg = arg.split("=", 1) + + if _arg[0] == "input_charset": + INPUT_CHARSET = str(_arg[1]).lower() + elif _arg[0] == "output_charset": + OUTPUT_CHARSET = str(_arg[1]).lower() + +decompress_blob = DecompressLzwCernerBlob() +decompress_blob.decompress(input_cerner_blob) + +sys.stdout.write(decompress_blob.output_stream.decode(encoding=OUTPUT_CHARSET)) From 33eff5fb04e935a1e7be789687898c416403c56a Mon Sep 17 00:00:00 2001 From: vladd-bit Date: Thu, 7 Mar 2024 17:59:04 +0000 Subject: [PATCH 5/7] NiFi scripts: fixed import + overflow issue. --- nifi/user-scripts/parse-cerner-blob.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nifi/user-scripts/parse-cerner-blob.py b/nifi/user-scripts/parse-cerner-blob.py index a7edceaa..5e3ba37c 100644 --- a/nifi/user-scripts/parse-cerner-blob.py +++ b/nifi/user-scripts/parse-cerner-blob.py @@ -1,5 +1,5 @@ import sys -from .utils.cerner_blob import DecompressLzwCernerBlob +from utils.cerner_blob import DecompressLzwCernerBlob # This needs to be investigated, records might have different charsets, # currently only tested with "iso-8859-1" @@ -11,7 +11,7 @@ # expected (optional) OUTPUT_CHARSET = "windows-1252" -input_cerner_blob = bytearray(sys.stdin.read(), encoding=INPUT_CHARSET) +input_cerner_blob = str(sys.stdin.buffer.read(), INPUT_CHARSET).encode(INPUT_CHARSET) for arg in sys.argv: _arg = arg.split("=", 1) From cbe733a7302877400bdb36c86bdfaa47b9dda65f Mon Sep 17 00:00:00 2001 From: vladd-bit Date: Thu, 7 Mar 2024 23:00:51 +0000 Subject: [PATCH 6/7] NiFi Scripts: fixed cerner decompression problem. --- nifi/user-scripts/utils/cerner_blob.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nifi/user-scripts/utils/cerner_blob.py b/nifi/user-scripts/utils/cerner_blob.py index d2586790..04f5f366 100644 --- a/nifi/user-scripts/utils/cerner_blob.py +++ b/nifi/user-scripts/utils/cerner_blob.py @@ -22,6 +22,7 @@ def __init__(self) -> None: def save_to_lookup_table(self, compressed_code: int): self.tmp_buffer_index = -1 while compressed_code >= 258: + self.tmp_buffer_index += 1 self.tmp_decompression_buffer[self.tmp_buffer_index] = \ self.lzw_lookup_table[compressed_code].suffix compressed_code = self.lzw_lookup_table[compressed_code].prefix From 12bea4451882c71c72ffd3ec9fcbaf40131a1e18 Mon Sep 17 00:00:00 2001 From: vladd-bit Date: Fri, 8 Mar 2024 11:16:40 +0000 Subject: [PATCH 7/7] NiFi scripts: added multiple output(s) for cerner decompression. --- nifi/user-scripts/parse-cerner-blob.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/nifi/user-scripts/parse-cerner-blob.py b/nifi/user-scripts/parse-cerner-blob.py index 5e3ba37c..ac692aa7 100644 --- a/nifi/user-scripts/parse-cerner-blob.py +++ b/nifi/user-scripts/parse-cerner-blob.py @@ -11,6 +11,11 @@ # expected (optional) OUTPUT_CHARSET = "windows-1252" +# possible values: +# - binary: output binary code +# - string: output string after decompression +OUTPUT_MODE = "binary" + input_cerner_blob = str(sys.stdin.buffer.read(), INPUT_CHARSET).encode(INPUT_CHARSET) for arg in sys.argv: @@ -20,8 +25,13 @@ INPUT_CHARSET = str(_arg[1]).lower() elif _arg[0] == "output_charset": OUTPUT_CHARSET = str(_arg[1]).lower() + elif _arg[0] == "output_mode": + OUTPUT_MODE = str(_arg[1]).lower() decompress_blob = DecompressLzwCernerBlob() decompress_blob.decompress(input_cerner_blob) -sys.stdout.write(decompress_blob.output_stream.decode(encoding=OUTPUT_CHARSET)) +if OUTPUT_MODE == "binary": + sys.stdout.buffer.write(bytes(decompress_blob.output_stream)) +else: + sys.stdout.write(decompress_blob.output_stream.decode(OUTPUT_CHARSET))