Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
vladd-bit committed Mar 10, 2024
2 parents 793fc49 + 12bea44 commit 433ffe3
Show file tree
Hide file tree
Showing 5 changed files with 170 additions and 2 deletions.
37 changes: 37 additions & 0 deletions nifi/user-scripts/parse-cerner-blob.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import sys
from utils.cerner_blob import DecompressLzwCernerBlob

# Script: read a compressed Cerner blob from stdin, decompress it and write
# the result to stdout.
#
# Optional "key=value" command-line arguments override the defaults below:
#   input_charset=<name>  output_charset=<name>  output_mode=binary|string

# Charset used to transcode the raw bytes read from stdin.
# This needs to be investigated: records might have different charsets and
# this has currently only been tested with "iso-8859-1".
# Other frequently used encodings: "utf-16le", "utf-16be".
# In some cases you will need to figure this out yourself, depending on
# the data source.
INPUT_CHARSET = "iso-8859-1"

# Charset used to decode the decompressed bytes when OUTPUT_MODE is not
# "binary" (optional).
OUTPUT_CHARSET = "windows-1252"

# possible values:
#   - binary: output binary code
#   - string: output string after decompression
OUTPUT_MODE = "binary"

# Apply the command-line overrides BEFORE touching stdin; otherwise an
# "input_charset=..." argument could never influence how the blob is read.
# argv[0] is the script path, so it is skipped.
for arg in sys.argv[1:]:
    _key, _separator, _value = arg.partition("=")

    # Ignore arguments that are not of the form key=value instead of
    # failing on a missing value.
    if not _separator:
        continue

    _value = _value.lower()

    if _key == "input_charset":
        INPUT_CHARSET = _value
    elif _key == "output_charset":
        OUTPUT_CHARSET = _value
    elif _key == "output_mode":
        OUTPUT_MODE = _value

# Decode stdin with the (possibly overridden) input charset and re-encode it
# with the same charset to obtain the bytes handed to the decompressor.
input_cerner_blob = str(sys.stdin.buffer.read(), INPUT_CHARSET).encode(INPUT_CHARSET)

decompress_blob = DecompressLzwCernerBlob()
decompress_blob.decompress(input_cerner_blob)

if OUTPUT_MODE == "binary":
    # Raw decompressed bytes, untouched.
    sys.stdout.buffer.write(bytes(decompress_blob.output_stream))
else:
    # Any other mode: emit text decoded with the output charset.
    sys.stdout.write(decompress_blob.output_stream.decode(OUTPUT_CHARSET))
125 changes: 125 additions & 0 deletions nifi/user-scripts/utils/cerner_blob.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
from typing import List


class LzwItem:
    """A single entry of the LZW lookup table.

    Attributes:
        prefix: code stored as the first part of the entry.
        suffix: value stored as the second part of the entry.
    """

    def __init__(self, _prefix: int = 0, _suffix: int = 0) -> None:
        # Both parts default to 0 when the entry is created without arguments.
        self.prefix, self.suffix = _prefix, _suffix


class DecompressLzwCernerBlob:
    """Decompressor for LZW-compressed Cerner blobs.

    Call :meth:`decompress` with the compressed bytes; the decoded bytes are
    accumulated in ``output_stream``. Neither ``output_stream`` nor the
    lookup table is cleared at the start of a call, so use a fresh instance
    for every blob.
    """

    def __init__(self) -> None:
        # Size of the lookup table and of the scratch buffer.
        self.MAX_CODES: int = 8192
        # Scratch buffer in which one code is expanded (in reverse order)
        # before being copied to the output.
        self.tmp_decompression_buffer: List[int] = [0] * self.MAX_CODES
        # NOTE: every slot initially references the SAME placeholder item.
        # That is safe only because entries are always replaced by new
        # LzwItem objects and never mutated in place.
        self.lzw_lookup_table: List[LzwItem] = [LzwItem()] * self.MAX_CODES
        # Index of the last slot written in the scratch buffer.
        self.tmp_buffer_index: int = 0
        # Not used by any method of this class; kept for compatibility.
        self.current_byte_buffer_index: int = 0

        # Initial value of the table counter. Codes below 256 are literal
        # byte values, 256 resets the table and 257 ends the stream
        # (see decompress).
        self.code_count: int = 257
        self.output_stream = bytearray()

    def save_to_lookup_table(self, compressed_code: int):
        """Expand ``compressed_code`` and append the result to the output.

        Despite its name this method does not modify the lookup table: it
        follows the ``prefix`` links starting at ``compressed_code`` until a
        code below 258 is reached, collecting each ``suffix`` (and finally
        that last code) in ``tmp_decompression_buffer``, then appends the
        collected values in reverse order to ``output_stream``.

        After the call ``tmp_buffer_index`` is the index of the last value
        written to the scratch buffer, i.e. the first value of the expanded
        sequence.
        """
        scratch = self.tmp_decompression_buffer
        table = self.lzw_lookup_table

        last = -1
        while compressed_code >= 258:
            last += 1
            entry = table[compressed_code]
            scratch[last] = entry.suffix
            compressed_code = entry.prefix

        last += 1
        scratch[last] = compressed_code
        self.tmp_buffer_index = last

        # The chain was collected from its end to its start; reverse it
        # while copying to the output.
        self.output_stream.extend(reversed(scratch[:last + 1]))

    def decompress(self, input_stream: bytes = b""):
        """Decompress ``input_stream`` and return ``output_stream``.

        Args:
            input_stream: compressed blob (``bytes`` or ``bytearray``).
                Empty input (or ``None``) leaves the output untouched.

        Returns:
            ``self.output_stream`` with the decoded bytes appended.

        Raises:
            IndexError: if the input ends before the end-of-stream code
                (257) has been read.
        """
        # Nothing to decode; the original implementation crashed here with
        # an IndexError when reading the first byte.
        if not input_stream:
            return self.output_stream

        byte_buffer_index: int = 0

        # used for bit shifts
        shift: int = 1
        current_shift: int = 1

        previous_code: int = 0
        middle_code: int = 0
        lookup_index: int = 0

        # Set when lookup_index has already been assembled from three bytes
        # in the current iteration.
        skip_flag: bool = False

        first_code = input_stream[byte_buffer_index]

        while True:
            if current_shift >= 9:
                # The shift ran past a byte: step it back by 8 and consume
                # extra input byte(s).
                current_shift -= 8

                if first_code != 0:
                    # The code is spread over three bytes: the leftover
                    # bits, one full byte and the top bits of the next one.
                    byte_buffer_index += 1
                    middle_code = input_stream[byte_buffer_index]

                    first_code = (first_code << (current_shift + 8)) | (
                        middle_code << current_shift)

                    byte_buffer_index += 1
                    middle_code = input_stream[byte_buffer_index]

                    tmp_code = middle_code >> (8 - current_shift)
                    lookup_index = first_code | tmp_code

                    skip_flag = True
                else:
                    byte_buffer_index += 1
                    first_code = input_stream[byte_buffer_index]
                    byte_buffer_index += 1
                    middle_code = input_stream[byte_buffer_index]
            else:
                byte_buffer_index += 1
                middle_code = input_stream[byte_buffer_index]

            if not skip_flag:
                # Leftover bits followed by the top bits of the next byte.
                lookup_index = (first_code << current_shift) | (
                    middle_code >> (8 - current_shift))

                if lookup_index == 256:
                    # Reset code: start over with an empty lookup table.
                    shift = 1
                    current_shift += 1
                    first_code = input_stream[byte_buffer_index]

                    self.tmp_decompression_buffer = [0] * self.MAX_CODES
                    self.tmp_buffer_index = 0

                    self.lzw_lookup_table = [LzwItem()] * self.MAX_CODES
                    self.code_count = 257
                    continue

                if lookup_index == 257:  # EOF marker
                    return self.output_stream

            skip_flag = False

            # skipit part
            if previous_code == 0:
                self.tmp_decompression_buffer[0] = lookup_index
            if lookup_index < self.code_count:
                # Known code: emit it, then record a new table entry made of
                # the previous code and the first value of what was emitted.
                self.save_to_lookup_table(lookup_index)
                if self.code_count < self.MAX_CODES:
                    self.lzw_lookup_table[self.code_count] = LzwItem(
                        previous_code,
                        self.tmp_decompression_buffer[self.tmp_buffer_index])
                    self.code_count += 1
            else:
                # Code not in the table yet: create its entry first (from
                # the previous code and the first value of the previous
                # expansion), then emit it.
                self.lzw_lookup_table[self.code_count] = LzwItem(
                    previous_code,
                    self.tmp_decompression_buffer[self.tmp_buffer_index])
                self.code_count += 1
                self.save_to_lookup_table(lookup_index)
            # end of skipit

            # Keep the bits of the current byte not consumed by this code.
            first_code = middle_code & (0xff >> current_shift)
            current_shift += shift

            # At these table sizes the per-code shift grows by one.
            if self.code_count in (511, 1023, 2047, 4095):
                shift += 1
                current_shift += 1
            previous_code = lookup_index
7 changes: 6 additions & 1 deletion services/jupyter-hub/Dockerfile_singleuser
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,11 @@ RUN apt-get update && apt-get upgrade -y && \
r-base

# python 3.11
# -y answers the apt confirmation prompt so the non-interactive build does not abort
RUN apt-get install -y python3.11-full python3.11-dev

# be careful, this conflicts
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1
Expand Down Expand Up @@ -141,7 +145,8 @@ RUN pip3 install --no-cache-dir --upgrade pip
RUN pip3 install --no-cache-dir setuptools wheel virtualenv

# jupyterhub stuff
RUN pip3 install --no-cache-dir ipywidgets jupyter jupyterhub jupyterlab jupyterlab-git importlib_metadata
RUN pip3 install --no-cache-dir ipywidgets jupyter jupyterhub jupyterlab jupyterlab-git importlib_metadata bitstream

RUN pip3 install --no-cache-dir jupyterlab_widgets jupyter_contrib_core jupyter_contrib_nbextensions jupyter-server-proxy fastbook
RUN pip3 install --no-cache-dir docker docker-compose dockerspawner jupyterhub-firstuseauthenticator jupyterhub-systemdspawner jupyterhub-jwtauthenticator jupyterhub-client jupyterhub-kerberosauthenticator
RUN pip3 install --no-cache-dir jupyterhub-nanowireauthenticator jupyterhub-ldapauthenticator jupyterhub-kubespawner jupyterhub-nativeauthenticator
Expand Down
1 change: 1 addition & 0 deletions services/jupyter-hub/Dockerfile_singleuser_gpu
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ RUN pip3 install --no-cache-dir setuptools wheel virtualenv

# jupyterhub stuff
RUN pip3 install --no-cache-dir ipywidgets jupyter jupyterhub jupyterlab jupyterlab-git importlib_metadata bitstream
RUN pip3 install --no-cache-dir jupyterlab_widgets jupyter_contrib_core jupyter_contrib_nbextensions jupyter-server-proxy fastbook
RUN pip3 install --no-cache-dir docker docker-compose dockerspawner jupyterhub-firstuseauthenticator jupyterhub-systemdspawner jupyterhub-jwtauthenticator jupyterhub-client jupyterhub-kerberosauthenticator
RUN pip3 install --no-cache-dir jupyterhub-nanowireauthenticator jupyterhub-ldapauthenticator jupyterhub-kubespawner jupyterhub-nativeauthenticator
Expand Down

0 comments on commit 433ffe3

Please sign in to comment.