Merge branch 'master' of https://github.com/CogStack/CogStack-NiFi

CogStack · Mar 10, 2024 · 433ffe3 · 433ffe3
2 parents 793fc49 + 12bea44
commit 433ffe3
Show file tree

Hide file tree

Showing 5 changed files with 170 additions and 2 deletions.
diff --git a/nifi/user-scripts/parse-cerner-blob.py b/nifi/user-scripts/parse-cerner-blob.py
@@ -0,0 +1,37 @@
+import sys
+from utils.cerner_blob import DecompressLzwCernerBlob
+
+# Charset of the incoming blob. Records might use different charsets;
+#   only "iso-8859-1" has been tested so far. Other frequently used
+#   encodings are "utf-16le" and "utf-16be"; depending on the data source
+#   you may have to figure this out yourself (override: input_charset=...).
+INPUT_CHARSET = "iso-8859-1"
+
+# charset used to decode the result in "string" mode (override: output_charset=...)
+OUTPUT_CHARSET = "windows-1252"
+
+# possible values (override: output_mode=...):
+#   - binary: write the decompressed bytes as they are
+#   - string: write the decompressed bytes decoded with OUTPUT_CHARSET
+OUTPUT_MODE = "binary"
+
+# parse key=value overrides first so that input_charset applies to the stdin read below
+for arg in sys.argv[1:]:
+    key, separator, value = arg.partition("=")
+    if not separator:
+        continue  # not a key=value pair, nothing to override
+    if key == "input_charset":
+        INPUT_CHARSET = value.lower()
+    elif key == "output_charset":
+        OUTPUT_CHARSET = value.lower()
+    elif key == "output_mode":
+        OUTPUT_MODE = value.lower()
+
+input_cerner_blob = str(sys.stdin.buffer.read(), INPUT_CHARSET).encode(INPUT_CHARSET)
+decompress_blob = DecompressLzwCernerBlob()
+decompress_blob.decompress(input_cerner_blob)
+
+if OUTPUT_MODE == "binary":
+    sys.stdout.buffer.write(bytes(decompress_blob.output_stream))
+else:
+    sys.stdout.write(decompress_blob.output_stream.decode(OUTPUT_CHARSET))
diff --git a/nifi/user-scripts/utils/cerner_blob.py b/nifi/user-scripts/utils/cerner_blob.py
@@ -0,0 +1,125 @@
+from typing import List
+
+
+class LzwItem:  # one entry of the LZW lookup table used by DecompressLzwCernerBlob
+    def __init__(self, _prefix: int = 0, _suffix: int = 0) -> None:
+        self.prefix = _prefix  # code of the entry this one continues (followed when a code is expanded)
+        self.suffix = _suffix  # value this entry contributes to the output
+
+
+class DecompressLzwCernerBlob:  # LZW decoder: call decompress(), the result is collected in self.output_stream
+    def __init__(self) -> None:
+        self.MAX_CODES: int = 8192  # size of the lookup table and of the scratch buffer
+        self.tmp_decompression_buffer: List[int] = [0] * self.MAX_CODES  # scratch buffer, filled back to front while a code is expanded
+        self.lzw_lookup_table: List[LzwItem] = [LzwItem()] * self.MAX_CODES  # every slot shares one placeholder item until it is overwritten
+        self.tmp_buffer_index: int = 0  # index of the last scratch-buffer slot written
+        self.current_byte_buffer_index: int = 0  # not referenced anywhere else in this file
+
+        # codes below 256 are emitted as they are; 256 (reset) and 257 (end of data) are handled in decompress(), so counting starts at 257
+        self.code_count: int = 257
+        self.output_stream = bytearray()  # decompressed bytes; never cleared, so repeated decompress() calls keep appending
+
+    def save_to_lookup_table(self, compressed_code: int):  # expands one code into output_stream; despite the name it only reads the table
+        self.tmp_buffer_index = -1  # becomes 0 on the first write below
+        while compressed_code >= 258:  # follow the prefix links until a code below 258 is reached
+            self.tmp_buffer_index += 1
+            self.tmp_decompression_buffer[self.tmp_buffer_index] = \
+                self.lzw_lookup_table[compressed_code].suffix  # suffixes are collected last-to-first
+            compressed_code = self.lzw_lookup_table[compressed_code].prefix
+
+        self.tmp_buffer_index += 1
+        self.tmp_decompression_buffer[self.tmp_buffer_index] = compressed_code  # the chain ends in a code < 258, stored as it is
+
+        for i in reversed(list(range(self.tmp_buffer_index + 1))):  # replay the scratch buffer backwards to restore the original order
+            self.output_stream.append(self.tmp_decompression_buffer[i])
+
+    def decompress(self, input_stream=bytearray()):  # the default is only indexed here, never modified
+        # Decode input_stream into self.output_stream and return it once code 257 is read; there are no bounds checks, so input that ends early raises IndexError.
+        byte_buffer_index: int = 0  # position in input_stream of the byte read last
+
+        # bit-shift bookkeeping
+        shift: int = 1  # amount added to current_shift after every decoded code
+        current_shift: int = 1  # shift used to splice first_code and middle_code into one code
+
+        previous_code: int = 0  # code decoded in the previous iteration
+        middle_code: int = 0  # input byte read last
+        lookup_index: int = 0  # code decoded in the current iteration
+
+        skip_flag: bool = False  # True when lookup_index was already assembled in the ">= 9" branch below
+
+        first_code = input_stream[byte_buffer_index]
+
+        while True:  # only left through the return on code 257 (or an exception)
+            if current_shift >= 9:  # the shift has grown past one byte
+
+                current_shift -= 8
+
+                if first_code != 0:  # leftover bits are non-zero: combine them with the next two bytes
+                    byte_buffer_index += 1
+                    middle_code = input_stream[byte_buffer_index]
+
+                    first_code = (first_code << current_shift +
+                                  8) | (middle_code << current_shift)  # "+" binds tighter than "<<": the shift is (current_shift + 8)
+
+                    byte_buffer_index += 1
+                    middle_code = input_stream[byte_buffer_index]
+
+                    tmp_code = middle_code >> (8 - current_shift)
+                    lookup_index = first_code | tmp_code
+
+                    skip_flag = True
+                else:  # leftover bits are zero: take the next two bytes as first_code / middle_code
+                    byte_buffer_index += 1
+                    first_code = input_stream[byte_buffer_index]
+                    byte_buffer_index += 1
+                    middle_code = input_stream[byte_buffer_index]
+            else:  # usual case: fetch the next byte
+                byte_buffer_index += 1
+                middle_code = input_stream[byte_buffer_index]
+
+            if not skip_flag:
+                lookup_index = (first_code << current_shift) | (
+                    middle_code >> 8 - current_shift)  # "-" binds tighter than ">>": the shift is (8 - current_shift)
+
+                if lookup_index == 256:  # reset code: clear the table and the scratch buffer
+                    shift = 1
+                    current_shift += 1
+                    first_code = input_stream[byte_buffer_index]  # same byte that middle_code holds
+
+                    self.tmp_decompression_buffer = [0] * self.MAX_CODES
+                    self.tmp_buffer_index = 0
+
+                    self.lzw_lookup_table = [LzwItem()] * self.MAX_CODES
+                    self.code_count = 257
+                    continue
+
+                elif lookup_index == 257:  # EOF marker
+                    return self.output_stream
+
+            skip_flag = False
+
+            # "skipit" part: emit the bytes for lookup_index and register a new table entry
+            if previous_code == 0:  # true at the start (and after a code of 0)
+                self.tmp_decompression_buffer[0] = lookup_index
+            if lookup_index < self.code_count:  # code is already in the table
+                self.save_to_lookup_table(lookup_index)
+                if self.code_count < self.MAX_CODES:  # only add while the table has room
+                    self.lzw_lookup_table[self.code_count] = LzwItem(
+                        previous_code,
+                        self.tmp_decompression_buffer[self.tmp_buffer_index])  # first byte of the sequence just emitted
+                    self.code_count += 1
+            else:  # code is not in the table yet: add the entry first, then expand it (no MAX_CODES check on this path)
+                self.lzw_lookup_table[self.code_count] = LzwItem(
+                    previous_code,
+                    self.tmp_decompression_buffer[self.tmp_buffer_index])  # first byte of the previously emitted sequence
+                self.code_count += 1
+                self.save_to_lookup_table(lookup_index)
+            # end of "skipit"
+
+            first_code = (middle_code & (0xff >> current_shift))  # keep the low bits of the last byte as the start of the next code
+            current_shift += shift
+
+            if self.code_count in [511, 1023, 2047, 4095]:  # table-size thresholds: each one increases the per-code shift
+                shift += 1
+                current_shift += 1
+            previous_code = lookup_index
diff --git a/services/jupyter-hub/Dockerfile_singleuser b/services/jupyter-hub/Dockerfile_singleuser
@@ -90,7 +90,11 @@ RUN apt-get update && apt-get upgrade -y && \
     r-base
 
 # python 3.11
+# Git conflict markers from merging 12bea44 were committed around this step
+# and are not valid Dockerfile instructions. Resolved in favour of HEAD: -y is
+# required because a docker build cannot answer apt's confirmation prompt.
+# (rejected side: RUN apt-get install python3.11-full python3.11-dev)
 RUN apt-get install -y python3.11-full python3.11-dev
 
 # be careful, this conflicts
 RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1
@@ -141,7 +145,8 @@ RUN pip3 install --no-cache-dir --upgrade pip
 RUN pip3 install --no-cache-dir setuptools wheel virtualenv
 
 # jupyterhub stuff
-RUN pip3 install --no-cache-dir ipywidgets jupyter jupyterhub jupyterlab jupyterlab-git importlib_metadata
+RUN pip3 install --no-cache-dir ipywidgets jupyter jupyterhub jupyterlab jupyterlab-git importlib_metadata bitstream
+
 RUN pip3 install --no-cache-dir jupyterlab_widgets jupyter_contrib_core jupyter_contrib_nbextensions jupyter-server-proxy fastbook
 RUN pip3 install --no-cache-dir docker docker-compose dockerspawner jupyterhub-firstuseauthenticator jupyterhub-systemdspawner jupyterhub-jwtauthenticator jupyterhub-client jupyterhub-kerberosauthenticator 
 RUN pip3 install --no-cache-dir jupyterhub-nanowireauthenticator jupyterhub-ldapauthenticator jupyterhub-kubespawner jupyterhub-nativeauthenticator

diff --git a/services/jupyter-hub/Dockerfile_singleuser_gpu b/services/jupyter-hub/Dockerfile_singleuser_gpu
@@ -155,6 +155,7 @@ RUN pip3 install --no-cache-dir  setuptools wheel virtualenv
 
 # jupyterhub stuff
 RUN pip3 install --no-cache-dir ipywidgets jupyter jupyterhub jupyterlab jupyterlab-git importlib_metadata
+RUN pip3 install --no-cache-dir bitstream
 RUN pip3 install --no-cache-dir jupyterlab_widgets jupyter_contrib_core jupyter_contrib_nbextensions jupyter-server-proxy fastbook
 RUN pip3 install --no-cache-dir docker docker-compose dockerspawner jupyterhub-firstuseauthenticator jupyterhub-systemdspawner jupyterhub-jwtauthenticator jupyterhub-client jupyterhub-kerberosauthenticator 
 RUN pip3 install --no-cache-dir jupyterhub-nanowireauthenticator jupyterhub-ldapauthenticator jupyterhub-kubespawner jupyterhub-nativeauthenticator

diff --git a/services/jupyter-hub/notebooks/working_with_cogstack b/services/jupyter-hub/notebooks/working_with_cogstack
+1 −1		.github/workflows/main.yml
+14 −35		medcat/evaluate_mct_export/mct_analysis.py
+33 −1,802		medcat/evaluate_mct_export/mct_export_summary.ipynb
+4 −4		requirements.txt