Fix: Adapt to old silixa xml versions
- Split functions for the old vs new xml readers
- Added some checks for mismatched libraries and datasets in plotting
- Bug fix for modern versions of xarray in merging single into double-ended
klapo committed Mar 14, 2023
1 parent c3a8eac commit 2f7aadd
Showing 4 changed files with 99 additions and 10 deletions.
14 changes: 9 additions & 5 deletions src/pyfocs/data.py
@@ -153,7 +153,7 @@ def double_end_dv_clean(ds):


def merge_single(
dstore_fw, dstore_bw, shift_window=20, fixed_shift=None, plot_result=False
dstore_fw, dstore_bw, shift_window=500, fixed_shift=None, plot_result=False
):
"""
Merge two single-ended channels to a single double-ended configuration.
@@ -172,11 +172,15 @@ def merge_single(
if not fixed_shift:
shift_lims = np.arange(-shift_window, shift_window, 1, dtype=int)

# Estimate the number of indices to shift the two channels to align them.
# I use some overly generous limits for searching
# This parameter should be made an optional argument passed to the function.
# Estimate the number of indices to shift the two channels to align them. The
# time mean is used, but dtscalibration relies on having a matrix of a
# particular dimensionality. Accommodate this by expanding back out the time
# dimension after averaging.
shift1, shift2 = suggest_cable_shift_double_ended(
double.mean(dim="time").compute(),
double[["st", "ast", "rst", "rast"]]
.mean(dim="time")
.compute()
.expand_dims("time", axis=1),
shift_lims,
plot_result=plot_result,
)
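
The comment above on dimensionality is worth a concrete illustration. Below is a minimal, self-contained sketch of the mean-then-expand_dims pattern used in the new call to suggest_cable_shift_double_ended; the synthetic dataset and sizes are invented for the example, and only the variable names (st, ast, rst, rast) come from the diff.

import numpy as np
import xarray as xr

# Synthetic double-ended Stokes/anti-Stokes data with dims (x, time).
laf = np.linspace(0.0, 100.0, 50)
time = np.arange(10)
double = xr.Dataset(
    {
        name: (("x", "time"), np.random.rand(laf.size, time.size))
        for name in ("st", "ast", "rst", "rast")
    },
    coords={"x": laf, "time": time},
)

# Averaging over time drops the "time" dimension entirely...
mean_only = double[["st", "ast", "rst", "rast"]].mean(dim="time").compute()
print(mean_only["st"].dims)  # ('x',)

# ...so expand it back out as a length-1 axis, restoring the 2-D
# (space, time) shape that downstream routines expect.
restored = mean_only.expand_dims("time", axis=1)
print(restored["st"].dims)   # ('x', 'time')
print(restored["st"].shape)  # (50, 1)
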
5 changes: 5 additions & 0 deletions src/pyfocs/dts_plots.py
@@ -119,6 +119,11 @@ def bath_validation(
except KeyError:
del callib[bn]

if not callib:
raise KeyError(
"None of the specified reference sections could be found in the dataset."
)

if plot_var == "bias":
label_text = "Bias (K)"
if not bath_lims:
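
For context, the new guard follows a prune-then-check pattern: reference sections missing from the dataset are dropped from the calibration library, and an error is raised only if nothing usable remains. A minimal stand-alone sketch is below; callib and the bath names are illustrative stand-ins, not pyfocs objects.

callib = {"warm_bath": (10.0, 12.5), "cold_bath": (55.0, 58.0)}
dataset = {"warm_bath": [1.2, 1.3]}  # "cold_bath" is missing from the dataset

# Copy the keys with list() so entries can be deleted while iterating.
for bn in list(callib):
    if bn not in dataset:
        del callib[bn]

if not callib:
    raise KeyError(
        "None of the specified reference sections could be found in the dataset."
    )
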
2 changes: 1 addition & 1 deletion src/pyfocs/dtsarch.py
@@ -263,7 +263,7 @@ def archiver(cfg):
break

# If the archiving mode is active, do not process
# the last archiving interval, as it may be incomplete. Instead
# the last archiving interval, as it may be incomplete. Instead,
# exit and handle this interval with the next scheduled call
# to the archiver. This step just requires no action here.
if mode == "active":
88 changes: 84 additions & 4 deletions src/pyfocs/readDTS.py
@@ -38,9 +38,83 @@ def xml_read(dumbXMLFile):
# general catch here and hope for the best.
# Raising this error allows us to catch corrupted files.
raise CorruptedXMLError
# Remove all of the bullshit
doc = doc["logs"]["log"]
# Strip the unneeded metadata and accommodate the various versions.

if "logs" in doc:
doc = doc["logs"]["log"]
return read_new_silixa_xml(doc)
# The xml file is older.
elif "wellLogs" in doc:
doc = doc["wellLogs"]["wellLog"]
return read_old_silixa_xml(doc)


def read_old_silixa_xml(doc):
# Extract units/metadata info out of xml dictionary
metaData = {
"LAF_beg": float(doc["minIndex"]["#text"]),
"LAF_end": float(doc["maxIndex"]["#text"]),
"dLAF": float(doc["blockInfo"]["stepIncrement"]["#text"]),
"dt_start": pd.to_datetime(
doc["minDateTimeIndex"], infer_datetime_format=True, utc=True
),
"dt_end": pd.to_datetime(
doc["maxDateTimeIndex"], infer_datetime_format=True, utc=True
),
"probe1Temperature": float(doc["customData"]["probe1Temperature"]),
"probe2Temperature": float(doc["customData"]["probe2Temperature"]),
}

# Extract data
data = doc["logData"]["data"]

numEntries = np.size(data)
LAF = np.empty(numEntries)
Ps = np.empty_like(LAF)
Pas = np.empty_like(LAF)
temp = np.empty_like(LAF)

# Check dts type based on the number of columns
if len(data[0]["#text"].split(",")) == 4:
dtsType = "single_ended"
elif len(data[0]["#text"].split(",")) == 6:
dtsType = "double_ended"
else:
raise IOError("Unrecognized xml format... dumping first row \n" + str(data[0]))

# Single ended data
if "single_ended" in dtsType:
for dnum, dlist in enumerate(data):
LAF[dnum], Ps[dnum], Pas[dnum], temp[dnum] = list(
map(float, dlist["#text"].split(","))
)
actualData = pd.DataFrame.from_dict(
{"LAF": LAF, "Ps": Ps, "Pas": Pas, "temp": temp}
).set_index("LAF")

# Double ended data
elif "double_ended" in dtsType:
rPs = np.empty_like(LAF)
rPas = np.empty_like(LAF)

for dnum, dlist in enumerate(data):
(
LAF[dnum],
Ps[dnum],
Pas[dnum],
rPs[dnum],
rPas[dnum],
temp[dnum],
) = list(map(float, dlist["#text"].split(",")))

actualData = pd.DataFrame.from_dict(
{"LAF": LAF, "Ps": Ps, "Pas": Pas, "rPs": rPs, "rPas": rPas, "temp": temp}
).set_index("LAF")

return (actualData, metaData)


def read_new_silixa_xml(doc):
# Extract units/metadata info out of xml dictionary
metaData = {
"LAF_beg": float(doc["startIndex"]["#text"]),
@@ -143,6 +217,8 @@ def archive_read(cfg, write_mode="preserve", prevNumChunk=0):
# List of files to iterate over
dirConTar = [dC for dC in os.listdir() if chan in dC and ".tar.gz" in dC]
dirConTar.sort()
if not dirConTar:
print("No archives were found for {}.".format(chan))

# Untar files
for tFile in dirConTar:
@@ -161,13 +237,17 @@ def archive_read(cfg, write_mode="preserve", prevNumChunk=0):
# Extract the archive
t = tarfile.open(tFile)
t.extractall()
t.close
t.close()

# List of files to iterate over
dirConXML = [dC for dC in os.listdir() if chan in dC and ".xml" in dC]
# No .xml files could be found, so an error needs to be raised.
if not dirConXML:
raise ValueError("No xml files found in {}".format(tFile))

# Sorting by name should be equivalent to sorting by date.
dirConXML.sort()
nTotal = np.size(dirConXML)
ds = None
ds_list = []

# Read each xml file, assign to an xarray Dataset, concatenate
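
The version split above hinges on the root element that xmltodict returns: new Silixa files nest the measurement under logs/log, older files under wellLogs/wellLog. A small stand-alone sketch of that detection is below; the toy XML strings are invented for illustration and carry none of the real instrument fields.

import xmltodict

new_style = "<logs><log><startIndex uom='m'>0.0</startIndex></log></logs>"
old_style = "<wellLogs><wellLog><minIndex uom='m'>0.0</minIndex></wellLog></wellLogs>"

def detect_silixa_version(raw_xml):
    doc = xmltodict.parse(raw_xml)
    # New-style files put everything under logs/log, old-style under wellLogs/wellLog.
    if "logs" in doc:
        return "new", doc["logs"]["log"]
    elif "wellLogs" in doc:
        return "old", doc["wellLogs"]["wellLog"]
    raise IOError("Unrecognized Silixa xml layout.")

print(detect_silixa_version(new_style)[0])  # new
print(detect_silixa_version(old_style)[0])  # old
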
