diff --git a/ChangeLog.md b/ChangeLog.md
index 371464d6..8cb9d77b 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -8,6 +8,7 @@ Performance improvements:
 
 * Revise `Table._fast_merge` to use COO directly. For very large tables, this reduces runtime by ~50x and memory by ~5x. See PR [#913](https://github.com/biocore/biom-format/pull/933).
 * Drastically reduce the memory needs of subsampling when sums are large. Also adds 64-bit support. See PR [#935](https://github.com/biocore/biom-format/pull/935).
+* Improve handling of not-perfectly-integer inputs. See PR [#938](https://github.com/biocore/biom-format/pull/938).
 
 biom 2.1.15
 -----------
diff --git a/biom/_subsample.pyx b/biom/_subsample.pyx
index 7abca385..6bf212ac 100644
--- a/biom/_subsample.pyx
+++ b/biom/_subsample.pyx
@@ -38,7 +38,7 @@ cdef _subsample_with_replacement(cnp.ndarray[cnp.float64_t, ndim=1] data,
 
     """
     cdef:
-        cnp.int64_t counts_sum
+        cnp.float64_t counts_sum
         cnp.int32_t start,end,length
         Py_ssize_t i
         cnp.ndarray[cnp.float64_t, ndim=1] pvals
@@ -83,14 +83,19 @@ cdef _subsample_without_replacement(cnp.ndarray[cnp.float64_t, ndim=1] data,
     cdef:
         cnp.int64_t counts_sum, count_el, perm_count_el
         cnp.int64_t count_rem
-        cnp.ndarray[cnp.int64_t, ndim=1] permuted
+        cnp.ndarray[cnp.int64_t, ndim=1] permuted, intdata
         Py_ssize_t i, idx
         cnp.int32_t length,el,start,end
+        cnp.int64_t el_cnt
 
     for i in range(indptr.shape[0] - 1):
         start, end = indptr[i], indptr[i+1]
         length = end - start
-        counts_sum = data[start:end].sum()
+        # We are relying on data being integers
+        # If there are rounding erros, fp64 sums can lead to
+        # big errors in sum, so convert to int64, first
+        intdata = data[start:end].astype(np.int64)
+        counts_sum = intdata.sum()
         
         if counts_sum < n:
             data[start:end] = 0
@@ -115,22 +120,34 @@ cdef _subsample_without_replacement(cnp.ndarray[cnp.float64_t, ndim=1] data,
 
         el = 0         # index in result/data
         count_el = 0  # index in permutted
-        count_rem = long(data[start])  # since each data has multiple els, sub count there
-        data[start] = 0.0
+        count_rem = intdata[0]  # since each data has multiple els, keep track how many are left
+        el_cnt = 0
         for idx in range(n):
             perm_count_el = permuted[idx]
-            # the array is sorted, so just jump ahead
+            # The array is sorted, so just jump ahead if needed
+            # Move until we get withing the elements range
             while (perm_count_el - count_el) >= count_rem:
-               count_el += count_rem
+               #save the computed value
+               data[start+el] = el_cnt
+               # move to next element
                el += 1
-               count_rem = long(data[start+el])
-               data[start+el] = 0.0
+               # move to the beginning of next element
+               count_el += count_rem
+               # Load how much we have avaialble
+               count_rem = intdata[el]
+               #re-start the el counter
+               el_cnt = 0
+            # increment the el counter
+            el_cnt += 1
+            # update the counters
+            # reduce what is left
             count_rem -= (perm_count_el - count_el)
+            #move the pointer to where we stopped
             count_el = perm_count_el
-
-            data[start+el] += 1
+        # save the last value
+        data[start+el] = el_cnt
         # clean up tail elements
-        data[start+el+1:end] = 0.0
+        data[start+el+1:end] = 0
 
 
 def _subsample(arr, n, with_replacement, rng):