From 9495744d6a433cdf4ead00880098f00f98812478 Mon Sep 17 00:00:00 2001
From: Matt Dahl <matt.dahl.2013@gmail.com>
Date: Fri, 14 Jul 2023 23:08:09 -0700
Subject: [PATCH 01/10] test(models): Adds failing test for citation with
 corrected reporter.

---
 tests/test_ModelsTest.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tests/test_ModelsTest.py b/tests/test_ModelsTest.py
index 8e025af7..76f5be30 100644
--- a/tests/test_ModelsTest.py
+++ b/tests/test_ModelsTest.py
@@ -42,6 +42,20 @@ def test_resource_comparison_with_missing_page_cites(self):
         self.assertNotEqual(hash(citations[0]), hash(citations[1]))
         print("✓")
 
+    def test_citation_comparison_with_corrected_reporter(self):
+        """Are two citation objects equal when their attributes are
+        the same, even if the reporter has been normalized?"""
+        citations = [
+            case_citation(2, volume="2", reporter="U.S.", page="4"),
+            case_citation(2, volume="2", reporter="U. S.", page="4"),
+        ]
+        print(
+            "Testing citation comparison with corrected reporter...", end=" "
+        )
+        self.assertEqual(citations[0], citations[1])
+        self.assertEqual(hash(citations[0]), hash(citations[1]))
+        print("✓")
+
     def test_missing_page_cite_conversion(self):
         """Do citations with missing page numbers get their groups['page']
         attribute set to None?"""

From b46cf5dbadba3e04cd9abb23e7594d477d0fdebf Mon Sep 17 00:00:00 2001
From: Matt Dahl <matt.dahl.2013@gmail.com>
Date: Fri, 14 Jul 2023 23:11:53 -0700
Subject: [PATCH 02/10] feat(models): Explicitly declares citation object
 __hash__() functions.

---
 eyecite/models.py | 91 +++++++++++++++++++++++++++++++++--------------
 1 file changed, 64 insertions(+), 27 deletions(-)

diff --git a/eyecite/models.py b/eyecite/models.py
index 5fff5903..6aaa5504 100644
--- a/eyecite/models.py
+++ b/eyecite/models.py
@@ -60,7 +60,7 @@ def includes_year(
         )
 
 
-@dataclass(eq=True, unsafe_hash=True)
+@dataclass(eq=False, unsafe_hash=False)
 class CitationBase:
     """Base class for objects returned by `eyecite.find.get_citations`. We
     define several subclasses of this class below, representing the various
@@ -101,21 +101,29 @@ def __repr__(self):
             + ")"
         )
 
+    def __hash__(self) -> int:
+        """In general, citations are considered equivalent if they have the
+        same group values (i.e., the same regex group content that is extracted
+        from the matched text). Subclasses may override this method in order to
+        specify equivalence behavior that is more appropriate for certain
+        kinds of citations (e.g., see CaseCitation override).
+        """
+        return hash((type(self), tuple(self.groups.items())))
+
+    def __eq__(self, other):
+        """This method is inherited by all subclasses and should not be
+        overridden. It implements object equality in exactly the same way as
+        defined in an object's __hash__() function, which should be overridden
+        instead if desired.
+        """
+        return self.__hash__() == other.__hash__()
+
     @dataclass(eq=True, unsafe_hash=True)
     class Metadata:
         """Define fields on self.metadata."""
 
         parenthetical: Optional[str] = None
 
-    def comparison_hash(self) -> int:
-        """Return hash that will be the same if two cites are semantically
-        equivalent, unless the citation is a CaseCitation missing a page.
-        """
-        if isinstance(self, CaseCitation) and self.groups["page"] is None:
-            return id(self)
-        else:
-            return hash((type(self), tuple(self.groups.items())))
-
     def corrected_citation(self):
         """Return citation with any variations normalized."""
         return self.matched_text()
@@ -170,7 +178,7 @@ def full_span(self) -> Tuple[int, int]:
         return start, end
 
 
-@dataclass(eq=True, unsafe_hash=True, repr=False)
+@dataclass(eq=False, unsafe_hash=False, repr=False)
 class ResourceCitation(CitationBase):
     """Base class for a case, law, or journal citation. Could be short or
     long."""
@@ -194,6 +202,13 @@ def __post_init__(self):
         )
         super().__post_init__()
 
+    def __hash__(self) -> int:
+        """ResourceCitation objects are hashed in the same way as their
+        parent class (CitationBase) objects, except that we also take into
+        consideration the all_editions field.
+        """
+        return hash((super().__hash__(), self.all_editions))
+
     @dataclass(eq=True, unsafe_hash=True)
     class Metadata(CitationBase.Metadata):
         """Define fields on self.metadata."""
@@ -201,11 +216,6 @@ class Metadata(CitationBase.Metadata):
         pin_cite: Optional[str] = None
         year: Optional[str] = None
 
-    def comparison_hash(self) -> int:
-        """Return hash that will be the same if two cites are semantically
-        equivalent."""
-        return hash((super().comparison_hash(), self.all_editions))
-
     def add_metadata(self, words: "Tokens"):
         """Extract metadata from text before and after citation."""
         self.guess_edition()
@@ -248,13 +258,13 @@ def guess_edition(self):
             self.edition_guess = editions[0]
 
 
-@dataclass(eq=True, unsafe_hash=True, repr=False)
+@dataclass(eq=False, unsafe_hash=False, repr=False)
 class FullCitation(ResourceCitation):
     """Abstract base class indicating that a citation fully identifies a
     resource."""
 
 
-@dataclass(eq=True, unsafe_hash=True, repr=False)
+@dataclass(eq=False, unsafe_hash=False, repr=False)
 class FullLawCitation(FullCitation):
     """Citation to a source from `reporters_db/laws.json`."""
 
@@ -291,7 +301,7 @@ def corrected_citation_full(self):
         return "".join(parts)
 
 
-@dataclass(eq=True, unsafe_hash=True, repr=False)
+@dataclass(eq=False, unsafe_hash=False, repr=False)
 class FullJournalCitation(FullCitation):
     """Citation to a source from `reporters_db/journals.json`."""
 
@@ -317,12 +327,31 @@ def corrected_citation_full(self):
         return "".join(parts)
 
 
-@dataclass(eq=True, unsafe_hash=True, repr=False)
+@dataclass(eq=False, unsafe_hash=False, repr=False)
 class CaseCitation(ResourceCitation):
     """Convenience class which represents a single citation found in a
     document.
     """
 
+    def __hash__(self) -> int:
+        """CaseCitation objects that have the same volume, reporter, and page
+        are considered equivalent, unless the citation is missing a page, in
+        which case the object's hash will be unique for safety.
+        """
+        if self.groups["page"] is None:
+            return id(self)
+        else:
+            return hash(
+                (
+                    type(self),
+                    frozenset({
+                        'volume': self.groups["volume"],
+                        'reporter': self.corrected_reporter(),
+                        'page': self.groups["page"]
+                    }.items()),
+                )
+            )
+
     @dataclass(eq=True, unsafe_hash=True)
     class Metadata(FullCitation.Metadata):
         """Define fields on self.metadata."""
@@ -339,7 +368,7 @@ def guess_court(self):
             self.metadata.court = "scotus"
 
 
-@dataclass(eq=True, unsafe_hash=True, repr=False)
+@dataclass(eq=False, unsafe_hash=False, repr=False)
 class FullCaseCitation(CaseCitation, FullCitation):
     """Convenience class which represents a standard, fully named citation,
     i.e., the kind of citation that marks the first time a document is cited.
@@ -389,7 +418,7 @@ def corrected_citation_full(self):
         return "".join(parts)
 
 
-@dataclass(eq=True, unsafe_hash=True, repr=False)
+@dataclass(eq=False, unsafe_hash=False, repr=False)
 class ShortCaseCitation(CaseCitation):
     """Convenience class which represents a short form citation, i.e., the kind
     of citation made after a full citation has already appeared. This kind of
@@ -419,7 +448,7 @@ def corrected_citation_full(self):
         return "".join(parts)
 
 
-@dataclass(eq=True, unsafe_hash=True, repr=False)
+@dataclass(eq=False, unsafe_hash=False, repr=False)
 class SupraCitation(CitationBase):
     """Convenience class which represents a 'supra' citation, i.e., a citation
     to something that is above in the document. Like a short form citation,
@@ -458,7 +487,7 @@ def formatted(self):
         return "".join(parts)
 
 
-@dataclass(eq=True, unsafe_hash=True, repr=False)
+@dataclass(eq=False, unsafe_hash=False, repr=False)
 class IdCitation(CitationBase):
     """Convenience class which represents an 'id' or 'ibid' citation, i.e., a
     citation to the document referenced immediately prior. An 'id' citation is
@@ -469,6 +498,10 @@ class IdCitation(CitationBase):
     Example: "... foo bar," id., at 240
     """
 
+    def __hash__(self) -> int:
+        """IdCitation objects are always considered unique for safety."""
+        return id(self)
+
     @dataclass(eq=True, unsafe_hash=True)
     class Metadata(CitationBase.Metadata):
         """Define fields on self.metadata."""
@@ -483,7 +516,7 @@ def formatted(self):
         return "".join(parts)
 
 
-@dataclass(eq=True, unsafe_hash=True, repr=False)
+@dataclass(eq=False, unsafe_hash=False, repr=False)
 class UnknownCitation(CitationBase):
     """Convenience class which represents an unknown citation. A recognized
     citation should theoretically be parsed as a CaseCitation, FullLawCitation,
@@ -491,6 +524,10 @@ class UnknownCitation(CitationBase):
     a naive catch-all.
     """
 
+    def __hash__(self) -> int:
+        """UnknownCitation objects are always considered unique for safety."""
+        return id(self)
+
 
 def NonopinionCitation(*args, **kwargs):
     from warnings import warn
@@ -647,13 +684,13 @@ class Resource(ResourceType):
 
     def __hash__(self):
         """Resources are the same if their citations are semantically
-        equivalent.
+        equivalent, as defined by their hash function.
 
         Note: Resources composed of citations with missing page numbers are
         NOT considered the same, even if their other attributes are identical.
         This is to avoid potential false positives.
         """
-        return self.citation.comparison_hash()
+        return hash(self.citation)
 
     def __eq__(self, other):
         return self.__hash__() == other.__hash__()

From bf5a3d1b180026f3d95941ab5b7bbda9a96fcea1 Mon Sep 17 00:00:00 2001
From: Matt Dahl <matt.dahl.2013@gmail.com>
Date: Fri, 14 Jul 2023 23:12:37 -0700
Subject: [PATCH 03/10] test(models): Adds new tests for new hashing behavior.

---
 eyecite/test_factories.py |  4 +-
 tests/test_ModelsTest.py  | 93 +++++++++++++++++++++++++++++++++++----
 2 files changed, 86 insertions(+), 11 deletions(-)

diff --git a/eyecite/test_factories.py b/eyecite/test_factories.py
index e69a467d..89d01e25 100644
--- a/eyecite/test_factories.py
+++ b/eyecite/test_factories.py
@@ -67,8 +67,8 @@ def case_citation(
 
 
 def law_citation(
-    source_text,
-    reporter,
+    source_text=None,
+    reporter="Mass. Gen. Laws",
     **kwargs,
 ):
     """Convenience function for creating mock FullLawCitation objects."""
diff --git a/tests/test_ModelsTest.py b/tests/test_ModelsTest.py
index 76f5be30..53294643 100644
--- a/tests/test_ModelsTest.py
+++ b/tests/test_ModelsTest.py
@@ -2,21 +2,28 @@
 
 from eyecite import get_citations
 from eyecite.models import Resource
-from eyecite.test_factories import case_citation
+from eyecite.test_factories import (
+    case_citation,
+    id_citation,
+    journal_citation,
+    law_citation,
+    unknown_citation,
+)
 
 
 class ModelsTest(TestCase):
     def test_citation_comparison(self):
         """Are two citation objects equal when their attributes are
         the same?"""
-        citations = [
-            case_citation(2, volume="2", reporter="U.S.", page="2"),
-            case_citation(2, volume="2", reporter="U.S.", page="2"),
-        ]
-        print("Testing citation comparison...", end=" ")
-        self.assertEqual(citations[0], citations[1])
-        self.assertEqual(hash(citations[0]), hash(citations[1]))
-        print("✓")
+        for factory in [case_citation, journal_citation, law_citation]:
+            citations = [
+                factory(),
+                factory(),
+            ]
+            print(f"Testing {factory.__name__} comparison...", end=" ")
+            self.assertEqual(citations[0], citations[1])
+            self.assertEqual(hash(citations[0]), hash(citations[1]))
+            print("✓")
 
     def test_resource_comparison(self):
         """Are two Resource objects equal when their citations' attributes are
@@ -42,6 +49,18 @@ def test_resource_comparison_with_missing_page_cites(self):
         self.assertNotEqual(hash(citations[0]), hash(citations[1]))
         print("✓")
 
+    def test_citation_comparison_with_missing_page_cites(self):
+        """Are two citation objects different when both of them are missing
+        a page, even if their other attributes are the same?"""
+        citations = [
+            case_citation(2, volume="2", reporter="U.S.", page="__"),
+            case_citation(2, volume="2", reporter="U.S.", page="__"),
+        ]
+        print("Testing citation comparison with missing pages...", end=" ")
+        self.assertNotEqual(citations[0], citations[1])
+        self.assertNotEqual(hash(citations[0]), hash(citations[1]))
+        print("✓")
+
     def test_citation_comparison_with_corrected_reporter(self):
         """Are two citation objects equal when their attributes are
         the same, even if the reporter has been normalized?"""
@@ -56,6 +75,62 @@ def test_citation_comparison_with_corrected_reporter(self):
         self.assertEqual(hash(citations[0]), hash(citations[1]))
         print("✓")
 
+    def test_citation_comparison_with_different_source_text(self):
+        """Are two citation objects equal when their attributes are
+        the same, even if they have different source text?"""
+        citations = [
+            case_citation(
+                source_text="foobar", volume="2", reporter="U.S.", page="4"
+            ),
+            case_citation(
+                source_text="foo", volume="2", reporter="U.S.", page="4"
+            ),
+        ]
+        print(
+            "Testing citation comparison with different source text...",
+            end=" ",
+        )
+        self.assertEqual(citations[0], citations[1])
+        self.assertEqual(hash(citations[0]), hash(citations[1]))
+        print("✓")
+
+    def test_citation_comparison_with_different_reporter(self):
+        """Are two citation objects different when they have different
+        reporters, even if their other attributes are the same?
+        (sanity check)"""
+        citations = [
+            case_citation(2, volume="2", reporter="F. Supp.", page="4"),
+            case_citation(2, volume="2", reporter="U. S.", page="4"),
+        ]
+        print(
+            "Testing citation comparison with different reporters...", end=" "
+        )
+        self.assertNotEqual(citations[0], citations[1])
+        self.assertNotEqual(hash(citations[0]), hash(citations[1]))
+        print("✓")
+
+    def test_id_citation_comparison(self):
+        """Are two IdCitation objects always different?"""
+        citations = [
+            id_citation("Id.,", metadata={"pin_cite": "at 123"}),
+            id_citation("Id.,", metadata={"pin_cite": "at 123"}),
+        ]
+        print("Testing id citation comparison...", end=" ")
+        self.assertNotEqual(citations[0], citations[1])
+        self.assertNotEqual(hash(citations[0]), hash(citations[1]))
+        print("✓")
+
+    def test_unknown_citation_comparison(self):
+        """Are two UnknownCitation objects always different?"""
+        citations = [
+            unknown_citation("§99"),
+            unknown_citation("§99"),
+        ]
+        print("Testing unknown citation comparison...", end=" ")
+        self.assertNotEqual(citations[0], citations[1])
+        self.assertNotEqual(hash(citations[0]), hash(citations[1]))
+        print("✓")
+
     def test_missing_page_cite_conversion(self):
         """Do citations with missing page numbers get their groups['page']
         attribute set to None?"""

From 7b976268616541b906e6ceb77f73dec2da439d2e Mon Sep 17 00:00:00 2001
From: Matt Dahl <matt.dahl.2013@gmail.com>
Date: Sat, 15 Jul 2023 00:17:40 -0700
Subject: [PATCH 04/10] feat(models): Implements reproducible hashing of
 citation objects.

---
 eyecite/models.py | 46 +++++++++++++++++++++++++++++++---------------
 eyecite/utils.py  | 29 ++++++++++++++++++++++-------
 2 files changed, 53 insertions(+), 22 deletions(-)

diff --git a/eyecite/models.py b/eyecite/models.py
index 6aaa5504..440e0a67 100644
--- a/eyecite/models.py
+++ b/eyecite/models.py
@@ -1,6 +1,6 @@
 import re
 from collections import UserString
-from dataclasses import dataclass, field
+from dataclasses import asdict, dataclass, field
 from datetime import datetime
 from typing import (
     Any,
@@ -15,7 +15,7 @@
     cast,
 )
 
-from eyecite.utils import HashableDict
+from eyecite.utils import hash_sha256
 
 ResourceType = Hashable
 
@@ -79,7 +79,7 @@ class CitationBase:
     def __post_init__(self):
         """Set up groups and metadata."""
         # Allow groups to be used in comparisons:
-        self.groups = HashableDict(self.token.groups)
+        self.groups = self.token.groups
         # Make metadata a self.Metadata object:
         self.metadata = (
             self.Metadata(**self.metadata)
@@ -108,7 +108,9 @@ def __hash__(self) -> int:
         specify equivalence behavior that is more appropriate for certain
         kinds of citations (e.g., see CaseCitation override).
         """
-        return hash((type(self), tuple(self.groups.items())))
+        return hash_sha256(
+            {**dict(self.groups.items()), **{"class": type(self).__name__}}
+        )
 
     def __eq__(self, other):
         """This method is inherited by all subclasses and should not be
@@ -207,7 +209,18 @@ def __hash__(self) -> int:
         parent class (CitationBase) objects, except that we also take into
         consideration the all_editions field.
         """
-        return hash((super().__hash__(), self.all_editions))
+        return hash_sha256(
+            {
+                **dict(self.groups.items()),
+                **{
+                    "all_editions": sorted(
+                        [asdict(e) for e in self.all_editions],
+                        key=lambda d: d["short_name"],  # type: ignore
+                    ),
+                    "class": type(self).__name__,
+                },
+            }
+        )
 
     @dataclass(eq=True, unsafe_hash=True)
     class Metadata(CitationBase.Metadata):
@@ -341,15 +354,13 @@ def __hash__(self) -> int:
         if self.groups["page"] is None:
             return id(self)
         else:
-            return hash(
-                (
-                    type(self),
-                    frozenset({
-                        'volume': self.groups["volume"],
-                        'reporter': self.corrected_reporter(),
-                        'page': self.groups["page"]
-                    }.items()),
-                )
+            return hash_sha256(
+                {
+                    "volume": self.groups["volume"],
+                    "reporter": self.corrected_reporter(),
+                    "page": self.groups["page"],
+                    "class": type(self).__name__,
+                }
             )
 
     @dataclass(eq=True, unsafe_hash=True)
@@ -690,7 +701,12 @@ def __hash__(self):
         NOT considered the same, even if their other attributes are identical.
         This is to avoid potential false positives.
         """
-        return hash(self.citation)
+        return hash_sha256(
+            {
+                "citation": hash(self.citation),
+                "class": type(self).__name__,
+            }
+        )
 
     def __eq__(self, other):
         return self.__hash__() == other.__hash__()
diff --git a/eyecite/utils.py b/eyecite/utils.py
index a7df001f..f632ea51 100644
--- a/eyecite/utils.py
+++ b/eyecite/utils.py
@@ -1,4 +1,7 @@
+import hashlib
+import json
 import re
+from ctypes import c_int32
 
 from lxml import etree
 
@@ -72,13 +75,6 @@ def on_match(index, start, end, flags, context):
     return matches
 
 
-class HashableDict(dict):
-    """Dict that works as an attribute of a hashable dataclass."""
-
-    def __hash__(self):
-        return hash(frozenset(self.items()))
-
-
 def dump_citations(citations, text, context_chars=30):
     """Dump citations extracted from text, for debugging. Example:
     >>> text = "blah. Foo v. Bar, 1 U.S. 1, 2 (1999). blah"
@@ -117,3 +113,22 @@ def dump_citations(citations, text, context_chars=30):
                 else:
                     out.append(f"  * {key}={repr(value)}")
     return "\n".join(out)
+
+
+def hash_sha256(dictionary: dict) -> int:
+    """Hash dictionaries in a deterministic way.
+
+    :param dictionary: The dictionary to hash
+    :return: An integer hash
+    """
+
+    # Convert the dictionary to a JSON string
+    json_str: str = json.dumps(dictionary, sort_keys=True)
+
+    # Convert the JSON string to bytes
+    json_bytes: bytes = json_str.encode("utf-8")
+
+    # Calculate the hash of the bytes, convert to 32-bit int, and return
+    return c_int32(
+        int.from_bytes(hashlib.sha256(json_bytes).digest(), byteorder="big")
+    ).value

From 5626b161617f48f1783a77ff442c8666bc6e5446 Mon Sep 17 00:00:00 2001
From: Matt Dahl <matt.dahl.2013@gmail.com>
Date: Sat, 15 Jul 2023 00:17:58 -0700
Subject: [PATCH 05/10] test(models): Adds new tests for reproducible hashing.

---
 tests/test_ModelsTest.py | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/tests/test_ModelsTest.py b/tests/test_ModelsTest.py
index 53294643..079bf750 100644
--- a/tests/test_ModelsTest.py
+++ b/tests/test_ModelsTest.py
@@ -141,3 +141,37 @@ def test_missing_page_cite_conversion(self):
         self.assertIsNone(citation1.groups["page"])
         self.assertIsNone(citation2.groups["page"])
         print("✓")
+
+    def test_persistent_hash(self):
+        """Are object hashes reproducible across runs?"""
+        print("Testing persistent citation hash...", end=" ")
+        objects = [
+            (
+                case_citation(),
+                1009797070,
+            ),
+            (
+                journal_citation(),
+                -1332833206,
+            ),
+            (
+                law_citation(),
+                554454242,
+            ),
+            (
+                Resource(case_citation()),
+                -666984820,
+            ),
+        ]
+        for citation, citation_hash in objects:
+            self.assertEqual(hash(citation), citation_hash)
+            print("✓")
+
+    def test_hash_function_identity(self):
+        """Do hash() and __hash__() output the same hash?"""
+        citation = case_citation()
+        resource = Resource(case_citation())
+        print("Testing hash function identity...", end=" ")
+        self.assertEqual(hash(citation), citation.__hash__())
+        self.assertEqual(hash(resource), resource.__hash__())
+        print("✓")

From c80526f12b98e17d009d9f64a953d1f09952cef3 Mon Sep 17 00:00:00 2001
From: Matt Dahl <matt.dahl.2013@gmail.com>
Date: Sat, 15 Jul 2023 00:29:52 -0700
Subject: [PATCH 06/10] test(models): Adds test for nominative reporter
 equality (#154).

---
 tests/test_ModelsTest.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tests/test_ModelsTest.py b/tests/test_ModelsTest.py
index 079bf750..a4edccc8 100644
--- a/tests/test_ModelsTest.py
+++ b/tests/test_ModelsTest.py
@@ -94,6 +94,20 @@ def test_citation_comparison_with_different_source_text(self):
         self.assertEqual(hash(citations[0]), hash(citations[1]))
         print("✓")
 
+    def test_citation_comparison_with_nominative_reporter(self):
+        """Are two citation objects equal when their attributes are
+        the same, even if one of them has a nominative reporter?"""
+        citations = [
+            get_citations("5 U.S. 137")[0],
+            get_citations("5 U.S. (1 Cranch) 137")[0],
+        ]
+        print(
+            "Testing citation comparison with nominative reporter...", end=" "
+        )
+        self.assertEqual(citations[0], citations[1])
+        self.assertEqual(hash(citations[0]), hash(citations[1]))
+        print("✓")
+
     def test_citation_comparison_with_different_reporter(self):
         """Are two citation objects different when they have different
         reporters, even if their other attributes are the same?

From 0a305592d9581dfd44930c04e575d0cd49b6e117 Mon Sep 17 00:00:00 2001
From: Matt Dahl <matt.dahl.2013@gmail.com>
Date: Tue, 18 Jul 2023 14:12:00 -0700
Subject: [PATCH 07/10] refactor(models): Removes 32-bit truncation of hashes.

---
 eyecite/models.py        | 58 +++++++++++++++++++++++-----------------
 eyecite/utils.py         |  7 ++---
 tests/test_ModelsTest.py |  8 +++---
 3 files changed, 39 insertions(+), 34 deletions(-)

diff --git a/eyecite/models.py b/eyecite/models.py
index 440e0a67..c4173c9d 100644
--- a/eyecite/models.py
+++ b/eyecite/models.py
@@ -108,8 +108,10 @@ def __hash__(self) -> int:
         specify equivalence behavior that is more appropriate for certain
         kinds of citations (e.g., see CaseCitation override).
         """
-        return hash_sha256(
-            {**dict(self.groups.items()), **{"class": type(self).__name__}}
+        return hash(
+            hash_sha256(
+                {**dict(self.groups.items()), **{"class": type(self).__name__}}
+            )
         )
 
     def __eq__(self, other):
@@ -209,17 +211,19 @@ def __hash__(self) -> int:
         parent class (CitationBase) objects, except that we also take into
         consideration the all_editions field.
         """
-        return hash_sha256(
-            {
-                **dict(self.groups.items()),
-                **{
-                    "all_editions": sorted(
-                        [asdict(e) for e in self.all_editions],
-                        key=lambda d: d["short_name"],  # type: ignore
-                    ),
-                    "class": type(self).__name__,
-                },
-            }
+        return hash(
+            hash_sha256(
+                {
+                    **dict(self.groups.items()),
+                    **{
+                        "all_editions": sorted(
+                            [asdict(e) for e in self.all_editions],
+                            key=lambda d: d["short_name"],  # type: ignore
+                        ),
+                        "class": type(self).__name__,
+                    },
+                }
+            )
         )
 
     @dataclass(eq=True, unsafe_hash=True)
@@ -354,13 +358,15 @@ def __hash__(self) -> int:
         if self.groups["page"] is None:
             return id(self)
         else:
-            return hash_sha256(
-                {
-                    "volume": self.groups["volume"],
-                    "reporter": self.corrected_reporter(),
-                    "page": self.groups["page"],
-                    "class": type(self).__name__,
-                }
+            return hash(
+                hash_sha256(
+                    {
+                        "volume": self.groups["volume"],
+                        "reporter": self.corrected_reporter(),
+                        "page": self.groups["page"],
+                        "class": type(self).__name__,
+                    }
+                )
             )
 
     @dataclass(eq=True, unsafe_hash=True)
@@ -701,11 +707,13 @@ def __hash__(self):
         NOT considered the same, even if their other attributes are identical.
         This is to avoid potential false positives.
         """
-        return hash_sha256(
-            {
-                "citation": hash(self.citation),
-                "class": type(self).__name__,
-            }
+        return hash(
+            hash_sha256(
+                {
+                    "citation": hash(self.citation),
+                    "class": type(self).__name__,
+                }
+            )
         )
 
     def __eq__(self, other):
diff --git a/eyecite/utils.py b/eyecite/utils.py
index f632ea51..b642d23e 100644
--- a/eyecite/utils.py
+++ b/eyecite/utils.py
@@ -1,7 +1,6 @@
 import hashlib
 import json
 import re
-from ctypes import c_int32
 
 from lxml import etree
 
@@ -128,7 +127,5 @@ def hash_sha256(dictionary: dict) -> int:
     # Convert the JSON string to bytes
     json_bytes: bytes = json_str.encode("utf-8")
 
-    # Calculate the hash of the bytes, convert to 32-bit int, and return
-    return c_int32(
-        int.from_bytes(hashlib.sha256(json_bytes).digest(), byteorder="big")
-    ).value
+    # Calculate the hash of the bytes, convert to an int, and return
+    return int.from_bytes(hashlib.sha256(json_bytes).digest(), byteorder="big")
diff --git a/tests/test_ModelsTest.py b/tests/test_ModelsTest.py
index a4edccc8..2ec30c3c 100644
--- a/tests/test_ModelsTest.py
+++ b/tests/test_ModelsTest.py
@@ -162,19 +162,19 @@ def test_persistent_hash(self):
         objects = [
             (
                 case_citation(),
-                1009797070,
+                376794172219282606,
             ),
             (
                 journal_citation(),
-                -1332833206,
+                1073308118601601409,
             ),
             (
                 law_citation(),
-                554454242,
+                407008277458283218,
             ),
             (
                 Resource(case_citation()),
-                -666984820,
+                1986750081022884797,
             ),
         ]
         for citation, citation_hash in objects:

From 444250d7e98930c3334c74fd1ad845ed44a836eb Mon Sep 17 00:00:00 2001
From: Matt Dahl <matt.dahl.2013@gmail.com>
Date: Fri, 21 Jul 2023 10:37:00 -0700
Subject: [PATCH 08/10] test(models): Adds failing test for tax citation
 hashing.

---
 tests/test_ModelsTest.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tests/test_ModelsTest.py b/tests/test_ModelsTest.py
index 2ec30c3c..906aa29f 100644
--- a/tests/test_ModelsTest.py
+++ b/tests/test_ModelsTest.py
@@ -123,6 +123,19 @@ def test_citation_comparison_with_different_reporter(self):
         self.assertNotEqual(hash(citations[0]), hash(citations[1]))
         print("✓")
 
+    def test_tax_court_citation_comparison(self):
+        """Are two citation objects equal when their attributes are
+        the same, even if they are tax court citations and might not
+        have volumes?"""
+        citations = [
+            get_citations("T.C.M. (RIA) ¶ 95,342")[0],
+            get_citations("T.C.M. (RIA) ¶ 95,342")[0],
+        ]
+        print("Testing tax court citation comparison...", end=" ")
+        self.assertEqual(citations[0], citations[1])
+        self.assertEqual(hash(citations[0]), hash(citations[1]))
+        print("✓")
+
     def test_id_citation_comparison(self):
         """Are two IdCitation objects always different?"""
         citations = [

From 38171a9041157e34234a3ea0d41fe81db8ecb3a3 Mon Sep 17 00:00:00 2001
From: Matt Dahl <matt.dahl.2013@gmail.com>
Date: Fri, 21 Jul 2023 10:39:21 -0700
Subject: [PATCH 09/10] fix(models): Fixes bug re potentially nonexistent keys
 in CaseCitation hash function.

---
 eyecite/models.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/eyecite/models.py b/eyecite/models.py
index 2cf1c858..806f7fac 100644
--- a/eyecite/models.py
+++ b/eyecite/models.py
@@ -361,10 +361,15 @@ def __hash__(self) -> int:
             return hash(
                 hash_sha256(
                     {
-                        "volume": self.groups["volume"],
-                        "reporter": self.corrected_reporter(),
-                        "page": self.groups["page"],
-                        "class": type(self).__name__,
+                        **{
+                            k: self.groups[k]
+                            for k in ["volume", "page"]
+                            if k in self.groups
+                        },
+                        **{
+                            "reporter": self.corrected_reporter(),
+                            "class": type(self).__name__,
+                        },
                     }
                 )
             )

From a51987323e41cb92ec4d0bc1f96b9533b5732586 Mon Sep 17 00:00:00 2001
From: Matt Dahl <matt.dahl.2013@gmail.com>
Date: Fri, 21 Jul 2023 11:07:00 -0700
Subject: [PATCH 10/10] feat(docs): Documents self.groups content for different
 classes.

---
 eyecite/models.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/eyecite/models.py b/eyecite/models.py
index 806f7fac..4ab71f2c 100644
--- a/eyecite/models.py
+++ b/eyecite/models.py
@@ -107,6 +107,25 @@ def __hash__(self) -> int:
         from the matched text). Subclasses may override this method in order to
         specify equivalence behavior that is more appropriate for certain
         kinds of citations (e.g., see CaseCitation override).
+
+        self.groups typically contains different keys for different objects:
+
+        FullLawCitation (non-exhaustive and non-guaranteed):
+        - chapter
+        - reporter
+        - law_section
+        - issue
+        - page
+        - docket_number
+        - pamphlet
+        - title
+
+        FullJournalCitation (non-exhaustive and non-guaranteed):
+        - volume
+        - reporter
+        - page
+
+        FullCaseCitation (see CaseCitation.__hash__() notes)
         """
         return hash(
             hash_sha256(
@@ -354,6 +373,13 @@ def __hash__(self) -> int:
         """CaseCitation objects that have the same volume, reporter, and page
         are considered equivalent, unless the citation is missing a page, in
         which case the object's hash will be unique for safety.
+
+        self.groups for CaseCitation objects usually contains these keys:
+        - page (guaranteed here: https://github.com/freelawproject/reporters-db/blob/main/tests.py#L129)  # noqa: E501
+        - reporter (guaranteed here: https://github.com/freelawproject/reporters-db/blob/main/tests.py#L129)  # noqa: E501
+        - volume (almost always present, but some tax court citations don't have volumes)  # noqa: E501
+        - reporter_nominative (sometimes)
+        - volume_nominative (sometimes)
         """
         if self.groups["page"] is None:
             return id(self)