Merge pull request #330 from cmc333333/37-42-fixes
Fixes for 37 CFR 42 history
cmc333333 authored Dec 2, 2016
2 parents c682688 + da0a3d4 commit e96412e
Showing 18 changed files with 173 additions and 111 deletions.
regparser/commands/fill_with_rules.py (15 additions & 2 deletions)

@@ -10,15 +10,28 @@
 logger = logging.getLogger(__name__)
 
 
+def drop_initial_orphans(versions_with_parents, existing):
+    """We can only build a version if there's a complete tree before it to
+    build from. As such, we need to drop any orphaned versions from the
+    beginning of our list"""
+    for idx, (version, parent) in enumerate(versions_with_parents):
+        if version.identifier in existing:
+            return versions_with_parents[idx:]
+        logger.warning("No previous annual edition to version %s; ignoring",
+                       version.identifier)
+    return []
+
+
 def dependencies(tree_dir, version_dir, versions_with_parents):
     """Set up the dependency graph for this regulation. First calculates
     "gaps" -- versions for which there is no existing tree. In this
     calculation, we ignore the first version, as we won't be able to build
     anything for it. Add dependencies for any gaps, tying the output tree to
     the preceding tree, the version info and the parsed rule"""
     existing_tree_ids = set(tree.path[-1] for tree in tree_dir.sub_entries())
-    versions_with_parents = versions_with_parents[1:]
-    gaps = [(version, parent) for (version, parent) in versions_with_parents
+    version_pairs = drop_initial_orphans(
+        versions_with_parents, existing_tree_ids)
+    gaps = [(version, parent) for (version, parent) in version_pairs
             if version.identifier not in existing_tree_ids]
 
     deps = dependency.Graph()
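To make the new helper concrete, here is a small self-contained sketch of its behavior (not part of the commit; SimpleVersion is a hypothetical stand-in for regparser's Version, of which only the identifier field matters here):

import logging
from collections import namedtuple

logger = logging.getLogger(__name__)
SimpleVersion = namedtuple('SimpleVersion', ['identifier'])


def drop_initial_orphans(versions_with_parents, existing):
    # Same logic as the new helper above: warn about versions at the head
    # of the list that have no tree to build from, keep the rest.
    for idx, (version, parent) in enumerate(versions_with_parents):
        if version.identifier in existing:
            return versions_with_parents[idx:]
        logger.warning("No previous annual edition to version %s; ignoring",
                       version.identifier)
    return []


pairs = [(SimpleVersion(vid), None) for vid in 'abcd']
# 'a' and 'b' are orphans (no existing tree); building can start at 'c'
kept = drop_initial_orphans(pairs, existing={'c'})
assert [v.identifier for v, _ in kept] == ['c', 'd']

Note that only orphans at the head of the list are dropped; gaps after the first buildable version remain, and those are exactly what dependencies() wires into the dependency graph.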
regparser/commands/import_notice.py (3 additions & 2 deletions)

@@ -28,8 +28,9 @@ def has_requirements(notice_xml):
         logger.error("Missing publish date (eregs-published-date attribute "
                      "on the DATES tag)")
     elif not notice_xml.fr_volume:
-        logger.error("Missing volume (eregs-fr-volume attribute on the first "
-                     "PRTPAGE tag)")
+        logger.error("Missing volume (fr-volume attribute on root)")
+    elif not notice_xml.start_page:
+        logger.error("Missing start page (fr-start-page attribute on root)")
     else:
         return True
 
regparser/commands/preprocess_notice.py (4 additions & 0 deletions)

@@ -50,16 +50,20 @@ def preprocess_notice(document_number):
         "effective_on",
         "cfr_references",
         "comments_close_on",
+        "end_page",
         "full_text_xml_url",
         "html_url",
         "publication_date",
         "regulation_id_numbers",
+        "start_page",
         "volume"
     ])
     notice_xmls = list(notice_xmls_for_url(meta['full_text_xml_url']))
     for notice_xml in notice_xmls:
         notice_xml.published = meta['publication_date']
         notice_xml.fr_volume = meta['volume']
+        notice_xml.start_page = meta['start_page']
+        notice_xml.end_page = meta['end_page']
         if meta.get('html_url'):
             notice_xml.fr_html_url = meta['html_url']
         if meta.get("comments_close_on"):
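For reference, the metadata dict consumed above comes from the federalregister.gov API; a plausible shape for the fields requested here (values are illustrative only, not from the commit):

# Illustrative only -- plausible values for the requested fields
meta = {
    "publication_date": "2016-12-02",
    "volume": 81,
    "start_page": 86952,
    "end_page": 86963,
    "full_text_xml_url": "https://www.federalregister.gov/example.xml",
}

The three page/volume fields land on the notice's root element as fr-volume, fr-start-page and fr-end-page attributes via the _root_property setters added in regparser/notice/xml.py below.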
regparser/commands/versions.py (7 additions & 12 deletions)

@@ -58,21 +58,16 @@ def generate_dependencies(version_dir, version_ids, delays_by_version):
     return deps
 
 
-class InvalidEffectiveDate(Exception):
-    def __init__(self, version_id):
-        self.version_id = version_id
-        super(InvalidEffectiveDate, self).__init__(
-            "No effective date for this rule: {}".format(version_id))
-
-
 def write_to_disk(xml, version_entry, delay=None):
     """Serialize a Version instance to disk"""
     effective = xml.effective if delay is None else delay.until
-    if not effective:
-        raise InvalidEffectiveDate(xml.version_id)
-    version = Version(identifier=xml.version_id, effective=effective,
-                      published=xml.published)
-    version_entry.write(version)
+    if effective:
+        version = Version(identifier=xml.version_id, effective=effective,
+                          published=xml.published)
+        version_entry.write(version)
+    else:
+        logger.warning("No effective date for this rule: %s. Skipping",
+                       xml.version_id)
 
 
 def write_if_needed(cfr_title, cfr_part, version_ids, xmls, delays_by_version):
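A quick sketch of the date resolution performed above (FakeXML here is a hypothetical stand-in, mirroring only the fields write_to_disk touches): an undelayed rule keeps its own effective date, a delayed rule takes the delaying rule's until date, and a rule with neither is now logged and skipped rather than raising:

from collections import namedtuple
from datetime import date

Delay = namedtuple('Delay', ['by', 'until'])
FakeXML = namedtuple('FakeXML', ['version_id', 'effective', 'published'])


def resolve_effective(xml, delay=None):
    # Mirrors the first line of write_to_disk above
    return xml.effective if delay is None else delay.until


final_rule = FakeXML('111', date(2002, 2, 2), date(2002, 1, 1))
proposed = FakeXML('222', None, date(2003, 1, 1))
delay = Delay(by='333', until=date(2004, 4, 4))

assert resolve_effective(final_rule) == date(2002, 2, 2)
assert resolve_effective(final_rule, delay) == date(2004, 4, 4)
assert resolve_effective(proposed) is None  # would now warn and skip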
regparser/notice/fake.py (2 additions & 1 deletion)

@@ -7,7 +7,6 @@
 def build(doc_number, effective_on, cfr_title, cfr_part):
     notice_xml = NoticeXML(etree.fromstring("""
     <ROOT>
-        <PRTPAGE P="1" />
         <AGENCY></AGENCY>
         <SUBJECT></SUBJECT>
     </ROOT>

@@ -16,5 +15,7 @@ def build(doc_number, effective_on, cfr_title, cfr_part):
     notice_xml.version_id = doc_number
     notice_xml.effective = effective_on
     notice_xml.published = effective_on
+    notice_xml.start_page = 0
+    notice_xml.end_page = 0
     notice_xml.cfr_refs = [TitlePartsRef(cfr_title, [cfr_part])]
     return notice_xml
regparser/notice/xml.py (10 additions & 22 deletions)

@@ -50,10 +50,14 @@ def add_children(el, children):
     return el
 
 
-def _root_property(attrib):
-    """We add multiple attributes to the NoticeXML's root element"""
+def _root_property(attrib, transform=None):
+    """We add multiple attributes to the NoticeXML's root element. Account for
+    data transforms (e.g. to an integer)"""
     def getter(self):
-        return self.xml.attrib.get(attrib)
+        value = self.xml.attrib.get(attrib)
+        if transform and value is not None:
+            return transform(value)
+        return value
 
     def setter(self, value):
         self.xml.attrib[attrib] = str(value)

@@ -330,25 +334,6 @@ def published(self):
     def published(self, value):
         self._set_date_attr('published', value)
 
-    @property
-    def fr_volume(self):
-        value = self.xpath(".//PRTPAGE")[0].attrib.get('eregs-fr-volume')
-        if value:
-            return int(value)
-
-    @fr_volume.setter
-    def fr_volume(self, value):
-        for prtpage in self.xpath(".//PRTPAGE"):
-            prtpage.attrib['eregs-fr-volume'] = str(value)
-
-    @property
-    def start_page(self):
-        return int(self.xpath(".//PRTPAGE")[0].attrib["P"]) - 1
-
-    @property
-    def end_page(self):
-        return int(self.xpath(".//PRTPAGE")[-1].attrib["P"])
-
     @cached_property  # rather expensive operation, so cache results
     def amendments(self):
         return fetch_amendments(self.xml)

@@ -399,6 +384,9 @@ def supporting_documents(self, value):
     fr_html_url = _root_property('fr-html-url')
     comment_doc_id = _root_property('eregs-comment-doc-id')
     primary_docket = _root_property('eregs-primary-docket')
+    fr_volume = _root_property('fr-volume', int)
+    start_page = _root_property('fr-start-page', int)
+    end_page = _root_property('fr-end-page', int)
 
     def as_dict(self):
         """We use JSON to represent notices in the API. This converts the
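The property factory above can be exercised standalone. A minimal sketch (the property(getter, setter) wiring is assumed, since the diff cuts off before the factory's return; MiniNotice is a hypothetical stand-in for NoticeXML):

from lxml import etree


def _root_property(attrib, transform=None):
    def getter(self):
        value = self.xml.attrib.get(attrib)
        if transform and value is not None:
            return transform(value)
        return value

    def setter(self, value):
        self.xml.attrib[attrib] = str(value)

    return property(getter, setter)  # assumed wiring, not shown in the diff


class MiniNotice(object):
    def __init__(self, xml):
        self.xml = xml

    fr_volume = _root_property('fr-volume', int)


notice = MiniNotice(etree.fromstring('<ROOT/>'))
notice.fr_volume = 124           # stored on the root as fr-volume="124"
assert notice.fr_volume == 124   # transformed back to an int on read

This is also why the old PRTPAGE-based fr_volume/start_page/end_page properties could be deleted: the values now live as plain root attributes, set during preprocessing.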
regparser/tree/depth/derive.py (0 additions & 1 deletion)

@@ -131,7 +131,6 @@ def derive_depths(original_markers, additional_constraints=None):
     # @todo: There's probably efficiency gains to making these rules over
     # prefixes (see above) rather than over the whole collection at once
     problem.addConstraint(rules.same_parent_same_type, all_vars)
-    problem.addConstraint(rules.stars_occupy_space, all_vars)
 
     for constraint in additional_constraints:
         constraint(problem.addConstraint, all_vars)
regparser/tree/depth/optional_rules.py (38 additions & 1 deletion)

@@ -7,7 +7,7 @@
 from constraint import InSetConstraint
 
 from regparser.tree.depth import markers
-from regparser.tree.depth.rules import ancestors
+from regparser.tree.depth.rules import ancestors, _level_and_children
 
 
 def depth_type_inverses(constrain, all_variables):

@@ -43,6 +43,43 @@ def inner(prev_typ, prev_depth, typ, depth):
         constrain(inner, [prev_typ, prev_depth, typ, depth])
 
 
+def stars_occupy_space(constrain, all_variables):
+    """Star markers can't be ignored in sequence, so 1, *, 2 doesn't make
+    sense for a single level, unless it's an inline star. In the inline
+    case, we can think of it as 1, intro-text-to-1, 2"""
+
+    def per_level(elements):
+        level, grouped_children = _level_and_children(elements)
+
+        if not level:
+            return True     # Base Case
+
+        last_idx, last_typ = -1, None
+        for typ, idx, _ in level:
+            if typ == markers.stars:
+                if idx == 0:    # STARS_TAG, not INLINE_STARS
+                    last_idx += 1
+            # sequences must be increasing. Exception for markerless
+            elif (last_idx >= idx and
+                    markers.markerless not in (last_typ, typ)):
+                return False
+            else:
+                last_idx = idx
+                last_typ = typ
+
+        for children in grouped_children:   # Recurse
+            if not per_level(children):
+                return False
+        return True
+
+    def inner(*all_vars):
+        elements = [tuple(all_vars[i:i + 3])
+                    for i in range(0, len(all_vars), 3)]
+        return per_level(elements)
+
+    constrain(inner, all_variables)
+
+
 def limit_paragraph_types(*p_types):
     """Constraint paragraphs to a limited set of paragraph types. This can
     reduce the search space if we know (for example) that the text comes from
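The core sequencing check can be illustrated standalone. In the solver, each paragraph is a (type, index, depth) triple, where index is the marker's position within its type ('1' is index 0, '2' is index 1, and so on). A simplified sketch follows (the real per_level also exempts markerless paragraphs and recurses into child levels; INLINE_STARS sitting at index 1 is an assumption implied by the idx == 0 check):

STARS, INTS = 'stars', 'ints'   # stand-ins for the real marker types


def sequence_ok(level):
    last_idx = -1
    for typ, idx in level:
        if typ == STARS:
            if idx == 0:        # STARS_TAG occupies the next slot...
                last_idx += 1
            # ...while INLINE_STARS is invisible to the sequence
        elif last_idx >= idx:   # otherwise marker indices must increase
            return False
        else:
            last_idx = idx
    return True


# "1", STARS_TAG, "3": the star stands in for the elided "2" -- accepted
assert sequence_ok([(INTS, 0), (STARS, 0), (INTS, 2)])
# "1", STARS_TAG, "2": the star already occupied "2"'s slot -- rejected
assert not sequence_ok([(INTS, 0), (STARS, 0), (INTS, 1)])
# "1", INLINE_STARS, "2": intro text to "1", so "2" may follow -- accepted
assert sequence_ok([(INTS, 0), (STARS, 1), (INTS, 1)])

Because derive.py above no longer applies this rule unconditionally, it becomes opt-in per processor; reg_text.py below opts in explicitly via additional_constraints.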
regparser/tree/depth/rules.py (0 additions & 33 deletions)

@@ -140,39 +140,6 @@ def per_level(elements, parent_type=None):
     return per_level(elements)
 
 
-def stars_occupy_space(*all_vars):
-    """Star markers can't be ignored in sequence, so 1, *, 2 doesn't make
-    sense for a single level, unless it's an inline star. In the inline
-    case, we can think of it as 1, intro-text-to-1, 2"""
-    elements = [tuple(all_vars[i:i + 3]) for i in range(0, len(all_vars), 3)]
-
-    def per_level(elements):
-        level, grouped_children = _level_and_children(elements)
-
-        if not level:
-            return True     # Base Case
-
-        last_idx, last_typ = -1, None
-        for typ, idx, _ in level:
-            if typ == markers.stars:
-                if idx == 0:    # STARS_TAG, not INLINE_STARS
-                    last_idx += 1
-            # sequences must be increasing. Exception for markerless
-            elif (last_idx >= idx and
-                    markers.markerless not in (last_typ, typ)):
-                return False
-            else:
-                last_idx = idx
-                last_typ = typ
-
-        for children in grouped_children:   # Recurse
-            if not per_level(children):
-                return False
-        return True
-
-    return per_level(elements)
-
-
 def depth_type_order(order):
     """Create a function which constrains paragraphs depths to a particular
     type sequence. For example, we know a priori what regtext and
regparser/tree/xml_parser/reg_text.py (2 additions & 1 deletion)

@@ -374,7 +374,8 @@ class RegtextParagraphProcessor(paragraph_processor.ParagraphProcessor):
     def additional_constraints(self):
        return [
            optional_rules.depth_type_inverses,
-           optional_rules.limit_sequence_gap(3)
+           optional_rules.limit_sequence_gap(3),
+           optional_rules.stars_occupy_space,
        ] + self.relaxed_constraints()
 
    def relaxed_constraints(self):
setup.py (1 addition & 1 deletion)

@@ -2,7 +2,7 @@
 
 setup(
     name="regparser",
-    version="4.0.0",
+    version="4.1.0",
     packages=find_packages(),
     classifiers=[
         'License :: Public Domain',
tests/commands_fill_with_rules_tests.py (11 additions & 0 deletions)

@@ -103,3 +103,14 @@ def test_process(self, Notice, compile_regulation):
         self.assertEqual(changes, {
             "1000-2-b": ["2b changes"], "1000-2-c": ["2c changes"],
             "1000-4-a": ["4a changes"]})
+
+
+def test_drop_initial_orphan_versions():
+    version_list = [Version(letter, None, None) for letter in 'abcdef']
+    version_pairs = list(zip(version_list, [None] + version_list[1:]))
+    existing = {'c', 'e'}
+
+    result = fill_with_rules.drop_initial_orphans(version_pairs, existing)
+    result = [pair[0].identifier for pair in result]
+
+    assert result == ['c', 'd', 'e', 'f']
tests/commands_import_notice_tests.py (25 additions & 19 deletions)

@@ -1,27 +1,33 @@
-from unittest import TestCase
-
 from regparser.commands import import_notice
 from regparser.notice.xml import NoticeXML
 from regparser.test_utils.xml_builder import XMLBuilder
 
 
-class CommandsImportNoticeTests(TestCase):
-    def test_has_requirments(self):
-        """Validate that certain attributes are required"""
-        with XMLBuilder("ROOT", **{"eregs-version-id": "vvv"}) as ctx:
-            ctx.PRTPAGE(P=44, **{"eregs-fr-volume": "124"})
-            ctx.DATES(**{"eregs-published-date": "2005-05-05"})
-        notice_xml = NoticeXML(ctx.xml_copy())
-        self.assertTrue(import_notice.has_requirements(notice_xml))
+def test_has_requirments():
+    """Validate that certain attributes are required"""
+    root_attrs = {
+        "eregs-version-id": "vvv",
+        "fr-volume": 124,
+        "fr-start-page": 44,
+        "fr-end-page": 55
+    }
+    with XMLBuilder("ROOT", **root_attrs) as ctx:
+        ctx.DATES(**{"eregs-published-date": "2005-05-05"})
+    notice_xml = NoticeXML(ctx.xml_copy())
+    assert import_notice.has_requirements(notice_xml)
 
-        notice_xml = NoticeXML(ctx.xml_copy())
-        del notice_xml.xml.attrib['eregs-version-id']
-        self.assertFalse(import_notice.has_requirements(notice_xml))
+    notice_xml = NoticeXML(ctx.xml_copy())
+    del notice_xml.xml.attrib['eregs-version-id']
+    assert not import_notice.has_requirements(notice_xml)
 
-        notice_xml = NoticeXML(ctx.xml_copy())
-        del notice_xml.xml.xpath('//PRTPAGE')[0].attrib['eregs-fr-volume']
-        self.assertFalse(import_notice.has_requirements(notice_xml))
+    notice_xml = NoticeXML(ctx.xml_copy())
+    del notice_xml.xml.attrib['fr-volume']
+    assert not import_notice.has_requirements(notice_xml)
+
+    notice_xml = NoticeXML(ctx.xml_copy())
+    del notice_xml.xml.attrib['fr-start-page']
+    assert not import_notice.has_requirements(notice_xml)
 
-        notice_xml = NoticeXML(ctx.xml_copy())
-        del notice_xml.xml.xpath('//DATES')[0].attrib['eregs-published-date']
-        self.assertFalse(import_notice.has_requirements(notice_xml))
+    notice_xml = NoticeXML(ctx.xml_copy())
+    del notice_xml.xml.xpath('//DATES')[0].attrib['eregs-published-date']
+    assert not import_notice.has_requirements(notice_xml)
tests/commands_preprocess_notice_tests.py (3 additions & 1 deletion)

@@ -31,7 +31,9 @@ def expect_common_json(self, **kwargs):
         params = {'effective_on': '2008-08-08',
                   'publication_date': '2007-07-07',
                   'full_text_xml_url': 'some://url',
-                  'volume': 45}
+                  'volume': 45,
+                  'start_page': 111,
+                  'end_page': 222}
         params.update(kwargs)
         self.expect_json_http(params, uri=re.compile('.*federalregister.*'))
         # No data from regs.gov
tests/commands_versions_tests.py (12 additions & 8 deletions)

@@ -102,14 +102,6 @@ def test_write_to_disk(self):
         self.assertEqual((path / '111').read().effective, date(2002, 2, 2))
         self.assertEqual((path / '222').read().effective, date(2004, 4, 4))
 
-    def test_write_to_disk_no_effective(self):
-        """If a version is somehow associated with a proposed rule (or a final
-        rule has been misparsed), we should get an exception"""
-        xml = Mock()
-        xml.effective = None
-        with self.assertRaises(versions.InvalidEffectiveDate):
-            versions.write_to_disk(xml, entry.Version('12', '1000', '11'))
-
     @patch('regparser.commands.versions.write_to_disk')
     def test_write_if_needed_raises_exception(self, write_to_disk):
         """If an input file is missing, this raises an exception"""

@@ -158,3 +150,15 @@ def test_write_if_needed_delays(self, write_to_disk):
             'title', 'part', ['111'], {'111': 'xml111'},
             {'111': versions.Delay('222', 'until-date')})
         self.assertTrue(write_to_disk.called)
+
+
+def test_write_to_disk_no_effective(monkeypatch):
+    """If a version is somehow associated with a proposed rule (or a final
+    rule has been misparsed), we should get a warning"""
+    xml = Mock(effective=None, version_id='vv123')
+    monkeypatch.setattr(versions, 'logger', Mock())
+
+    versions.write_to_disk(xml, entry.Version('12', '1000', '11'))
+
+    assert versions.logger.warning.called
+    assert 'vv123' in versions.logger.warning.call_args[0]
(Diffs for the remaining 3 of the 18 changed files are not shown.)
