From c913ccc6b520af66925f095f0a469b35d658c894 Mon Sep 17 00:00:00 2001 From: Andy Lo Date: Sat, 23 Jul 2016 01:08:31 -0400 Subject: [PATCH 1/3] CA: Added committee abbreviations to recognized list. --- openstates/ca/bills.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/openstates/ca/bills.py b/openstates/ca/bills.py index 647f57aad7..0eb5f660a6 100644 --- a/openstates/ca/bills.py +++ b/openstates/ca/bills.py @@ -14,11 +14,11 @@ from .models import CABill from .actions import CACategorizer - SPONSOR_TYPES = {'LEAD_AUTHOR': 'primary', 'COAUTHOR': 'cosponsor', 'PRINCIPAL_COAUTHOR': 'primary'} + def clean_title(s): # replace smart quote characters s = s.replace(u'\xe2\u20ac\u201c', '-') @@ -38,7 +38,6 @@ def clean_title(s): # Committee codes used in action chamber text. committee_data_upper = [ - #('CZ09', 'Standing Committee on Floor Analyses'), ('Standing Committee on Governance and Finance', 'CS73', [u'Gov. & F.']), @@ -104,13 +103,15 @@ def clean_title(s): ('Standing Committee on Rules', 'CS58', [u'RLS.']), - ] + + ('Extraordinary Committee on Transportation and Infrastructure Development', + 'CS67', [r'T. & I.D.']), +] committee_data_lower = [ - # LOWER ('Standing Committee on Rules', 'CX20', [u'RLS.']), - #('assembly floor analysis', 'CZ01', []), + ('Standing Committee on Revenue and Taxation', 'CX19', [u'REV. & TAX']), @@ -195,8 +196,17 @@ def clean_title(s): 'CX02', [u'A. & A.R.']), ('Standing Committee on Budget', - 'CX29', [u'BUDGET.']) - ] + 'CX29', [u'BUDGET.']), + + ('Standing Committee on Privacy and Consumer Protection', + 'CX32', [u'P. & C.P.']), + + ('Extraordinary Committee on Finance', + 'CX35', [u'FINANCE']), + + ('Extraordinary Committee on Public Health and Developmental Services', + 'CX30', [u'P.H. 
& D.S.']), +] committee_data_both = committee_data_upper + committee_data_lower @@ -226,13 +236,15 @@ def get_committee_abbr_data(): def get_committee_name_regex(): + # Builds a list of all committee abbreviations. _committee_abbrs = map(operator.itemgetter(2), committee_data_both) _committee_abbrs = itertools.chain.from_iterable(_committee_abbrs) _committee_abbrs = sorted(_committee_abbrs, reverse=True, key=len) _committee_abbrs = map(slugify, _committee_abbrs) - #_committee_abbrs = map(re.escape, _committee_abbrs) + _committee_abbr_regex = ['%s' % '[ .,]*'.join(list(abbr)) for abbr in _committee_abbrs] _committee_abbr_regex = re.compile('Com\.\s+on\s+(%s)\.?' % '|'.join(_committee_abbr_regex)) + return _committee_abbr_regex From 09cd33dfc97c76cbf73698f2e2b4dc55ccf69169 Mon Sep 17 00:00:00 2001 From: Andy Lo Date: Sat, 23 Jul 2016 04:11:30 -0400 Subject: [PATCH 2/3] CA: Improved committee abbreviation parsing. Now supports detecting multiple committee abbreviations in bill actions. --- openstates/ca/bills.py | 122 +++++++++++++++++++++-------------------- 1 file changed, 63 insertions(+), 59 deletions(-) diff --git a/openstates/ca/bills.py b/openstates/ca/bills.py index 0eb5f660a6..750b3f9890 100644 --- a/openstates/ca/bills.py +++ b/openstates/ca/bills.py @@ -39,164 +39,165 @@ def clean_title(s): # Committee codes used in action chamber text. committee_data_upper = [ ('Standing Committee on Governance and Finance', - 'CS73', [u'Gov. & F.']), + 'CS73', [u'GOV. & F.', u'Gov. & F.']), ('Standing Committee on Energy, Utilities and Communications', - 'CS71', [u'E. U. & C.', u'E., U. & C', 'E., U., & C.']), + 'CS71', [u'E., U., & C.']), ('Standing Committee on Education', - 'CS44', [u'ED.']), + 'CS44', [u'ED.']), ('Standing Committee on Appropriations', - 'CS61', [u'APPR.']), + 'CS61', [u'APPR.']), ('Standing Committee on Labor and Industrial Relations', - 'CS51', [u'L. 
& I.R.']), ('Standing Committee on Elections and Constitutional Amendments', - 'CS45', [u'E. & C.A.']), + 'CS45', [u'E. & C.A.']), ('Standing Committee on Environmental Quality', - 'CS64', [u'E.Q.']), + 'CS64', [u'E.Q.']), ('Standing Committee on Natural Resources And Water', - 'CS55', [u'N.R. & W.']), + 'CS55', [u'N.R. & W.']), ('Standing Committee on Public Employment and Retirement', - 'CS56', [u'P.E. & R.']), + 'CS56', [u'P.E. & R.']), ('Standing Committee on Governmental Organization', - 'CS48', [u'G.O.']), + 'CS48', [u'G.O.']), ('Standing Committee on Insurance', - 'CS70', [u'INS.']), + 'CS70', [u'INS.']), ('Standing Committee on Public Safety', - 'CS72', [u'PUB. S.']), + 'CS72', [u'PUB. S.']), ('Standing Committee on Judiciary', - 'CS53', [u'JUD.']), + 'CS53', [u'JUD.']), ('Standing Committee on Health', - 'CS60', [u'HEALTH.']), + 'CS60', [u'HEALTH']), ('Standing Committee on Transportation and Housing', - 'CS59', [u'T. & H.']), + 'CS59', [u'T. & H.']), ('Standing Committee on Business, Professions and Economic Development', - 'CS42', [u'B., P. & E.D.']), + 'CS42', [u'B., P. & E.D.']), ('Standing Committee on Agriculture', - 'CS40', [u'AGRI.']), + 'CS40', [u'AGRI.']), ('Standing Committee on Banking and Financial Institutions', - 'CS69', [u'B. & F.I.']), + 'CS69', [u'B. & F.I.']), ('Standing Committee on Veterans Affairs', - 'CS66', [u'V.A.']), + 'CS66', [u'V.A.']), ('Standing Committee on Budget and Fiscal Review', - 'CS62', [u'B. & F.R.']), + 'CS62', [u'B. & F.R.']), ('Standing Committee on Human Services', - 'CS74', [u'HUM. S.', u'HUMAN S.']), + 'CS74', [u'HUM. S.', u'HUMAN S.']), ('Standing Committee on Rules', - 'CS58', [u'RLS.']), + 'CS58', [u'RLS.']), ('Extraordinary Committee on Transportation and Infrastructure Development', - 'CS67', [r'T. & I.D.']), + 'CS67', [u'T. & I.D.']), ] committee_data_lower = [ ('Standing Committee on Rules', - 'CX20', [u'RLS.']), + 'CX20', [u'RLS.']), ('Standing Committee on Revenue and Taxation', - 'CX19', [u'REV. 
& TAX']), + 'CX19', [u'REV. & TAX']), ('Standing Committee on Natural Resources', - 'CX16', [u'NAT. RES.']), + 'CX16', [u'NAT. RES.']), ('Standing Committee on Appropriations', - 'CX25', [u'APPR.']), + 'CX25', [u'APPR.']), ('Standing Committee on Insurance', - 'CX28', ['INS.']), + 'CX28', [u'INS.']), ('Standing Committee on Utilities and Commerce', - 'CX23', [u'U. & C.']), + 'CX23', [u'U. & C.']), ('Standing Committee on Education', - 'CX03', [u'ED.']), + 'CX03', [u'ED.']), ('Standing Committee on Public Safety', - 'CX18', [u'PUB. S.']), + 'CX18', [u'PUB. S.']), ('Standing Committee on Elections and Redistricting', - 'CX04', [u'E. & R.']), + 'CX04', [u'E. & R.']), ('Standing Committee on Judiciary', - 'CX13', [u'JUD.', 'Jud.']), + 'CX13', [u'JUD.']), + ('Standing Committee on Higher Education', - 'CX09', [u'HIGHER ED.']), + 'CX09', [u'HIGHER ED.']), ('Standing Committee on Health', - 'CX08', [u'HEALTH']), + 'CX08', [u'HEALTH']), ('Standing Committee on Human Services', - 'CX11', [u'HUM. S.', u'HUMAN S.']), + 'CX11', [u'HUM. S.', u'HUMAN S.']), ('Standing Committee on Arts, Entertainment, Sports, Tourism, and Internet Media', - 'CX37', [u'A.,E.,S.,T., & I.M.']), + 'CX37', [u'A., E., S., T., & I.M.']), ('Standing Committee on Transportation', - 'CX22', [u'TRANS.']), + 'CX22', [u'TRANS.']), ('Standing Committee on Business, Professions and Consumer Protection', - 'CX33', [u'B.,P. & C.P.', 'B., P. & C.P.', u'B. & P.']), + 'CX33', [u'B., P., & C.P.', u'B. & P.']), ('Standing Committee on Water, Parks and Wildlife', - 'CX24', [u'W., P. & W']), + 'CX24', [u'W., P., & W.']), ('Standing Committee on Local Government', - 'CX15', [u'L. GOV.', 'L. Gov.']), + 'CX15', [u'L. GOV.', u'L. Gov.']), ('Standing Committee on Aging and Long Term Care', - 'CX31', [u'AGING & L.T.C.']), + 'CX31', [u'AGING & L.T.C.']), ('Standing Committee on Labor and Employment', - 'CX14', [u'L. & E.']), + 'CX14', [u'L. 
& E.']), ('Standing Committee on Governmental Organization', - 'CX07', [u'G.O.']), + 'CX07', [u'G.O.']), ('Standing Committee on Public Employees, Retirement and Social Security', - 'CX17', [u'P.E., R. & S.S.']), + 'CX17', [u'P.E., R., & S.S.']), ('Standing Committee on Veterans Affairs', - 'CX38', [u'V.A.']), + 'CX38', [u'V.A.']), ('Standing Committee on Housing and Community Development', - 'CX10', [u'H. & C.D.']), + 'CX10', [u'H. & C.D.']), ('Standing Committee on Environmental Safety and Toxic Materials', - 'CX05', [u'E.S. & T.M.']), + 'CX05', [u'E.S. & T.M.']), ('Standing Committee on Agriculture', - 'CX01', [u'AGRI.']), + 'CX01', [u'AGRI.']), ('Standing Committee on Banking and Finance', - 'CX27', [u'B. & F.']), + 'CX27', [u'B. & F.']), ('Standing Committee on Jobs, Economic Development and the Economy', - 'CX34', [u'J., E.D. & E.']), + 'CX34', [u'J., E.D., & E.']), ('Standing Committee on Accountability and Administrative Review', - 'CX02', [u'A. & A.R.']), + 'CX02', [u'A. & A.R.']), ('Standing Committee on Budget', - 'CX29', [u'BUDGET.']), + 'CX29', [u'BUDGET']), ('Standing Committee on Privacy and Consumer Protection', 'CX32', [u'P. & C.P.']), @@ -232,6 +233,7 @@ def get_committee_abbr_data(): committee_data = {'upper': _committee_abbr_to_name_upper, 'lower': _committee_abbr_to_name_lower} + return committee_data @@ -240,10 +242,10 @@ def get_committee_name_regex(): _committee_abbrs = map(operator.itemgetter(2), committee_data_both) _committee_abbrs = itertools.chain.from_iterable(_committee_abbrs) _committee_abbrs = sorted(_committee_abbrs, reverse=True, key=len) - _committee_abbrs = map(slugify, _committee_abbrs) - _committee_abbr_regex = ['%s' % '[ .,]*'.join(list(abbr)) for abbr in _committee_abbrs] - _committee_abbr_regex = re.compile('Com\.\s+on\s+(%s)\.?' 
% '|'.join(_committee_abbr_regex)) + _committee_abbr_regex = ['%s' % '[\s,]*'.join(abbr.replace(',', '') + .split(' ')) for abbr in _committee_abbrs] + _committee_abbr_regex = re.compile('(%s)' % '|'.join(_committee_abbr_regex)) return _committee_abbr_regex @@ -463,7 +465,7 @@ def replacer(matchobj): kwargs = attrs matched_abbrs = committee_abbr_regex.findall(action.action) - if 'Com. on' in action.action and not matched_abbrs: + if re.search(r'Com[s]?. on', action.action) and not matched_abbrs: msg = 'Failed to extract committee abbr from %r.' self.logger.warning(msg % action.action) @@ -472,13 +474,12 @@ def replacer(matchobj): for abbr in matched_abbrs: try: name = self.committee_abbr_to_name(chamber, abbr) + committees.append(name) except KeyError: msg = ('Mapping contains no committee name for ' 'abbreviation %r. Action text was %r.') args = (abbr, action.action) raise KeyError(msg % args) - else: - committees.append(name) committees = filter(None, committees) kwargs['committees'] = committees @@ -490,8 +491,11 @@ def replacer(matchobj): assert len(committees) == len(matched_abbrs) for committee, abbr in zip(committees, matched_abbrs): + act_str = act_str.replace('Coms. on ', '') act_str = act_str.replace('Com. on ' + abbr, committee) act_str = act_str.replace(abbr, committee) + if not act_str.endswith('.'): + act_str = act_str + '.' changed = False for string in ['upper', 'lower', 'joint']: From 86ba91bdd5e314b986c8ba959722052dd972ed72 Mon Sep 17 00:00:00 2001 From: Andy Lo Date: Sat, 23 Jul 2016 12:05:55 -0400 Subject: [PATCH 3/3] Changed variable name to be more elucidating. --- openstates/ca/bills.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/openstates/ca/bills.py b/openstates/ca/bills.py index 750b3f9890..416bae747b 100644 --- a/openstates/ca/bills.py +++ b/openstates/ca/bills.py @@ -497,14 +497,16 @@ def replacer(matchobj): if not act_str.endswith('.'): act_str = act_str + '.' 
+ # Determine which chamber the action originated from. changed = False - for string in ['upper', 'lower', 'joint']: - if actor.startswith(string): - actor = string + for committee_chamber in ['upper', 'lower', 'joint']: + if actor.startswith(committee_chamber): + actor = committee_chamber changed = True break if not changed: actor = 'other' + if actor != action.actor: actor_info = kwargs.get('actor_info', {}) actor_info['details'] = action.actor