Skip to content

Commit

Permalink
CA: Improved committee abbreviation parsing.
Browse files Browse the repository at this point in the history
Now supports detecting multiple committee abbreviations in bill actions.
  • Loading branch information
Andy Lo committed Jul 23, 2016
1 parent c913ccc commit 09cd33d
Showing 1 changed file with 63 additions and 59 deletions.
122 changes: 63 additions & 59 deletions openstates/ca/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,164 +39,165 @@ def clean_title(s):
# Committee codes used in action chamber text.
committee_data_upper = [
('Standing Committee on Governance and Finance',
'CS73', [u'Gov. & F.']),
'CS73', [u'GOV. & F.', u'Gov. & F.']),

('Standing Committee on Energy, Utilities and Communications',
'CS71', [u'E. U. & C.', u'E., U. & C', 'E., U., & C.']),
'CS71', [u'E., U., & C.']),

('Standing Committee on Education',
'CS44', [u'ED.']),
'CS44', [u'ED.']),

('Standing Committee on Appropriations',
'CS61', [u'APPR.']),
'CS61', [u'APPR.']),

('Standing Committee on Labor and Industrial Relations',
'CS51', [u'L. & I.R.']),
'CS51', [u'L. & I.R.']),

('Standing Committee on Elections and Constitutional Amendments',
'CS45', [u'E. & C.A.']),
'CS45', [u'E. & C.A.']),

('Standing Committee on Environmental Quality',
'CS64', [u'E.Q.']),
'CS64', [u'E.Q.']),

('Standing Committee on Natural Resources And Water',
'CS55', [u'N.R. & W.']),
'CS55', [u'N.R. & W.']),

('Standing Committee on Public Employment and Retirement',
'CS56', [u'P.E. & R.']),
'CS56', [u'P.E. & R.']),

('Standing Committee on Governmental Organization',
'CS48', [u'G.O.']),
'CS48', [u'G.O.']),

('Standing Committee on Insurance',
'CS70', [u'INS.']),
'CS70', [u'INS.']),

('Standing Committee on Public Safety',
'CS72', [u'PUB. S.']),
'CS72', [u'PUB. S.']),

('Standing Committee on Judiciary',
'CS53', [u'JUD.']),
'CS53', [u'JUD.']),

('Standing Committee on Health',
'CS60', [u'HEALTH.']),
'CS60', [u'HEALTH']),

('Standing Committee on Transportation and Housing',
'CS59', [u'T. & H.']),
'CS59', [u'T. & H.']),

('Standing Committee on Business, Professions and Economic Development',
'CS42', [u'B., P. & E.D.']),
'CS42', [u'B., P. & E.D.']),

('Standing Committee on Agriculture',
'CS40', [u'AGRI.']),
'CS40', [u'AGRI.']),

('Standing Committee on Banking and Financial Institutions',
'CS69', [u'B. & F.I.']),
'CS69', [u'B. & F.I.']),

('Standing Committee on Veterans Affairs',
'CS66', [u'V.A.']),
'CS66', [u'V.A.']),

('Standing Committee on Budget and Fiscal Review',
'CS62', [u'B. & F.R.']),
'CS62', [u'B. & F.R.']),

('Standing Committee on Human Services',
'CS74', [u'HUM. S.', u'HUMAN S.']),
'CS74', [u'HUM. S.', u'HUMAN S.']),

('Standing Committee on Rules',
'CS58', [u'RLS.']),
'CS58', [u'RLS.']),

('Extraordinary Committee on Transportation and Infrastructure Development',
'CS67', [r'T. & I.D.']),
'CS67', [u'T. & I.D.']),
]

committee_data_lower = [
('Standing Committee on Rules',
'CX20', [u'RLS.']),
'CX20', [u'RLS.']),

('Standing Committee on Revenue and Taxation',
'CX19', [u'REV. & TAX']),
'CX19', [u'REV. & TAX']),

('Standing Committee on Natural Resources',
'CX16', [u'NAT. RES.']),
'CX16', [u'NAT. RES.']),

('Standing Committee on Appropriations',
'CX25', [u'APPR.']),
'CX25', [u'APPR.']),

('Standing Committee on Insurance',
'CX28', ['INS.']),
'CX28', [u'INS.']),

('Standing Committee on Utilities and Commerce',
'CX23', [u'U. & C.']),
'CX23', [u'U. & C.']),

('Standing Committee on Education',
'CX03', [u'ED.']),
'CX03', [u'ED.']),

('Standing Committee on Public Safety',
'CX18', [u'PUB. S.']),
'CX18', [u'PUB. S.']),

('Standing Committee on Elections and Redistricting',
'CX04', [u'E. & R.']),
'CX04', [u'E. & R.']),

('Standing Committee on Judiciary',
'CX13', [u'JUD.', 'Jud.']),
'CX13', [u'JUD.']),

('Standing Committee on Higher Education',
'CX09', [u'HIGHER ED.']),
'CX09', [u'HIGHER ED.']),

('Standing Committee on Health',
'CX08', [u'HEALTH']),
'CX08', [u'HEALTH']),

('Standing Committee on Human Services',
'CX11', [u'HUM. S.', u'HUMAN S.']),
'CX11', [u'HUM. S.', u'HUMAN S.']),

('Standing Committee on Arts, Entertainment, Sports, Tourism, and Internet Media',
'CX37', [u'A.,E.,S.,T., & I.M.']),
'CX37', [u'A., E., S., T., & I.M.']),

('Standing Committee on Transportation',
'CX22', [u'TRANS.']),
'CX22', [u'TRANS.']),

('Standing Committee on Business, Professions and Consumer Protection',
'CX33', [u'B.,P. & C.P.', 'B., P. & C.P.', u'B. & P.']),
'CX33', [u'B., P., & C.P.', u'B. & P.']),

('Standing Committee on Water, Parks and Wildlife',
'CX24', [u'W., P. & W']),
'CX24', [u'W., P., & W.']),

('Standing Committee on Local Government',
'CX15', [u'L. GOV.', 'L. Gov.']),
'CX15', [u'L. GOV.', u'L. Gov.']),

('Standing Committee on Aging and Long Term Care',
'CX31', [u'AGING & L.T.C.']),
'CX31', [u'AGING & L.T.C.']),

('Standing Committee on Labor and Employment',
'CX14', [u'L. & E.']),
'CX14', [u'L. & E.']),

('Standing Committee on Governmental Organization',
'CX07', [u'G.O.']),
'CX07', [u'G.O.']),

('Standing Committee on Public Employees, Retirement and Social Security',
'CX17', [u'P.E., R. & S.S.']),
'CX17', [u'P.E., R., & S.S.']),

('Standing Committee on Veterans Affairs',
'CX38', [u'V.A.']),
'CX38', [u'V.A.']),

('Standing Committee on Housing and Community Development',
'CX10', [u'H. & C.D.']),
'CX10', [u'H. & C.D.']),

('Standing Committee on Environmental Safety and Toxic Materials',
'CX05', [u'E.S. & T.M.']),
'CX05', [u'E.S. & T.M.']),

('Standing Committee on Agriculture',
'CX01', [u'AGRI.']),
'CX01', [u'AGRI.']),

('Standing Committee on Banking and Finance',
'CX27', [u'B. & F.']),
'CX27', [u'B. & F.']),

('Standing Committee on Jobs, Economic Development and the Economy',
'CX34', [u'J., E.D. & E.']),
'CX34', [u'J., E.D., & E.']),

('Standing Committee on Accountability and Administrative Review',
'CX02', [u'A. & A.R.']),
'CX02', [u'A. & A.R.']),

('Standing Committee on Budget',
'CX29', [u'BUDGET.']),
'CX29', [u'BUDGET']),

('Standing Committee on Privacy and Consumer Protection',
'CX32', [u'P. & C.P.']),
Expand Down Expand Up @@ -232,6 +233,7 @@ def get_committee_abbr_data():

committee_data = {'upper': _committee_abbr_to_name_upper,
'lower': _committee_abbr_to_name_lower}

return committee_data


Expand All @@ -240,10 +242,10 @@ def get_committee_name_regex():
_committee_abbrs = map(operator.itemgetter(2), committee_data_both)
_committee_abbrs = itertools.chain.from_iterable(_committee_abbrs)
_committee_abbrs = sorted(_committee_abbrs, reverse=True, key=len)
_committee_abbrs = map(slugify, _committee_abbrs)

_committee_abbr_regex = ['%s' % '[ .,]*'.join(list(abbr)) for abbr in _committee_abbrs]
_committee_abbr_regex = re.compile('Com\.\s+on\s+(%s)\.?' % '|'.join(_committee_abbr_regex))
_committee_abbr_regex = ['%s' % '[\s,]*'.join(abbr.replace(',', '')
.split(' ')) for abbr in _committee_abbrs]
_committee_abbr_regex = re.compile('(%s)' % '|'.join(_committee_abbr_regex))

return _committee_abbr_regex

Expand Down Expand Up @@ -463,7 +465,7 @@ def replacer(matchobj):
kwargs = attrs
matched_abbrs = committee_abbr_regex.findall(action.action)

if 'Com. on' in action.action and not matched_abbrs:
if re.search(r'Com[s]?. on', action.action) and not matched_abbrs:
msg = 'Failed to extract committee abbr from %r.'
self.logger.warning(msg % action.action)

Expand All @@ -472,13 +474,12 @@ def replacer(matchobj):
for abbr in matched_abbrs:
try:
name = self.committee_abbr_to_name(chamber, abbr)
committees.append(name)
except KeyError:
msg = ('Mapping contains no committee name for '
'abbreviation %r. Action text was %r.')
args = (abbr, action.action)
raise KeyError(msg % args)
else:
committees.append(name)

committees = filter(None, committees)
kwargs['committees'] = committees
Expand All @@ -490,8 +491,11 @@ def replacer(matchobj):

assert len(committees) == len(matched_abbrs)
for committee, abbr in zip(committees, matched_abbrs):
act_str = act_str.replace('Coms. on ', '')
act_str = act_str.replace('Com. on ' + abbr, committee)
act_str = act_str.replace(abbr, committee)
if not act_str.endswith('.'):
act_str = act_str + '.'

changed = False
for string in ['upper', 'lower', 'joint']:
Expand Down

0 comments on commit 09cd33d

Please sign in to comment.