"
- ],
- "text/plain": [
- " transaction_id donor_id \\\n",
- "0 7773a71e-9f67-438e-8313-80b1b75deeb4 4544b60d-da6b-4dd5-9efe-334152ccf1f1 \n",
- "1 95f74915-a945-491f-8751-8c970a76fc24 946d7561-42a3-4a4b-b410-3a10271c9f18 \n",
- "\n",
- " year amount recipient_id office_sought \\\n",
- "0 2018 1000.0 981a0414-b738-4e20-91b8-a29ee2cc7edf none \n",
- "1 2018 1000.0 981a0414-b738-4e20-91b8-a29ee2cc7edf none \n",
- "\n",
- " purpose transaction_type \\\n",
- "0 bob worsley for state senate contribute to a candidate committee \n",
- "1 drew john for state house contribute to a candidate committee \n",
- "\n",
- " donor_type recipient_type donor_office \n",
- "0 NaN NaN NaN \n",
- "1 NaN NaN NaN "
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "transactions.head(2)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array(['neutral', 'f'], dtype=object)"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "inds_df.classification.unique()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(9926, 9919, 10000, 10000)"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Unique entity ids, and the ids that occur on either side of a transaction.\n",
- "inds_ids = set(inds_df[\"id\"].tolist())\n",
- "orgs_ids = set(orgs_df[\"id\"].tolist())\n",
- "trans_donorids = set(transactions[\"donor_id\"].tolist())\n",
- "trans_recepids = set(transactions[\"recipient_id\"].tolist())\n",
- "\n",
- "# Keep every individual id that appears as a donor or as a recipient.\n",
- "ind_id_there = [\n",
- "    ind_id for ind_id in inds_ids\n",
- "    if ind_id in trans_donorids or ind_id in trans_recepids\n",
- "]\n",
- "\n",
- "# Same check for organization ids.\n",
- "org_id_there = [\n",
- "    org_id for org_id in orgs_ids\n",
- "    if org_id in trans_donorids or org_id in trans_recepids\n",
- "]\n",
- "\n",
- "# (unique individuals, individuals seen in transactions, unique orgs, orgs seen in transactions)\n",
- "len(inds_ids), len(ind_id_there), len(orgs_ids), len(org_id_there)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['242d019c-e0ab-405e-8e77-abae7418b87f',\n",
- " '8b2ad550-64a1-4975-8b77-5eb1f24a8871',\n",
- " 'aee69307-194f-4c40-af3d-a55a34e1068e',\n",
- " '55e5e946-6261-4f19-9752-fb58219b2e99',\n",
- " '4faf251a-73d9-46ef-9e17-d3cf0a3052ae',\n",
- " '3b5c0a9e-c6f2-44e9-ad05-fde071447564',\n",
- " '3936bdf5-9a7a-462c-9e8c-9124f2bd7f57',\n",
- " '13882059-3c74-4d9e-825d-a03a72b43b08',\n",
- " '50c78f1a-3e9b-4996-a319-eef4fe01ccfb',\n",
- " 'ae96f38f-68c8-47e3-95b3-c6f096d3c22e',\n",
- " '74ba8a8a-7256-4eb3-b0f8-995f7a6319fb',\n",
- " '12823a76-78e2-4b09-b606-859efaa5c8ef',\n",
- " '9de9bf03-8c4a-4d2f-9a95-283b230ddfad',\n",
- " '588593b9-9bba-4597-94d9-1b3a7fd5b402',\n",
- " '5277b642-6bf0-4423-9350-3602ae51c6ac',\n",
- " 'd98985b4-f55d-4ada-b279-0497e3176512',\n",
- " 'c8586d36-f188-4684-aa99-193407d4d068',\n",
- " '3798fda1-83cd-4e48-974a-e1a390060198',\n",
- " 'a536b509-f052-4984-a35d-10397308daec',\n",
- " '80996477-ce99-4f34-b5fc-bab4d676fc77',\n",
- " 'cd1a740c-b1d7-4334-b335-925bd5708753',\n",
- " '46af8908-f4e4-4041-9d1e-5b442d051921',\n",
- " '2969075a-86d2-4b04-a991-a81832e096a0',\n",
- " 'd0337f72-b701-4524-891b-c48ef6f771ec',\n",
- " '591aa72b-511b-4dbb-a161-80458f257471']"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Individuals that appear both as a donor and as a recipient in the transactions.\n",
- "a = [\n",
- "    ind_id for ind_id in inds_ids\n",
- "    if ind_id in trans_donorids and ind_id in trans_recepids\n",
- "]\n",
- "a"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "data = {'id':['50c7d9a1-b448-46a5-8e2d-cd15b3097360','50c7d9a1-b448-46a5-8e2d-cd15b3097360','50c7d9a1-b448-46a5-8e2d-cd15b3097360',\n",
- " '62ea1e9c-ac12-400c-b3dc-519389c0f7d3','62ea1e9c-ac12-400c-b3dc-519389c0f7d3','62ea1e9c-ac12-400c-b3dc-519389c0f7d3',\n",
- " 'd31df1ca-714e-4a82-9e88-1892c0451a71','d31df1ca-714e-4a82-9e88-1892c0451a71','62ea1e9c-ac12-400c-b3dc-519389c0f7d3',\n",
- " '4db76e6e-f0d5-40eb-82de-6dbcdb562dd7','f71341d7-d27e-47eb-9b66-903af39d6cb5','c875d7de-94be-42f1-b994-dd89b114d51e',\n",
- " '910c4d36-b036-469e-aa2a-ea4ff8855a6c','60d454d1-3773-4d88-80e9-132c161da0f0','1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd',\n",
- " '1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd','1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe','1d2b5bc0-9385-4cd7-ac48-df43b3eca6ff',\n",
- " '1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd'],\n",
- " 'name':['REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN PAC','REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN PAC',\n",
- " 'REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN PAC','UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALLOT CLUB',\n",
- " 'UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALLOT CLUB','UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALLOT CLUB',\n",
- " 'COMMITTEE TO ELECT DR PATRICIA BERNARD','COMMITTEE TO ELECT DR PATRICIA BERNARD','UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALLOT CLUB',\n",
- " 'Ugi Utilities Inc/Ugi Energy Services Llc Pac','Pabar Pac (Pa Bar Assn)','Pa Fraternal Order Of Police Pac','Citizens For Kail',\n",
- " 'Paa Pac','MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC','MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC',\n",
- " 'MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC','MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC','Paa Pac'],\n",
- " 'state':['MI','MI','MI','MI','MI','MI','MI','MI','MI','PA','PA','PA','PA','PA','MI','MI','MI','MI','PA'],\n",
- " 'entity_type':['committee','committee','committee','committee','committee','committee','committee','committee','committee',\n",
- " 'Organization','Organization','Organization','Organization','Organization','committee','committee','committee','committee','Organization']}\n",
- "\n",
- "sample_df = pd.DataFrame(data)\n",
- "\n",
- "# Seeded generator: the synthetic columns below are identical on every\n",
- "# Restart & Run All instead of changing with each execution.\n",
- "rng = np.random.default_rng(42)\n",
- "n_rows = len(sample_df)\n",
- "names = sample_df[\"name\"].tolist()\n",
- "\n",
- "# Amounts are drawn from [low, high); counterpart names are sampled (with\n",
- "# replacement) from the names already present in the table.\n",
- "sample_df['donations'] = rng.integers(100, 6000, size=n_rows)\n",
- "sample_df['donations_to'] = rng.choice(names, size=n_rows)\n",
- "sample_df['received'] = rng.integers(0, 6000, size=n_rows)\n",
- "sample_df['donations_from'] = rng.choice(names, size=n_rows)\n",
- "sample_df.head(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Some Considerations to Remember Moving Forward:\n",
- "1. The 'get_likely_name' function takes in 3 string inputs. The data is not clean, and when there are NaN entries the function treats the null values as strings, so a row that has \"Tim\", \"Walz\", and NaN in the first, last, and full name columns is combined as \"Tim Walz Nan\". When calling this function, account for this possibility."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Playing Around with Graphs\n",
- "\n",
- "**Some considerations**\n",
- "1. What attributes do we want each Node to Have?\n",
- "- UUID, Name, Entity Type, Address, {from transactions table: money_donated and money_given}, affiliation?\n",
- "- Should transaction info also be included? If so, how would we show transaction info to multiple recipients / from multiple donors?"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Notes for Graphs\n",
- "**Generating Graphs**\n",
- "* nx.Graph() → the most simple undirected graph (edges going both ways)\n",
- "* nx.DiGraph() → a graph with directed edges\n",
- "* nx.MultiGraph() → multiple edges between nodes\n",
- "* nx.MultiDiGraph() → the MultiGraph equivalent for directed graphs\n",
- "\n",
- "**Finding Centrality**\n",
- "There are 5 common ways to measure the centrality of a node (how important or frequent is a node / how influential are some donors potentially)\n",
- "* nx.degree_centrality : based on the assumption that important nodes have many connections\n",
- "* nx.closeness_centrality : based on the assumption that important nodes are close to other nodes. It is calculated as the reciprocal of the average shortest-path distance from the given node to all other reachable nodes. \n",
- "* nx.eigenvector_centrality : assumes that a node is important if it is connected to other important nodes. A node's score is proportional to the sum of the scores of its neighbors\n",
- "* nx.betweenness_centrality : a measure of centrality in a graph based on shortest paths; it assumes that important nodes connect other nodes. For every pair of vertices in a connected graph, there exists at least one shortest path between the vertices such that either the number of edges that the path passes through (for unweighted graphs) or the sum of the weights of the edges (for weighted graphs) is minimized. The betweenness centrality for each vertex is the number of these shortest paths that pass through the vertex. For graphs with a large number of nodes, the raw (unnormalized) betweenness values can become very high\n",
- "* nx.pagerank : Page Rank Algorithm (developed by Google founders to measure the importance of webpages) assigns a score of importance to each node. Important nodes are those with many inlinks from important pages. It mainly works for Directed Networks\n",
- "\n",
- "**Finding Connections**\n",
- "* nx.find_cliques (undirected graphs): finds the maximal cliques, i.e. the largest groups of nodes in which every pair of nodes is directly connected\n",
- "* nx.k_core : A k-core is a maximal subgraph that contains nodes of degree k or more. Groups clusters meeting the threshold k (can be used as a toggle)\n",
- "\n",
- "**Sources**\n",
- "* https://www.youtube.com/watch?v=VetBkjcm9Go\n",
- "* https://www.activestate.com/blog/graph-theory-using-python-introduction-and-implementation/ \n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Things to think about\n",
- "* Apply the deduplicated_uuids.csv info to the transactions table\n",
- "* After doing a left join on the inds/orgs dataset with the transactions data, the recipient_id column needs to have a recipient_name column so that a new node can be created\n",
- "* for ppl who have multiple donations {and so have various attributes like office_sought, purpose, transaction_type}, should this information be saved?"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- "
\n",
- "
\n",
- "
transaction_id
\n",
- "
donor_id
\n",
- "
year
\n",
- "
amount
\n",
- "
recipient_id
\n",
- "
office_sought
\n",
- "
purpose
\n",
- "
transaction_type
\n",
- "
donor_type
\n",
- "
recipient_type
\n",
- "
donor_office
\n",
- "
recipient_name
\n",
- "
\n",
- " \n",
- " \n",
- "
\n",
- "
0
\n",
- "
7773a71e-9f67-438e-8313-80b1b75deeb4
\n",
- "
4544b60d-da6b-4dd5-9efe-334152ccf1f1
\n",
- "
2018
\n",
- "
1000.0
\n",
- "
981a0414-b738-4e20-91b8-a29ee2cc7edf
\n",
- "
none
\n",
- "
bob worsley for state senate
\n",
- "
contribute to a candidate committee
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
#1022 arizona committee of automotive retailers
\n",
- "
\n",
- "
\n",
- "
1
\n",
- "
95f74915-a945-491f-8751-8c970a76fc24
\n",
- "
946d7561-42a3-4a4b-b410-3a10271c9f18
\n",
- "
2018
\n",
- "
1000.0
\n",
- "
981a0414-b738-4e20-91b8-a29ee2cc7edf
\n",
- "
none
\n",
- "
drew john for state house
\n",
- "
contribute to a candidate committee
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
#1022 arizona committee of automotive retailers
\n",
- "
\n",
- "
\n",
- "
2
\n",
- "
d05f1763-132d-4717-addc-8ff6239ad4d9
\n",
- "
c8f98436-9562-48ed-b51f-45b2b217aad1
\n",
- "
2018
\n",
- "
1000.0
\n",
- "
981a0414-b738-4e20-91b8-a29ee2cc7edf
\n",
- "
none
\n",
- "
elect karen fann ld1
\n",
- "
contribute to a candidate committee
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
#1022 arizona committee of automotive retailers
\n",
- "
\n",
- "
\n",
- "
3
\n",
- "
3dc3da30-6562-4755-bfad-6a26f1baec15
\n",
- "
b9965bc2-c94d-4f69-98d1-bc4f5ad701c5
\n",
- "
2018
\n",
- "
1000.0
\n",
- "
981a0414-b738-4e20-91b8-a29ee2cc7edf
\n",
- "
none
\n",
- "
elect noel campbell for house
\n",
- "
contribute to a candidate committee
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
#1022 arizona committee of automotive retailers
\n",
- "
\n",
- "
\n",
- "
4
\n",
- "
a4340a2c-7b8a-4eeb-8290-746f0f436c83
\n",
- "
946d7561-42a3-4a4b-b410-3a10271c9f18
\n",
- "
2018
\n",
- "
1000.0
\n",
- "
981a0414-b738-4e20-91b8-a29ee2cc7edf
\n",
- "
none
\n",
- "
closed to new donations
\n",
- "
refund from contrib to a cand committee
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
#1022 arizona committee of automotive retailers
\n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " transaction_id donor_id \\\n",
- "0 7773a71e-9f67-438e-8313-80b1b75deeb4 4544b60d-da6b-4dd5-9efe-334152ccf1f1 \n",
- "1 95f74915-a945-491f-8751-8c970a76fc24 946d7561-42a3-4a4b-b410-3a10271c9f18 \n",
- "2 d05f1763-132d-4717-addc-8ff6239ad4d9 c8f98436-9562-48ed-b51f-45b2b217aad1 \n",
- "3 3dc3da30-6562-4755-bfad-6a26f1baec15 b9965bc2-c94d-4f69-98d1-bc4f5ad701c5 \n",
- "4 a4340a2c-7b8a-4eeb-8290-746f0f436c83 946d7561-42a3-4a4b-b410-3a10271c9f18 \n",
- "\n",
- " year amount recipient_id office_sought \\\n",
- "0 2018 1000.0 981a0414-b738-4e20-91b8-a29ee2cc7edf none \n",
- "1 2018 1000.0 981a0414-b738-4e20-91b8-a29ee2cc7edf none \n",
- "2 2018 1000.0 981a0414-b738-4e20-91b8-a29ee2cc7edf none \n",
- "3 2018 1000.0 981a0414-b738-4e20-91b8-a29ee2cc7edf none \n",
- "4 2018 1000.0 981a0414-b738-4e20-91b8-a29ee2cc7edf none \n",
- "\n",
- " purpose transaction_type \\\n",
- "0 bob worsley for state senate contribute to a candidate committee \n",
- "1 drew john for state house contribute to a candidate committee \n",
- "2 elect karen fann ld1 contribute to a candidate committee \n",
- "3 elect noel campbell for house contribute to a candidate committee \n",
- "4 closed to new donations refund from contrib to a cand committee \n",
- "\n",
- " donor_type recipient_type donor_office \\\n",
- "0 NaN NaN NaN \n",
- "1 NaN NaN NaN \n",
- "2 NaN NaN NaN \n",
- "3 NaN NaN NaN \n",
- "4 NaN NaN NaN \n",
- "\n",
- " recipient_name \n",
- "0 #1022 arizona committee of automotive retailers \n",
- "1 #1022 arizona committee of automotive retailers \n",
- "2 #1022 arizona committee of automotive retailers \n",
- "3 #1022 arizona committee of automotive retailers \n",
- "4 #1022 arizona committee of automotive retailers "
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from utils.network import name_identifier\n",
- "from utils.linkage import deduplicate_perfect_matches\n",
- "\n",
- "# Keep a transaction when its recipient or its donor is a known individual or\n",
- "# organization; both sides are checked against inds_df and orgs_df.\n",
- "involves_known_entity = (\n",
- "    transactions.recipient_id.isin(inds_df.id)\n",
- "    | transactions.recipient_id.isin(orgs_df.id)\n",
- "    | transactions.donor_id.isin(inds_df.id)\n",
- "    | transactions.donor_id.isin(orgs_df.id)\n",
- ")\n",
- "# .copy() yields an independent frame, so adding recipient_name below does not\n",
- "# write into a slice of the original table (avoids SettingWithCopyWarning).\n",
- "transactions = transactions.loc[involves_known_entity].copy()\n",
- "\n",
- "inds = deduplicate_perfect_matches(inds_df)\n",
- "orgs = deduplicate_perfect_matches(orgs_df)\n",
- "\n",
- "# name_identifier is called once per recipient_id with the [orgs, inds] tables\n",
- "# as its extra argument; its result becomes the recipient's display name.\n",
- "transactions[\"recipient_name\"] = transactions[\"recipient_id\"].apply(name_identifier, args=([orgs, inds],))\n",
- "\n",
- "transactions.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "87"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Transactions whose donor is a known individual, and the number of distinct\n",
- "# recipient_name values among them (a missing name counts as one value).\n",
- "x = transactions[transactions[\"donor_id\"].isin(inds_df[\"id\"])]\n",
- "len(x[\"recipient_name\"].unique())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- "
\n",
- "
\n",
- "
id
\n",
- "
first_name
\n",
- "
last_name
\n",
- "
full_name
\n",
- "
entity_type
\n",
- "
state
\n",
- "
party
\n",
- "
company
\n",
- "
occupation
\n",
- "
address
\n",
- "
...
\n",
- "
year
\n",
- "
amount
\n",
- "
recipient_id
\n",
- "
office_sought
\n",
- "
purpose
\n",
- "
transaction_type
\n",
- "
donor_type
\n",
- "
recipient_type
\n",
- "
donor_office
\n",
- "
recipient_name
\n",
- "
\n",
- " \n",
- " \n",
- "
\n",
- "
55243
\n",
- "
0e24b503-b209-48b5-8edb-cca0cdaca78c
\n",
- "
M.
\n",
- "
TANG
\n",
- "
m. tang ...
\n",
- "
Individual
\n",
- "
MD
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
6614 23RD PLACE
\n",
- "
...
\n",
- "
2022.0
\n",
- "
2.0
\n",
- "
49a2d46f-5e75-433c-94fa-f910e66d1a1e
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
direct
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
None
\n",
- "
\n",
- "
\n",
- "
55244
\n",
- "
0e24b503-b209-48b5-8edb-cca0cdaca78c
\n",
- "
M.
\n",
- "
TANG
\n",
- "
m. tang ...
\n",
- "
Individual
\n",
- "
MD
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
6614 23RD PLACE
\n",
- "
...
\n",
- "
2022.0
\n",
- "
95.0
\n",
- "
49a2d46f-5e75-433c-94fa-f910e66d1a1e
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
direct
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
None
\n",
- "
\n",
- "
\n",
- "
55245
\n",
- "
0e24b503-b209-48b5-8edb-cca0cdaca78c
\n",
- "
M.
\n",
- "
TANG
\n",
- "
m. tang ...
\n",
- "
Individual
\n",
- "
MD
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
6614 23RD PLACE
\n",
- "
...
\n",
- "
2022.0
\n",
- "
10.0
\n",
- "
49a2d46f-5e75-433c-94fa-f910e66d1a1e
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
direct
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
None
\n",
- "
\n",
- "
\n",
- "
55246
\n",
- "
a23037f6-741c-43a5-8a6d-0f1db4371e1d
\n",
- "
OLIVIA N
\n",
- "
DALMASSO
\n",
- "
olivia n dalmasso ...
\n",
- "
Individual
\n",
- "
IL
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
PO BOX 574
\n",
- "
...
\n",
- "
2022.0
\n",
- "
12.6
\n",
- "
6b33721f-3f6a-47c0-bce2-284fc58e0d2a
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
direct
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
None
\n",
- "
\n",
- "
\n",
- "
55247
\n",
- "
a23037f6-741c-43a5-8a6d-0f1db4371e1d
\n",
- "
OLIVIA N
\n",
- "
DALMASSO
\n",
- "
olivia n dalmasso ...
\n",
- "
Individual
\n",
- "
IL
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
PO BOX 574
\n",
- "
...
\n",
- "
2022.0
\n",
- "
4.2
\n",
- "
6b33721f-3f6a-47c0-bce2-284fc58e0d2a
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
direct
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
NaN
\n",
- "
None
\n",
- "
\n",
- " \n",
- "
\n",
- "
5 rows × 25 columns
\n",
- "
"
- ],
- "text/plain": [
- " id first_name \\\n",
- "55243 0e24b503-b209-48b5-8edb-cca0cdaca78c M. \n",
- "55244 0e24b503-b209-48b5-8edb-cca0cdaca78c M. \n",
- "55245 0e24b503-b209-48b5-8edb-cca0cdaca78c M. \n",
- "55246 a23037f6-741c-43a5-8a6d-0f1db4371e1d OLIVIA N \n",
- "55247 a23037f6-741c-43a5-8a6d-0f1db4371e1d OLIVIA N \n",
- "\n",
- " last_name \\\n",
- "55243 TANG \n",
- "55244 TANG \n",
- "55245 TANG \n",
- "55246 DALMASSO \n",
- "55247 DALMASSO \n",
- "\n",
- " full_name entity_type state \\\n",
- "55243 m. tang ... Individual MD \n",
- "55244 m. tang ... Individual MD \n",
- "55245 m. tang ... Individual MD \n",
- "55246 olivia n dalmasso ... Individual IL \n",
- "55247 olivia n dalmasso ... Individual IL \n",
- "\n",
- " party company occupation address ... year amount \\\n",
- "55243 NaN NaN NaN 6614 23RD PLACE ... 2022.0 2.0 \n",
- "55244 NaN NaN NaN 6614 23RD PLACE ... 2022.0 95.0 \n",
- "55245 NaN NaN NaN 6614 23RD PLACE ... 2022.0 10.0 \n",
- "55246 NaN NaN NaN PO BOX 574 ... 2022.0 12.6 \n",
- "55247 NaN NaN NaN PO BOX 574 ... 2022.0 4.2 \n",
- "\n",
- " recipient_id office_sought purpose \\\n",
- "55243 49a2d46f-5e75-433c-94fa-f910e66d1a1e NaN NaN \n",
- "55244 49a2d46f-5e75-433c-94fa-f910e66d1a1e NaN NaN \n",
- "55245 49a2d46f-5e75-433c-94fa-f910e66d1a1e NaN NaN \n",
- "55246 6b33721f-3f6a-47c0-bce2-284fc58e0d2a NaN NaN \n",
- "55247 6b33721f-3f6a-47c0-bce2-284fc58e0d2a NaN NaN \n",
- "\n",
- " transaction_type donor_type recipient_type donor_office \\\n",
- "55243 direct NaN NaN NaN \n",
- "55244 direct NaN NaN NaN \n",
- "55245 direct NaN NaN NaN \n",
- "55246 direct NaN NaN NaN \n",
- "55247 direct NaN NaN NaN \n",
- "\n",
- " recipient_name \n",
- "55243 None \n",
- "55244 None \n",
- "55245 None \n",
- "55246 None \n",
- "55247 None \n",
- "\n",
- "[5 rows x 25 columns]"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Left-join individuals to the transactions they made (id == donor_id).\n",
- "# Individuals with no matching donation come back with NaN transaction columns;\n",
- "# dropping rows with a null amount removes them, leaving one row per donation.\n",
- "merged_inds_sample = (\n",
- "    pd.merge(inds_df, transactions, how=\"left\", left_on=\"id\", right_on=\"donor_id\")\n",
- "    .dropna(subset=[\"amount\"])\n",
- ")\n",
- "merged_inds_sample.tail(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['id', 'first_name', 'last_name', 'full_name', 'entity_type', 'state',\n",
- " 'party', 'company', 'occupation', 'address', 'zip', 'city',\n",
- " 'classification', 'transaction_id', 'donor_id', 'year', 'amount',\n",
- " 'recipient_id', 'office_sought', 'purpose', 'transaction_type',\n",
- " 'donor_type', 'recipient_type', 'donor_office', 'recipient_name'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "merged_inds_sample.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- "
\n",
- "
\n",
- "
donor_id
\n",
- "
recipient_id
\n",
- "
full_name
\n",
- "
recipient_name
\n",
- "
address
\n",
- "
amount
\n",
- "
city
\n",
- "
classification
\n",
- "
company
\n",
- "
donor_office
\n",
- "
...
\n",
- "
occupation
\n",
- "
office_sought
\n",
- "
party
\n",
- "
purpose
\n",
- "
recipient_type
\n",
- "
state
\n",
- "
transaction_id
\n",
- "
transaction_type
\n",
- "
year
\n",
- "
zip
\n",
- "
\n",
- " \n",
- " \n",
- "
\n",
- "
0
\n",
- "
0007b184-4e1d-401a-ba51-99733d2e13e7
\n",
- "
d461f2bd-9074-44b3-8948-e659bead3e58
\n",
- "
graham filler ...
\n",
- "
saginaw county republican committee
\n",
- "
12705 WARM CREEK
\n",
- "
500.00
\n",
- "
DEWITT
\n",
- "
neutral
\n",
- "
None
\n",
- "
None
\n",
- "
...
\n",
- "
None
\n",
- "
None
\n",
- "
None
\n",
- "
None
\n",
- "
None
\n",
- "
MI
\n",
- "
None
\n",
- "
direct
\n",
- "
2022.0
\n",
- "
48820-0000
\n",
- "
\n",
- "
\n",
- "
1
\n",
- "
00523627-46c7-4f76-ab42-fb2c1fbac1b1
\n",
- "
6126e78b-4e80-4361-a019-9d99aa1623ed
\n",
- "
daniel millstone ...
\n",
- "
rooted in community leadership pac
\n",
- "
10518 ROUNTREE RD
\n",
- "
0.77
\n",
- "
LOS ANGELES
\n",
- "
neutral
\n",
- "
None
\n",
- "
None
\n",
- "
...
\n",
- "
None
\n",
- "
None
\n",
- "
None
\n",
- "
None
\n",
- "
None
\n",
- "
CA
\n",
- "
None
\n",
- "
direct
\n",
- "
2022.0
\n",
- "
90064-0000
\n",
- "
\n",
- "
\n",
- "
2
\n",
- "
00934782-86e5-4941-94cf-0a700100a2c0
\n",
- "
2d1a0919-218e-4692-98ec-c4a73a126482
\n",
- "
josie petersheim ...
\n",
- "
mi greenstone pac
\n",
- "
7196 W. BRIGGS RD.
\n",
- "
25.00
\n",
- "
STANTON
\n",
- "
neutral
\n",
- "
None
\n",
- "
None
\n",
- "
...
\n",
- "
None
\n",
- "
None
\n",
- "
None
\n",
- "
None
\n",
- "
None
\n",
- "
MI
\n",
- "
None
\n",
- "
direct
\n",
- "
2022.0
\n",
- "
48888-0000
\n",
- "
\n",
- "
\n",
- "
3
\n",
- "
00f22bdd-96bf-4074-9620-4737e8444958
\n",
- "
af8417ee-5bca-49f5-91e9-d2de65d73631
\n",
- "
robert doerfler ...
\n",
- "
michigan senate democratic fund
\n",
- "
1534 NE 5TH AVE
\n",
- "
50.00
\n",
- "
FORT LAUDERDALE
\n",
- "
neutral
\n",
- "
None
\n",
- "
None
\n",
- "
...
\n",
- "
None
\n",
- "
None
\n",
- "
None
\n",
- "
None
\n",
- "
None
\n",
- "
FL
\n",
- "
None
\n",
- "
direct
\n",
- "
2022.0
\n",
- "
33304-1006
\n",
- "
\n",
- "
\n",
- "
4
\n",
- "
0138403b-b5b9-453a-a1d2-b6ed9fa5fe58
\n",
- "
6126e78b-4e80-4361-a019-9d99aa1623ed
\n",
- "
joseph martinez ...
\n",
- "
rooted in community leadership pac
\n",
- "
139 HURON AVE
\n",
- "
1.65
\n",
- "
MOUNT CLEMENS
\n",
- "
neutral
\n",
- "
None
\n",
- "
None
\n",
- "
...
\n",
- "
None
\n",
- "
None
\n",
- "
None
\n",
- "
None
\n",
- "
None
\n",
- "
MI
\n",
- "
None
\n",
- "
direct
\n",
- "
2022.0
\n",
- "
48043-0000
\n",
- "
\n",
- "
\n",
- "
...
\n",
- "
...
\n",
- "
...
\n",
- "
...
\n",
- "
...
\n",
- "
...
\n",
- "
...
\n",
- "
...
\n",
- "
...
\n",
- "
...
\n",
- "
...
\n",
- "
...
\n",
- "
...
\n",
- "
...
\n",
- "
...
\n",
- "
...
\n",
- "
...
\n",
- "
...
\n",
- "
...
\n",
- "
...
\n",
- "
...
\n",
- "
...
\n",
- "
\n",
- "
\n",
- "
1120
\n",
- "
fdccce6b-e55f-4f1d-bd95-1714f2a667ed
\n",
- "
a3fe20e2-8019-448e-9b54-bfdce4d87f2f
\n",
- "
michael olthoff ...
\n",
- "
bumstead leadership fund
\n",
- "
1499 MIDDLEBROOK DR
\n",
- "
1000.00
\n",
- "
NORTON SHORES
\n",
- "
neutral
\n",
- "
nichols
\n",
- "
None
\n",
- "
...
\n",
- "
ceo
\n",
- "
None
\n",
- "
None
\n",
- "
None
\n",
- "
None
\n",
- "
MI
\n",
- "
None
\n",
- "
direct
\n",
- "
2022.0
\n",
- "
49441-0000
\n",
- "
\n",
- "
\n",
- "
1121
\n",
- "
fe969829-b8a4-4d38-88e2-8314b340d567
\n",
- "
6126e78b-4e80-4361-a019-9d99aa1623ed
\n",
- "
joanna simon ...
\n",
- "
rooted in community leadership pac
\n",
- "
1546 POPLAR GROVE DR
\n",
- "
3.82
\n",
- "
RESTON
\n",
- "
neutral
\n",
- "
None
\n",
- "
None
\n",
- "
...
\n",
- "
None
\n",
- "
None
\n",
- "
None
\n",
- "
None
\n",
- "
None
\n",
- "
VA
\n",
- "
None
\n",
- "
direct
\n",
- "
2022.0
\n",
- "
20194-1731
\n",
- "
\n",
- "
\n",
- "
1122
\n",
- "
ff1423ba-ff5e-4bc1-b864-303a9dcc9b32
\n",
- "
6126e78b-4e80-4361-a019-9d99aa1623ed
\n",
- "
adriana p{on ce ...
\n",
- "
rooted in community leadership pac
\n",
- "
9 BIRCH CT
\n",
- "
3.82
\n",
- "
NORMAL
\n",
- "
neutral
\n",
- "
None
\n",
- "
None
\n",
- "
...
\n",
- "
None
\n",
- "
None
\n",
- "
None
\n",
- "
None
\n",
- "
None
\n",
- "
IL
\n",
- "
None
\n",
- "
direct
\n",
- "
2022.0
\n",
- "
61761-3900
\n",
- "
\n",
- "
\n",
- "
1123
\n",
- "
ff24644e-d64a-4a8a-a87f-cdb53b86dd63
\n",
- "
6126e78b-4e80-4361-a019-9d99aa1623ed
\n",
- "
david friedman ...
\n",
- "
rooted in community leadership pac
\n",
- "
8823 MOUNTAIN PATH CIR
\n",
- "
0.15
\n",
- "
AUSTIN
\n",
- "
neutral
\n",
- "
None
\n",
- "
None
\n",
- "
...
\n",
- "
None
\n",
- "
None
\n",
- "
None
\n",
- "
None
\n",
- "
None
\n",
- "
TX
\n",
- "
None
\n",
- "
direct
\n",
- "
2022.0
\n",
- "
78759-0000
\n",
- "
\n",
- "
\n",
- "
1124
\n",
- "
ffb25947-c03f-43b2-abb4-23531cdb7324
\n",
- "
7f272fe4-d592-453c-9ca1-315ea3fdcff1
\n",
- "
dennis starner ...
\n",
- "
bill g schuette for state representative
\n",
- "
4612 CONGRESS DRIVE
\n",
- "
525.00
\n",
- "
MIDLAND
\n",
- "
neutral
\n",
- "
retired
\n",
- "
None
\n",
- "
...
\n",
- "
retired
\n",
- "
None
\n",
- "
None
\n",
- "
None
\n",
- "
None
\n",
- "
MI
\n",
- "
None
\n",
- "
direct/fund raiser
\n",
- "
2022.0
\n",
- "
48642-0000
\n",
- "
\n",
- " \n",
- "
\n",
- "
1125 rows × 25 columns
\n",
- "
"
- ],
- "text/plain": [
- " donor_id \\\n",
- "0 0007b184-4e1d-401a-ba51-99733d2e13e7 \n",
- "1 00523627-46c7-4f76-ab42-fb2c1fbac1b1 \n",
- "2 00934782-86e5-4941-94cf-0a700100a2c0 \n",
- "3 00f22bdd-96bf-4074-9620-4737e8444958 \n",
- "4 0138403b-b5b9-453a-a1d2-b6ed9fa5fe58 \n",
- "... ... \n",
- "1120 fdccce6b-e55f-4f1d-bd95-1714f2a667ed \n",
- "1121 fe969829-b8a4-4d38-88e2-8314b340d567 \n",
- "1122 ff1423ba-ff5e-4bc1-b864-303a9dcc9b32 \n",
- "1123 ff24644e-d64a-4a8a-a87f-cdb53b86dd63 \n",
- "1124 ffb25947-c03f-43b2-abb4-23531cdb7324 \n",
- "\n",
- " recipient_id \\\n",
- "0 d461f2bd-9074-44b3-8948-e659bead3e58 \n",
- "1 6126e78b-4e80-4361-a019-9d99aa1623ed \n",
- "2 2d1a0919-218e-4692-98ec-c4a73a126482 \n",
- "3 af8417ee-5bca-49f5-91e9-d2de65d73631 \n",
- "4 6126e78b-4e80-4361-a019-9d99aa1623ed \n",
- "... ... \n",
- "1120 a3fe20e2-8019-448e-9b54-bfdce4d87f2f \n",
- "1121 6126e78b-4e80-4361-a019-9d99aa1623ed \n",
- "1122 6126e78b-4e80-4361-a019-9d99aa1623ed \n",
- "1123 6126e78b-4e80-4361-a019-9d99aa1623ed \n",
- "1124 7f272fe4-d592-453c-9ca1-315ea3fdcff1 \n",
- "\n",
- " full_name \\\n",
- "0 graham filler ... \n",
- "1 daniel millstone ... \n",
- "2 josie petersheim ... \n",
- "3 robert doerfler ... \n",
- "4 joseph martinez ... \n",
- "... ... \n",
- "1120 michael olthoff ... \n",
- "1121 joanna simon ... \n",
- "1122 adriana p{on ce ... \n",
- "1123 david friedman ... \n",
- "1124 dennis starner ... \n",
- "\n",
- " recipient_name address \\\n",
- "0 saginaw county republican committee 12705 WARM CREEK \n",
- "1 rooted in community leadership pac 10518 ROUNTREE RD \n",
- "2 mi greenstone pac 7196 W. BRIGGS RD. \n",
- "3 michigan senate democratic fund 1534 NE 5TH AVE \n",
- "4 rooted in community leadership pac 139 HURON AVE \n",
- "... ... ... \n",
- "1120 bumstead leadership fund 1499 MIDDLEBROOK DR \n",
- "1121 rooted in community leadership pac 1546 POPLAR GROVE DR \n",
- "1122 rooted in community leadership pac 9 BIRCH CT \n",
- "1123 rooted in community leadership pac 8823 MOUNTAIN PATH CIR \n",
- "1124 bill g schuette for state representative 4612 CONGRESS DRIVE \n",
- "\n",
- " amount city classification company donor_office ... \\\n",
- "0 500.00 DEWITT neutral None None ... \n",
- "1 0.77 LOS ANGELES neutral None None ... \n",
- "2 25.00 STANTON neutral None None ... \n",
- "3 50.00 FORT LAUDERDALE neutral None None ... \n",
- "4 1.65 MOUNT CLEMENS neutral None None ... \n",
- "... ... ... ... ... ... ... \n",
- "1120 1000.00 NORTON SHORES neutral nichols None ... \n",
- "1121 3.82 RESTON neutral None None ... \n",
- "1122 3.82 NORMAL neutral None None ... \n",
- "1123 0.15 AUSTIN neutral None None ... \n",
- "1124 525.00 MIDLAND neutral retired None ... \n",
- "\n",
- " occupation office_sought party purpose recipient_type state \\\n",
- "0 None None None None None MI \n",
- "1 None None None None None CA \n",
- "2 None None None None None MI \n",
- "3 None None None None None FL \n",
- "4 None None None None None MI \n",
- "... ... ... ... ... ... ... \n",
- "1120 ceo None None None None MI \n",
- "1121 None None None None None VA \n",
- "1122 None None None None None IL \n",
- "1123 None None None None None TX \n",
- "1124 retired None None None None MI \n",
- "\n",
- " transaction_id transaction_type year zip \n",
- "0 None direct 2022.0 48820-0000 \n",
- "1 None direct 2022.0 90064-0000 \n",
- "2 None direct 2022.0 48888-0000 \n",
- "3 None direct 2022.0 33304-1006 \n",
- "4 None direct 2022.0 48043-0000 \n",
- "... ... ... ... ... \n",
- "1120 None direct 2022.0 49441-0000 \n",
- "1121 None direct 2022.0 20194-1731 \n",
- "1122 None direct 2022.0 61761-3900 \n",
- "1123 None direct 2022.0 78759-0000 \n",
- "1124 None direct/fund raiser 2022.0 48642-0000 \n",
- "\n",
- "[1125 rows x 25 columns]"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# One row per (donor, recipient) pair: amounts are summed, every other\n",
- "# attribute keeps the first non-null value seen in the group.\n",
- "group_keys = ['donor_id', 'recipient_id', 'full_name', 'recipient_name']\n",
- "attribute_cols = merged_inds_sample.columns.difference(group_keys)\n",
- "agg_functions = {\n",
- "    col: ('sum' if col == 'amount' else 'first')\n",
- "    for col in attribute_cols\n",
- "}\n",
- "grouped_sample = (\n",
- "    merged_inds_sample\n",
- "    .groupby(group_keys)\n",
- "    .agg(agg_functions)\n",
- "    .reset_index()\n",
- ")\n",
- "grouped_sample"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
- "outputs": [],
- "source": [
- "def create_network_nodes(df: pd.DataFrame) -> nx.MultiDiGraph:\n",
- "    \"\"\"Build a donor -> recipient graph from a merged transactions table and draw it.\n",
- "\n",
- "    Each row adds (or updates) a donor node and, when the row names a\n",
- "    recipient, one directed edge from the donor to that recipient.\n",
- "\n",
- "    Args:\n",
- "        df: one row per donor/recipient transaction. The donor label is read\n",
- "            from the \"name\" column when present, otherwise from \"full_name\".\n",
- "            A \"recipient_name\" column is required. Transaction columns\n",
- "            (office_sought, purpose, transaction_type, year, transaction_id,\n",
- "            donor_office, amount) become edge attributes when present; every\n",
- "            other non-null column becomes a donor node attribute.\n",
- "\n",
- "    Returns:\n",
- "        The populated nx.MultiDiGraph. As a side effect the graph is drawn on\n",
- "        matplotlib figure 3 with nodes colored by 'classification' and edges\n",
- "        labeled with their 'amount'.\n",
- "    \"\"\"\n",
- "    G = nx.MultiDiGraph()\n",
- "    # organizations tables carry a \"name\" column, individuals a \"full_name\" one\n",
- "    node_name = \"name\" if \"name\" in df.columns else \"full_name\"\n",
- "\n",
- "    transact_info = ['office_sought', 'purpose', 'transaction_type', 'year','transaction_id','donor_office','amount']\n",
- "    # Column sets do not change between rows, so compute them once. Transaction\n",
- "    # columns missing from df are skipped instead of raising a KeyError.\n",
- "    edge_cols = [column for column in transact_info if column in df.columns]\n",
- "    node_cols = df.columns.difference(transact_info)\n",
- "\n",
- "    for _, row in df.iterrows():\n",
- "        # add node attributes based on the columns relevant to the entity\n",
- "        donor = row[node_name]\n",
- "        G.add_node(donor)\n",
- "        for column in node_cols:\n",
- "            if not pd.isnull(row[column]):\n",
- "                G.nodes[donor][column] = row[column]\n",
- "\n",
- "        # rows without a resolved recipient contribute a node but no edge\n",
- "        recipient = row['recipient_name']\n",
- "        if pd.isnull(recipient):\n",
- "            continue\n",
- "\n",
- "        # link the donor node to the recipient node; the edge carries the\n",
- "        # non-null transaction columns\n",
- "        edge_dictionary = {\n",
- "            column: row[column]\n",
- "            for column in edge_cols\n",
- "            if not pd.isnull(row[column])\n",
- "        }\n",
- "        G.add_edge(donor, recipient, **edge_dictionary)\n",
- "\n",
- "        # A recipient created by add_edge has no attributes yet, so give it a\n",
- "        # default classification. setdefault keeps a classification the node\n",
- "        # already received from a row in which it was the donor.\n",
- "        G.nodes[recipient].setdefault('classification', 'neutral')\n",
- "\n",
- "    # edges without an amount simply get no label\n",
- "    edge_labels = {(u, v): d['amount'] for u, v, d in G.edges(data=True) if 'amount' in d}\n",
- "    entity_colors = {'neutral': 'green', 'c':'blue', 'f':'red'}\n",
- "    # nodes with a missing or unrecognized classification are drawn in gray\n",
- "    node_colors = [\n",
- "        entity_colors.get(G.nodes[node].get('classification'), 'gray')\n",
- "        for node in G.nodes()\n",
- "    ]\n",
- "\n",
- "    # Create the figure before drawing so nodes and edge labels share one canvas,\n",
- "    # and compute the layout once so both use the same positions.\n",
- "    plt.figure(3, figsize=(12, 12))\n",
- "    try:\n",
- "        pos = nx.planar_layout(G)\n",
- "    except nx.NetworkXException:\n",
- "        # planar_layout rejects non-planar graphs; fall back to a seeded\n",
- "        # force-directed layout so the drawing is still reproducible\n",
- "        pos = nx.spring_layout(G, seed=0)\n",
- "    nx.draw(G, pos=pos, with_labels=False, node_color=node_colors)\n",
- "    nx.draw_networkx_edge_labels(G, pos=pos, edge_labels=edge_labels, label_pos=0.5)\n",
- "\n",
- "    plt.show()\n",
- "    return G"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 122,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{}"
- ]
- },
- "execution_count": 122,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#for u,v in G.nodes(data=True):\n",
- " #print(u)#['classification'])\n",
- " \n",
- "G.nodes['michigan association of health plans political action committee']#['classification'])#['nancy davis ']['classification']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 66,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array(['neutral', 'f'], dtype=object)"
- ]
- },
- "execution_count": 66,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "grouped_sample.classification.unique()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "image/png": "",
- "text/plain": [
- "