Skip to content

Commit

Permalink
Made file for processing indexes of SC 13D and 13G
Browse files Browse the repository at this point in the history
- Relates to #69
  • Loading branch information
bdcallen committed Feb 7, 2020
1 parent e7d4ff4 commit 0a4c799
Showing 1 changed file with 47 additions and 0 deletions.
47 changes: 47 additions & 0 deletions process_sc13dg_indexes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import os
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from schedule_13dg_indexing_functions import get_file_list_df, write_indexes_to_table, conn_string
from multiprocess import Pool
import datetime as dt

# Script: run write_indexes_to_table over every row returned by
# get_file_list_df, in fixed-size batches spread across a worker pool, and
# print a running count of successes together with the elapsed time.
#
# NOTE(review): the pool is created at module level. On platforms where worker
# processes are started with "spawn" this normally requires an
# `if __name__ == "__main__":` guard -- confirm the target platform.

engine = create_engine(conn_string)

# Value of the EDGAR_DIR environment variable (None if unset); it is forwarded
# unchanged to write_indexes_to_table.
directory = os.getenv("EDGAR_DIR")

full_df = get_file_list_df(engine)
num_filings = full_df.shape[0]
num_cores = 12
batch_size = 240

# Ceiling division: exactly as many batches as are needed to cover all rows.
# (int(n / batch_size) + 1 scheduled an extra, empty batch whenever n was an
# exact multiple of batch_size, including n == 0.)
num_batches = (num_filings + batch_size - 1) // batch_size
num_success = 0

p = Pool(num_cores)
start_time = dt.datetime.now()
try:
    for i in range(num_batches):
        # Half-open row range [start, end) for this batch; the final batch is
        # clipped to the number of rows.
        start = i * batch_size
        end = min(start + batch_size, num_filings)

        # One task per row. Rows are looked up by label with .loc, so this
        # assumes full_df carries the default 0..n-1 index -- TODO confirm
        # against get_file_list_df.
        # write_indexes_to_table presumably returns a boolean/0-1 success flag
        # (the results are summed below) -- verify against its definition.
        success = pd.Series(
            p.map(
                lambda row: write_indexes_to_table(
                    full_df.loc[row, 'file_name'],
                    full_df.loc[row, 'document'],
                    full_df.loc[row, 'form_type'],
                    directory,
                    engine,
                ),
                range(start, end),
            )
        )
        time_taken = dt.datetime.now() - start_time
        num_success = num_success + success.sum()

        # Report progress every 50th batch and after the last one.
        if i % 50 == 0 or i == num_batches - 1:
            # str.format converts the count, the row number and the timedelta
            # to text; joining them to str with "+" raised TypeError.
            print('{} filings successfully process from {}'.format(num_success, end))
            print('Time taken: {}'.format(time_taken))

    # Normal completion: no more work will be submitted.
    p.close()
except BaseException:
    # On any failure (including Ctrl-C) stop the workers immediately instead
    # of leaving them running, then let the error propagate.
    p.terminate()
    raise
finally:
    # Wait for the worker processes to exit; valid after close() or
    # terminate().
    p.join()

0 comments on commit 0a4c799

Please sign in to comment.