From b595d57a81aef6c9db8ff8999047b6317e2695ab Mon Sep 17 00:00:00 2001
From: Ben Callen <ben_callen@hotmail.com>
Date: Thu, 6 Feb 2020 20:02:17 +1100
Subject: [PATCH] Added more functions for helping to index sc13dg

- Includes code to clean text, get bounded segments of the text

- Relates to  #69
---
 schedule_13dg_indexing_functions.py | 510 +++++++++++++++++++++++++---
 1 file changed, 459 insertions(+), 51 deletions(-)
diff --git a/schedule_13dg_indexing_functions.py b/schedule_13dg_indexing_functions.py
index 670c95f..43d03b1 100644
--- a/schedule_13dg_indexing_functions.py
+++ b/schedule_13dg_indexing_functions.py
@@ -47,11 +47,38 @@ def get_file_text_from_directory(file_name, document, directory):
     return(txt)
     
 
+def fix_unmatched_page_tags(text):
+    
+    # First fix missing right dels
+    
+    start_tag_r_regex = '<PAGE[^>]' 
+    end_tag_r_regex = '</PAGE[^>]'
+    
+    corrected_r_start = '<PAGE>\n' # Introduce newline in case newline replace by right del
+    corrected_r_end = '</PAGE>\n'
+    
+    fixed_content = re.sub(start_tag_r_regex, corrected_r_start, text)
+    fixed_content = re.sub(end_tag_r_regex, corrected_r_end, fixed_content)
+    
+    # Now fix missing left dels
+    
+    start_tag_l_regex = '[^<]PAGE>' 
+    end_tag_l_regex = '[^<]/PAGE>'
+    
+    corrected_l_start = '\n<PAGE>' # Introduce newline in case newline replace by left del
+    corrected_l_end = '\n</PAGE>'
+    
+    fixed_content = re.sub(start_tag_l_regex, corrected_l_start, fixed_content)
+    fixed_content = re.sub(end_tag_l_regex, corrected_l_end, fixed_content)
+    
+    return(fixed_content)
+
+
 def clean_text(text):
     
     # this function gets rid of unwanted characters and spaces
     
-    result = text
+    result = fix_unmatched_page_tags(text) # First fix tags which are missing delimeters
     
     char_list = ['\xa0', '&#160;', '&#\n160;']
     
@@ -61,6 +88,35 @@ def clean_text(text):
         
     return(result)
 
+
+def get_bounded_text(text, lower_bound = None, upper_bound = None):
+    
+    if(lower_bound is not None):
+        
+        if(upper_bound is not None):
+            
+            if(lower_bound >= upper_bound):
+                
+                raise ValueError("upper_bound not more than lower_bound")
+                
+            else:
+                
+                return(text[lower_bound:upper_bound])
+            
+        else:
+            
+            return(text[lower_bound:])
+        
+    else:
+        
+        if(upper_bound is not None):
+            
+            return(text[lower_bound:upper_bound])
+        
+        else:
+            
+            return(text)
+
    
     
 def get_main_doc_text(file_name, document, directory):
@@ -242,10 +298,39 @@ def find_title_page_end(text):
         
         return(result)
         
+
+def is_rep_q_as_items(text, q1_start = None):
+    
+    # this function works on the basis that the the actual "items" in the real item section does not go beyond
+    # Item 10. So by searching for Item 12 or 14 (in SC 13G and SC 13D respectively), we can detect whether
+    # a filer has erroneously labelled the questions in the reporting pages as items
+    
+    if(q1_start is None):
+      
+        q1_start = find_orthodox_cover_page_q1_start(text)
+    
+    if(q1_start == -1):
+    
+        text_raised = text.upper()
+        
+        regex = '\n\s*ITEM\s*(\.)?\s*[\(]?\s*1[24]\s*[\)]?(\.)?'
+        
+        search = re.search(regex, text_raised)
+        
+        if(search):
+            return(True)
+        else:
+            return(False)
+            
+    else:
+      
+        return(False)
+        
         
-def find_orthodox_cover_page_q1_start(text):
+        
+def find_orthodox_cover_page_q1_start(text, lower_bound = None, upper_bound = None):
 
-    text_raised = text.upper()
+    text_raised = get_bounded_text(text, lower_bound, upper_bound).upper()
 
     regex0 = '\s+[\(\|]?\s*[1L]\s*[\)\|]?\s*[:\.]{0,1}(\s*[\(\|]\s*A\s*[\)\|])?\s*(.){0,50}\s*' + \
              'NAME[\(]?[S]?[\)]?\s+(?:OF|OR)(\s+THE)?\s+(?:REPORTING|ABOVE|REPORT)'
@@ -290,7 +375,6 @@ def find_orthodox_cover_page_q1_start(text):
     
     regex_list = [regex0, regex1, regex2, regex3, regex4, regex5, regex6, regex7, regex8, regex9, regex10]
     
-    start = None
     
     for i in range(len(regex_list)):
         
@@ -298,36 +382,86 @@ def find_orthodox_cover_page_q1_start(text):
         
         if(search):
             
-            if(start is None):
-                
-                start = search.start()
-                
-            else:
-                
-                new = search.start()
-                
-                if(new < start):
-                    
-                    start = new            
+            break
 
-    if(start):
+    if(search):
         
-        return(start)
+        return(search.start())
     
     else:
         
         return(-1)   
+
+
+def cover_page_start_is_rep_as_q(text, form_type, lower_bound = None, upper_bound = None, rep_q_as_items = None):
+    # this function is for cases where is_rep_q_as_items is True
+    
+    
+    text_raised = get_bounded_text(text, lower_bound, upper_bound).upper()
         
-        
-def cover_page_start(text, form_type, cover_page_q1 = None, rep_q_as_items = None):
+    search = None
     
     if(rep_q_as_items is None):
         
         rep_q_as_items = is_rep_q_as_items(text)
+
+    if(re.search('SC 13D', form_type)):
+        
+        if(rep_q_as_items):
+            
+            search = re.search('\n\s*ITEM\s+[\(\|]?\s?1\s?[\)\|]?\s*[:\.]{0,1}(.)+?' + \
+                  '\n\s*ITEM\s+[\(\|]?\s?14\s?[\)\|]?\s*[:\.]{0,1}', text_raised, flags = re.DOTALL)
+            
+        else:
+            return(-1)
+        
+    if(re.search('SC 13G', form_type)):
+        
+        if(rep_q_as_items):
+            
+            search = re.search('\n\s*ITEM\s+[\(\|]?\s?1\s?[\)\|]?\s*[:\.]{0,1}(.)+?' + \
+                  '\n\s*ITEM\s+[\(\|]?\s?12\s?[\)\|]?\s*[:\.]{0,1}', text_raised, flags = re.DOTALL)
+            
+        else:
+            return(-1)
+            
+    if(search):
+        
+        return(search.start() + lower_bound)
+    
+    else:
+        
+        return(-1)
+
+
+
+def find_cover_page_q1_start(text, form_type, lower_bound = None, upper_bound = None, rep_q_as_items = None):
+    
+    if(rep_q_as_items is None):
+        
+         rep_q_as_items = is_rep_q_as_items(get_bounded_text(text, lower_bound, upper_bound))
+            
+    if(rep_q_as_items):
+        
+        result = cover_page_start_is_rep_as_q(text, form_type, lower_bound, upper_bound, rep_q_as_items)
+        
+    else:
+        
+        result = find_orthodox_cover_page_q1_start(text, lower_bound, upper_bound)
+        
+    return(result)
+
+    
+        
+def cover_page_start(text, form_type, lower_bound = None, cover_page_q1 = None, rep_q_as_items = None):
+    
+    if(rep_q_as_items is None):
+        
+        rep_q_as_items = is_rep_q_as_items(text, cover_page_q1)
     
     if(cover_page_q1 is None):
         
-        cover_page_q1 = find_cover_page_q1_start(text, rep_q_as_items)
+        cover_page_q1 = find_cover_page_q1_start(text, form_type, lower_bound, cover_page_q1, rep_q_as_items)
         
     if(cover_page_q1 == -1):
         
@@ -385,11 +519,11 @@ def find_cover_pages_last_question(text, rep_q_as_items = None,  lower_bound = N
     
     if(rep_q_as_items):
         
-        regex = 'ITEM\s*[\(]?\s*1[24]\s*[\)]?\s*[\:\.]?'
+        regex = 'ITEM\s*[\(\|]?\s*1[24]\s*[\)\|]?\s*[\:\.]?'
         
     else:
     
-        regex = '[\(]?\s*1[24][\)]?\s*[\:\.]?\s*TYPE[S]?\s+OF\s+REPORTING\s+PERSON[S]?[\*]?\s*(\(SEE\s+INSTRUCTIONS\))?'
+        regex = '[\(\|]?\s*1[24]\s*[\)\|]?\s*[\:\.]?\s*TYPE[S]?\s+OF\s+REPORTING\s+PERSON[S]?[\*]?\s*(\(SEE\s+INSTRUCTIONS\))?'
 
     search = re.search(regex, text_raised)
 
@@ -411,30 +545,198 @@ def find_cover_pages_last_question(text, rep_q_as_items = None,  lower_bound = N
         
 
     
+def find_cover_pages_end(text, form_type, rep_q_as_items = None, lower_bound = None, upper_bound = None):
     
-def find_cover_pages_end(text, rep_q_as_items = None, lower_bound = None, upper_bound = None):
+    if(upper_bound is None):
+        
+        upper_bound = get_item_section_start(text, form_type)
+    
+    if(lower_bound is None):
+        
+        lower_bound = find_cover_pages_last_question(text, rep_q_as_items, None, upper_bound)
+    
+    end = None
+        
+    # First check for footnotes
     
+    last_footnote_start = find_last_footnote_index_end(text, lower_bound, upper_bound)
     
-    end = find_cover_pages_last_question(text, rep_q_as_items, lower_bound, upper_bound)
+    if(last_footnote_start != -1):
+        
+        end = last_footnote_start
         
+        search = re.search('\n\s*\n', text[end:upper_bound])
+
+        if(search is not None):
+
+            end = end + search.end()
+
+            return(end)
+        
+ 
     answer_regex = '\s*([A-Z0]{1}[;, \.]?[A-Z0]{1}[;, \.]?)*([A-Z0]{1}[;, \.]?[A-Z0]{1}[;, \.]?)\s*\n?'
+
+    last_answer = re.search(answer_regex, text[lower_bound:upper_bound])
+
+    if(last_answer is not None):
+
+        end = lower_bound + last_answer.end()
+        return(end)
+
+    # Finally, try finding a key stopword in all caps, such as SCHEDULE 13D, CUSIP, etc...
+
+    end = first_header_word_after_cover_page(text, form_type, lower_bound, \
+                                       upper_bound, rep_q_as_items)
+
+    if(end != -1):
+
+        return(end)
+
+    else:
+
+        return(-1)
     
-    if(upper_bound is not None):
+
+      
+ 
+def find_last_footnote_index_end(text, lower_bound = None, upper_bound = None):
     
-        last_answer = re.search(answer_regex, text[end:upper_bound])
+    
+    bounded_text = get_bounded_text(text, lower_bound, upper_bound)
         
+    regex1 = '\n\s*[\(\|]?\s*[*]+\s*[\)\|]?\s*[a-zA-Z]'
+    regex2 = '\n\s*[\(\|]\s*([0-9]{1,3}|[a-zA-Z]{1})\s*[\)\|]\s*[a-zA-Z]'   
+    
+    # first try regex1
+    
+    last_footnote_ind_end  = find_last_search_end(regex1, bounded_text)
+    
+    if(last_footnote_ind_end is not None):
+        
+        if(lower_bound is not None):
+            
+            result = last_footnote_ind_end + lower_bound
+            
+        else: 
+            
+            result = last_footnote_ind_end
+            
+        return(result)
+            
     else:
         
-        last_answer = re.search(answer_regex, text[end:])
+        # Try regex2
+        
+        last_footnote_ind_end = find_last_search_end(regex2, bounded_text)
     
-    if(last_answer is not None):
+        if(last_footnote_ind_end is not None):
+
+            if(lower_bound is not None):
+
+                result = last_footnote_ind_end + lower_bound
+
+            else: 
+
+                result = last_footnote_ind_end
+
+            return(result)
+        
+        else:
+            
+            return(-1)
+            
+            
+     
+      
+def find_last_footnote_start(text, lower_bound = None, upper_bound = None):
+    
+    
+    bounded_text = get_bounded_text(text, lower_bound, upper_bound)
+        
+    regex1 = '\n\s*[\(\|]?\s*[*]+\s*[\)\|]?\s*[a-zA-Z]'
+    regex2 = '\n\s*[\(\|]\s*([0-9]{1,3}|[a-zA-Z]{1})\s*[\)\|]\s*[a-zA-Z]'   
+    
+    # first try regex1
     
-        return(end + last_answer.end())
+    last_footnote_start = find_last_search_start(regex1, bounded_text)
     
+    if(last_footnote_start is not None):
+        
+        if(lower_bound is not None):
+            
+            result = last_footnote_start + lower_bound
+            
+        else: 
+            
+            result = last_footnote_start
+            
+        return(result)
+            
     else:
         
-        return(-1)
+        # Try regex2
+        
+        last_footnote_start = find_last_search_start(regex2, bounded_text)
+    
+        if(last_footnote_start is not None):
+
+            if(lower_bound is not None):
+
+                result = last_footnote_start + lower_bound
+
+            else: 
+
+                result = last_footnote_start
+
+            return(result)
+        
+        else:
+            
+            return(-1)
+            
+            
         
+
+def first_header_word_after_cover_page(text, form_type, cover_page_last_q = None, upper_bound = None, rep_q_as_items = None):
+    
+    if(rep_q_as_items is None):
+        
+        rep_q_as_items = is_rep_q_as_items(text)
+    
+    if(cover_page_last_q is None):
+        
+        cover_page_last_q = find_cover_pages_last_question(text, rep_q_as_items,  lower_bound, upper_bound)
+        
+    if(cover_page_last_q == -1):
+        
+        # No question 1 found, so no cover pages, return -1
+        return(-1) 
+    
+    else:
+        
+        text_to_search = get_bounded_text(text, cover_page_last_q, upper_bound)
+        
+        text_raised = text_to_search.upper()
+
+        regex = '\n\s*(CUSIP|SEDOL|SCHEDULE\s+13[DG]|PAGE|13[DG])'
+
+        search = re.search(regex, text_raised)
+
+        if(search):
+
+            num_forward = search.start()
+            
+            start = cover_page_last_q + num_forward
+
+            return(start)
+
+        else:
+
+            return(-1)
+
+        
+
+
  
 
 def num_cusip_sedol_before_index(text, index = None):
@@ -565,11 +867,15 @@ def get_item_section_start(text, form_type):
         search = re.search('\n\s*item\s*[#\.\;\:]?\s*(?:[0-9])[#\.\;\:]?', text_lowered)
         
         if(search):
+            
+            search_str = search.group(0)
+            
+            num_to_item = re.search('item', search_str).start()
 
-            return(cover_pages_end + search.start())  
+            return(cover_pages_end + search.start() + num_to_item)  
         
         else:
-            return(None) # ie. assume no Item section (this probably happens with 13D/A and 13G/A)
+            return(-1) # ie. assume no Item section (this probably happens with 13D/A and 13G/A)
     
     else:
         
@@ -579,11 +885,15 @@ def get_item_section_start(text, form_type):
 
         if(search):
 
-            return(search.start())  
+            search_str = search.group(0)
+            
+            num_to_item = re.search('item', search_str).start()
+
+            return(search.start() + num_to_item)  
         
         else: 
              
-            result = None # ie. assume no Item section (this probably happens with 13D/A and 13G/A) if no matches in
+            result = -1 # ie. assume no Item section (this probably happens with 13D/A and 13G/A) if no matches in
                           # re_list
                 
             for i in range(len(re_list)):
@@ -592,31 +902,31 @@ def get_item_section_start(text, form_type):
                 
                 if(search):
                     
-                    result = search.start()
+                    search_str = search.group(0)
+            
+                    num_to_item = re.search('item', search_str).start()
+                    
+                    result = search.start() + num_to_item
                     break
             
             
             return(result)     
     
     
+def get_signatures_sec_start(text, lower_bound = None, upper_bound = None):
     
-def get_signatures_sec_start(text):
-    
-    text_lowered = text.lower()
+    text_lowered = get_bounded_text(text, lower_bound, upper_bound).lower()
     
+    regex1 = '\n\s*(signature|signatures)\s*(\.)?\n'
     
-    regex0 = '\n\s*item\s+10(\.)?\s+certification'
-    
-    regex1 = 'the\s+following\s+certification\s+shall\s+be\s+included\s+if\s+the\s+statement\s+is\s+filed\s+' + \
+    regex2 = 'the\s+following\s+certification\s+shall\s+be\s+included\s+if\s+the\s+statement\s+is\s+filed\s+' + \
              'pursuant to rule 13d-1\(b\)'
     
-    regex2 = 'by\s+signing\s+below\s+i\s+certify\s+that,\s+to\s+the\s+best\s+of\s+my\s+knowledge'
-    
-    regex3 = '\n\s*(signature|signatures)\s*(\.)?\n'
+    regex3 = 'by\s+signing\s+below\s+i\s+certify\s+that,\s+to\s+the\s+best\s+of\s+my\s+knowledge'
 
     regex4 = 'after\s+reasonable\s+inquiry\s+'
     
-    regex_list = [regex0, regex1, regex2, regex3, regex4]
+    regex_list = [regex1, regex2, regex3, regex4]
     
     regex_search = None
     
@@ -630,10 +940,105 @@ def get_signatures_sec_start(text):
         
     if(regex_search):
         
-        return(regex_search.start())
+        if(lower_bound is not None):
+        
+            return(regex_search.start() + lower_bound)
+        
+        else:
+            
+            return(regex_search.start())
     
     else:
-        return(None)
+      
+        return(-1)
+        
+        
+def get_exhibit_sec_start(text, lower_bound = None, upper_bound = None):
+    
+    text_upper = get_bounded_text(text, lower_bound, upper_bound).upper()
+    
+    regex0 = '\n\s*EXHIBIT\s+INDEX'
+    
+    regex1 = '\n\s*EXHIBIT\s+[0-9A-Z](:)?'
+    
+    regex2 = '\n\s*APPENDIX\s+[0-9A-Z](:)?'
+    
+    regex_list = [regex0, regex1, regex2]
+    
+    regex_search = None
+    
+    for i in range(len(regex_list)):
+        
+        regex_search = re.search(regex_list[i], text_upper)
+        
+        if(regex_search):
+            
+            break
+        
+    if(regex_search):
+        
+        if(lower_bound is not None):
+        
+            return(regex_search.start() + lower_bound)
+        
+        else:
+            
+            return(regex_search.start())
+    
+    else:
+        return(-1)
+    
+    
+    
+def has_exhibit_break(text, cover_page_start, signature_start):
+    
+    exhibit_st = get_exhibit_sec_start(text, cover_page_start, signature_start)
+    
+    if(exhibit_st != -1):
+        
+        return(True)
+    
+    else:
+        
+        return(False)    
+    
+ 
+ 
+ 
+def find_all_orthodox_q1_starts(text, exhibit_start):
+    
+    # Finds all orthodox q1s BEFORE the exhibit section if it exists in the main text
+    # (we do not want cover pages from exhibit parts)
+    
+    lst = []
+    
+    start, end = find_orthodox_cover_page_q1_start_end(text)
+    
+    while(start != -1):
+        
+        lst.append(start)
+        p_start, p_end = find_orthodox_cover_page_q1_start_end(text, end)
+        
+        if(p_start != -1):
+            
+            start = end + p_start
+            end = end + p_end
+            
+        else:
+            
+            start = -1
+            
+    if(exhibit_start is not None and exhibit_start != -1):
+        
+        result = [i for i in lst if i < exhibit_start]
+        return(result)
+
+    else:
+        
+        return(lst)
+         
+ 
+ 
     
     
 def get_key_indices(file_name, document, form_type, directory):
@@ -663,12 +1068,14 @@ def get_key_indices(file_name, document, form_type, directory):
     key_info['cover_page_last_q_end'] = [find_cover_pages_last_question(text, \
                         key_info['is_rep_q_as_items'][0], key_info['cover_page_q1_start'][0], \
                                                                         key_info['item_section_start'][0])]
-    key_info['cover_page_end'] = [find_cover_pages_end(text, key_info['is_rep_q_as_items'][0],\
-                                key_info['cover_page_q1_start'][0], key_info['item_section_start'][0])]
+    key_info['cover_page_end'] = [find_cover_pages_end(text, form_type, key_info['is_rep_q_as_items'][0],\
+                                key_info['cover_page_last_q_end'][0], key_info['item_section_start'][0])]
     
     key_info['num_cusip_b_items'] = [num_cusip_sedol_before_index(text, \
                                                                   index = key_info['item_section_start'][0])]
-    key_info['signature_start'] = [get_signatures_sec_start(text)]
+    key_info['signature_start'] = [get_signatures_sec_start(text, key_info['item_section_start'][0])]
+    key_info['exhibit_start'] = [get_exhibit_sec_start(text, key_info['signature_start'][0])]
+    key_info['main_text_length'] = [len(text)]
     
     if(key_info['cover_page_start'][0] != -1 and key_info['item_section_start'][0] != -1\
       and key_info['item_section_start'][0] > key_info['cover_page_start'][0]):
@@ -681,11 +1088,12 @@ def get_key_indices(file_name, document, form_type, directory):
     
     key_info['amendment_statement_start'] = find_amendment_statement_start(text, form_type, \
                     lower_bound = amend_l_bound, upper_bound = key_info['item_section_start'][0])
+
     
     
     key_info_df = pd.DataFrame(key_info)
     
-    return(key_info_df) 
+    return(key_info_df)
     
     
 def write_indexes_to_table(file_name, document, form_type, directory, engine):