Merge pull request #1 from aleks-v-k/update_six_version
Update six version
aleks-v-k authored Feb 22, 2024
2 parents 102a584 + 6a2643f commit b160aae
Showing 2 changed files with 51 additions and 10 deletions.
requirements/python (4 changes: 2 additions & 2 deletions)
@@ -5,9 +5,9 @@ argcomplete~=1.10.0
 beautifulsoup4~=4.8.0
 chardet==3.*
 docx2txt~=0.8
-extract-msg<=0.29.* #Last with python2 support
+extract-msg<=0.29.6 #Last with python2 support
 pdfminer.six==20191110 #Last with python2 support
 python-pptx~=0.6.18
-six~=1.12.0
+six~=1.16.0
 SpeechRecognition~=3.8.1
 xlrd~=1.2.0
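
The bump from `six~=1.12.0` to `six~=1.16.0` matters because the HTML parser below still calls `six.text_type` to coerce BeautifulSoup tags to text. As a reminder of what that shim does (a minimal sketch, relying only on six's documented behavior):

    import six

    # six.text_type is unicode on Python 2 and str on Python 3; the parser
    # uses it to turn a BeautifulSoup tag into a plain text string.
    markup = six.text_type('<td>cell</td>')
    assert isinstance(markup, str)  # true on Python 3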
textract/parsers/html_parser.py (57 changes: 49 additions & 8 deletions)
@@ -6,10 +6,22 @@
 from .utils import BaseParser
 
 
+HTML_TAG_RE = re.compile(r'(<[^>]+>)')
+HTML_SPACE_SQUASH_RE = re.compile(r'\s+')
+HTML_SPACE_RE = re.compile(r'\s')
+
+
 class Parser(BaseParser):
     """Extract text from html file using beautifulsoup4. Filter text to
     only show the visible parts of the page. Inspiration from `here
     <http://stackoverflow.com/a/1983219/564709>`_.
+    By default the parser preserves spaces and renders tables with the
+    ASCII characters '|' and '-'; that rendering may be unnecessary if,
+    say, the extracted text is only fed to a full-text search engine.
+    To collapse runs of whitespace into a single space, pass
+    `squash_html_spaces=True` to `textract.process`.
+    To extract table text without the ASCII rendering, pass
+    `strip_html_tables=True` to `textract.process`.
     """
 
     _disallowed_names = [
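
Put together, the two docstring options translate into a call like this (a sketch; `page.html` is a hypothetical input file, and both keyword arguments are forwarded by `textract.process` to this parser's `extract`):

    import textract

    # Hypothetical input file; both options are introduced by this commit.
    text = textract.process(
        'page.html',
        strip_html_tables=True,     # plain table text, no ASCII borders
        squash_html_spaces=True,    # collapse whitespace runs to one space
    )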
@@ -41,18 +53,23 @@ def _inline(self, element):
             return True
         return False
 
-    def _find_any_text(self, tag):
+    def _find_any_text(self, tag, squash_spaces=False):
         """Looks for any possible text within given tag.
         """
         text = ''
         if tag is not None:
             text = six.text_type(tag)
-            text = re.sub(r'(<[^>]+>)', '', text)
-            text = re.sub(r'\s', ' ', text)
+            text = re.sub(HTML_TAG_RE, '', text)
+            if squash_spaces:
+                text = re.sub(HTML_SPACE_SQUASH_RE, ' ', text)
+            else:
+                text = re.sub(HTML_SPACE_RE, ' ', text)
             text = text.strip()
         return text
 
-    def _parse_tables(self, soup):
+    def _parse_tables(self, soup, squash_spaces):
         """Returns array containing basic information about tables for ASCII
         replacement (look: _replace_tables()).
         """
@@ -66,7 +83,9 @@ def _parse_tables(self, soup):
                 tds = tr.find_all('th') + tr.find_all('td')
                 if len(tds) > 0:
                     for i, td in enumerate(tds):
-                        td_text = self._find_any_text(td)
+                        td_text = self._find_any_text(
+                            td, squash_spaces=squash_spaces
+                        )
                         length = len(td_text)
                         if i in t_dict['col_width']:
                             t_dict['col_width'][i] = max(
@@ -85,10 +104,21 @@ def _parse_tables(self, soup):
             tables.append(t_dict)
         return tables
 
-    def _replace_tables(self, soup, v_separator=' | ', h_separator='-'):
+    def _strip_tables(self, soup, squash_spaces=False):
+        tables = self._parse_tables(soup, squash_spaces)
+        for t in tables:
+            html = ''
+            for tr in t['trs']:
+                html += u'{0}\n'.format(u' '.join(td['text'] for td in tr))
+            new_table = soup.new_tag('div')
+            new_table.string = html
+            t['table'].replace_with(new_table)
+        return soup
+
+    def _replace_tables(self, soup, squash_spaces=False, v_separator=' | ', h_separator='-'):
         """Replaces <table> elements with their ASCII equivalents.
         """
-        tables = self._parse_tables(soup)
+        tables = self._parse_tables(soup, squash_spaces)
         v_sep_len = len(v_separator)
         v_left_sep = v_separator.lstrip()
         for t in tables:
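
In miniature, `_strip_tables` turns a `<table>` into a `<div>` whose rows become single lines of space-joined cell text. A standalone sketch of the same replacement (the sample markup is invented, and `get_text` stands in for `_find_any_text`):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<table><tr><th>a</th><th>b</th></tr>'
                         '<tr><td>1</td><td>2</td></tr></table>', 'lxml')
    table = soup.find('table')
    rows = [' '.join(td.get_text() for td in tr.find_all(['th', 'td']))
            for tr in table.find_all('tr')]
    div = soup.new_tag('div')
    div.string = '\n'.join(rows) + '\n'
    table.replace_with(div)
    print(soup.div.string)
    # a b
    # 1 2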
@@ -124,12 +154,21 @@ def _join_inlines(self, soup):
             elem.unwrap()
         return soup
 
-    def extract(self, filename, **kwargs):
+    def extract(
+            self,
+            filename,
+            strip_html_tables=False,
+            squash_html_spaces=False,
+            **kwargs
+    ):
         with open(filename, "rb") as stream:
             soup = BeautifulSoup(stream, 'lxml')
 
         # Convert tables to ASCII ones
-        soup = self._replace_tables(soup)
+        if strip_html_tables:
+            soup = self._strip_tables(soup, squash_spaces=squash_html_spaces)
+        else:
+            soup = self._replace_tables(soup, squash_spaces=squash_html_spaces)
 
         # Join inline elements
         soup = self._join_inlines(soup)
@@ -141,7 +180,9 @@ def extract(self, filename, **kwargs):
         for elem in elements:
             string = elem.string
             if string is None:
-                string = self._find_any_text(elem)
+                string = self._find_any_text(
+                    elem, squash_spaces=squash_html_spaces
+                )
             string = string.strip()
             if len(string) > 0:
                 html += "\n" + string + "\n"