Skip to content

Commit

Permalink
Changed DSV file parser to check for expected line length. Remove DSV…
Browse files Browse the repository at this point in the history
… file size check. (#2158)

* Changed DSV file parser to check for expected line length. Remove DSV file size check. #2056
  • Loading branch information
Onager authored Sep 29, 2018
1 parent 893f8f1 commit df20c26
Show file tree
Hide file tree
Showing 3 changed files with 98 additions and 24 deletions.
2 changes: 1 addition & 1 deletion plaso/lib/line_reader_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def readline(self, size=None):
size = self._file_object_size - self._lines_buffer_offset

self._file_object.seek(self._lines_buffer_offset, os.SEEK_SET)
read_buffer = self._file_object.read(size)
read_buffer = self._file_object.read(read_size)

self._lines_buffer_offset += len(read_buffer)

Expand Down
105 changes: 83 additions & 22 deletions plaso/parsers/dsv_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ class DSVParser(interface.FileObjectParser):
# it can be defined here.
QUOTE_CHAR = b'"'

# The maximum size of a single field in the parser
FIELD_SIZE_LIMIT = csv.field_size_limit()

# Value that should not appear inside the file, made to test the actual
# file to see if it conforms to standards.
_MAGIC_TEST_STRING = b'RegnThvotturMeistarans'
Expand All @@ -59,6 +62,13 @@ def __init__(self, encoding=None):
"""
super(DSVParser, self).__init__()
self._encoding = encoding
if py2to3.PY_2:
self._end_of_line = b'\n'
else:
self._end_of_line = '\n'
self._maximum_line_length = (
len(self._end_of_line) +
len(self.COLUMNS) * (self.FIELD_SIZE_LIMIT + len(self.DELIMITER)))

def _ConvertRowToUnicode(self, parser_mediator, row):
"""Converts all strings in a DSV row dict to Unicode.
Expand Down Expand Up @@ -114,6 +124,65 @@ def _CreateDictReader(self, line_reader):
quotechar=quotechar, restkey=magic_test_string,
restval=magic_test_string)

# pylint: disable=missing-return-type-doc
def _CreateLineReader(self, file_object):
  """Creates an object that reads lines from a text file.

  The line reader is advanced to the beginning of the DSV content, skipping
  any header lines.

  Args:
    file_object (dfvfs.FileIO): file-like object.

  Returns:
    TextFile|BinaryLineReader: an object that implements an iterator
        over lines in a text file.

  Raises:
    UnicodeDecodeError: if the file cannot be read with the specified
        encoding.
  """
  # The Python 2 csv module reads bytes and the Python 3 csv module reads
  # Unicode strings.
  if py2to3.PY_3:
    line_reader = text_file.TextFile(
        file_object, encoding=self._encoding, end_of_line=self._end_of_line)
  else:
    line_reader = line_reader_file.BinaryLineReader(
        file_object, end_of_line=self._end_of_line)

  # If we specifically define a number of lines we should skip, do that here.
  # The maximum line length is passed as the read size for every skipped
  # header line. A UnicodeDecodeError raised while skipping is deliberately
  # not handled here; it propagates unchanged to the caller.
  for _ in range(self.NUMBER_OF_HEADER_LINES):
    line_reader.readline(self._maximum_line_length)

  return line_reader

def _HasExpectedLineLength(self, file_object):
  """Determines if a file begins with lines of the expected length.

  As we know the maximum length of valid lines in the DSV file, the presence
  of lines longer than this indicates that the file will not be parsed
  successfully, without reading excessive data from a large file.

  The position of the file object is restored before this method returns,
  including when creating the line reader or reading a line raises.

  Args:
    file_object (dfvfs.FileIO): file-like object.

  Returns:
    bool: True if the file has lines of the expected length.

  Raises:
    UnicodeDecodeError: if the file cannot be read with the specified
        encoding.
  """
  original_file_position = file_object.tell()
  try:
    line_reader = self._CreateLineReader(file_object)
    # Only the first 20 lines after the header are sampled.
    for _ in range(20):
      # Attempt to read a line that is longer than any line that should be
      # in the file.
      sample_line = line_reader.readline(self._maximum_line_length + 1)
      if len(sample_line) > self._maximum_line_length:
        return False
    return True
  finally:
    # Always rewind, so that a failed check does not leave the file object
    # at an arbitrary offset for the caller.
    file_object.seek(original_file_position)

@classmethod
def GetFormatSpecification(cls):
"""Retrieves the format specification.
Expand All @@ -134,35 +203,27 @@ def ParseFileObject(self, parser_mediator, file_object):
Raises:
UnableToParseFile: when the file cannot be parsed.
"""
file_size = file_object.get_size()
# The csv module can consume a lot of memory, 1 GiB for a 100 MiB file.
# Hence that the maximum supported file size is restricted.
if file_size > self._MAXIMUM_SUPPORTED_FILE_SIZE:
display_name = parser_mediator.GetDisplayName()
raise errors.UnableToParseFile((
'[{0:s}] Unable to parse DSV file: {1:s} size of file exceeds '
'maximum supported size').format(self.NAME, display_name))

# TODO: Replace this with detection of the file encoding via byte-order
# marks. Also see: https://github.com/log2timeline/plaso/issues/1971
if not self._encoding:
self._encoding = parser_mediator.codepage

# The Python 2 csv module reads bytes and the Python 3 csv module reads
# Unicode strings.
if py2to3.PY_3:
line_reader = text_file.TextFile(file_object, encoding=self._encoding)
else:
line_reader = line_reader_file.BinaryLineReader(file_object)

# If we specifically define a number of lines we should skip, do that here.
for _ in range(0, self.NUMBER_OF_HEADER_LINES):
line_reader.readline()

reader = self._CreateDictReader(line_reader)
try:
if not self._HasExpectedLineLength(file_object):
display_name = parser_mediator.GetDisplayName()
raise errors.UnableToParseFile((
'[{0:s}] Unable to parse DSV file: {1:s} with error: '
'unexpected line length.').format(self.NAME, display_name))
except UnicodeDecodeError as exception:
display_name = parser_mediator.GetDisplayName()
raise errors.UnableToParseFile(
'[{0:s}] Unable to parse DSV file: {1:s} with error: {2!s}.'.format(
self.NAME, display_name, exception))

row_offset = line_reader.tell()
try:
line_reader = self._CreateLineReader(file_object)
reader = self._CreateDictReader(line_reader)
row_offset = line_reader.tell()
row = next(reader)
except (StopIteration, csv.Error, UnicodeDecodeError) as exception:
display_name = parser_mediator.GetDisplayName()
Expand Down
15 changes: 14 additions & 1 deletion tests/parsers/dsv_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class TestDSVParser(dsv_parser.DSVParser):

def __init__(self):
"""Initializes a DSV parser."""
super(TestDSVParser, self).__init__()
super(TestDSVParser, self).__init__(encoding='utf-8')
self.row_offsets = []
self.rows = []

Expand Down Expand Up @@ -107,6 +107,19 @@ def testParseFileObject(self):
self.assertEqual(row['user'], 'joesmith')
self.assertEqual(row['password'], 'superrich')

@shared_test_lib.skipUnlessHasTestFile(['password.csv'])
def testHasExpectedLineLength(self):
  """Tests the _HasExpectedLineLength function."""
  parser = TestDSVParser()
  test_file_entry = self._GetTestFileEntry(['password.csv'])
  test_file_object = test_file_entry.GetFileObject()

  # With the default maximum line length the test file is accepted.
  self.assertTrue(parser._HasExpectedLineLength(test_file_object))

  # Shrink the limit so that the lines of the test file exceed it. The same
  # file object is reused for this second check.
  parser._maximum_line_length = 2
  self.assertFalse(parser._HasExpectedLineLength(test_file_object))


if __name__ == '__main__':
unittest.main()

0 comments on commit df20c26

Please sign in to comment.