Changed DSV file parser to check for expected line length. Remove DSV file size check. (#2158)

* Changed DSV file parser to check for expected line length. Remove DSV file size check. #2056
log2timeline · Sep 29, 2018 · df20c26 · df20c26
1 parent 893f8f1
commit df20c26
Show file tree

Hide file tree

Showing 3 changed files with 98 additions and 24 deletions.
diff --git a/plaso/lib/line_reader_file.py b/plaso/lib/line_reader_file.py
@@ -100,7 +100,7 @@ def readline(self, size=None):
         size = self._file_object_size - self._lines_buffer_offset
 
       self._file_object.seek(self._lines_buffer_offset, os.SEEK_SET)
-      read_buffer = self._file_object.read(size)
+      read_buffer = self._file_object.read(read_size)
 
       self._lines_buffer_offset += len(read_buffer)
 

diff --git a/plaso/parsers/dsv_parser.py b/plaso/parsers/dsv_parser.py
@@ -43,6 +43,9 @@ class DSVParser(interface.FileObjectParser):
   # it can be defined here.
   QUOTE_CHAR = b'"'
 
+  # The maximum size of a single field, as reported by the csv module.
+  FIELD_SIZE_LIMIT = csv.field_size_limit()
+
   # Value that should not appear inside the file, made to test the actual
   # file to see if it confirms to standards.
   _MAGIC_TEST_STRING = b'RegnThvotturMeistarans'
@@ -59,6 +62,13 @@ def __init__(self, encoding=None):
     """
     super(DSVParser, self).__init__()
     self._encoding = encoding
+    if py2to3.PY_2:
+      self._end_of_line = b'\n'
+    else:
+      self._end_of_line = '\n'
+    self._maximum_line_length = (
+        len(self._end_of_line) +
+        len(self.COLUMNS) * (self.FIELD_SIZE_LIMIT + len(self.DELIMITER)))
 
   def _ConvertRowToUnicode(self, parser_mediator, row):
     """Converts all strings in a DSV row dict to Unicode.
@@ -114,6 +124,65 @@ def _CreateDictReader(self, line_reader):
         quotechar=quotechar, restkey=magic_test_string,
         restval=magic_test_string)
 
+  # pylint: disable=missing-return-type-doc
+  def _CreateLineReader(self, file_object):
+    """Creates an object that reads lines from a text file.
+
+    The line reader is advanced to the beginning of the DSV content, skipping
+    any header lines.
+
+    Args:
+      file_object (dfvfs.FileIO): file-like object.
+
+    Returns:
+      TextFile|BinaryLineReader: an object that implements an iterator
+          over lines in a text file.
+
+    Raises:
+      UnicodeDecodeError: if the file cannot be read with the specified
+          encoding.
+    """
+    # The Python 2 csv module reads bytes and the Python 3 csv module reads
+    # Unicode strings.
+    if py2to3.PY_3:
+      line_reader = text_file.TextFile(
+          file_object, encoding=self._encoding, end_of_line=self._end_of_line)
+    else:
+      line_reader = line_reader_file.BinaryLineReader(
+          file_object, end_of_line=self._end_of_line)
+    # If we specifically define a number of lines we should skip, do that here.
+    for _ in range(0, self.NUMBER_OF_HEADER_LINES):
+      try:
+        line_reader.readline(self._maximum_line_length)
+      except UnicodeDecodeError:
+        raise
+    return line_reader
+
+  def _HasExpectedLineLength(self, file_object):
+    """Determines if a file begins with lines of the expected length.
+
+    The maximum length of a valid line in the DSV file is known, so a line
+    longer than that indicates the file will not be parsed successfully. This
+    is detected without reading excessive data from a large file.
+
+    Args:
+      file_object (dfvfs.FileIO): file-like object.
+
+    Returns:
+      bool: True if the file has lines of the expected length.
+    """
+    original_file_position = file_object.tell()
+    line_reader = self._CreateLineReader(file_object)
+    for _ in range(0, 20):
+      # Attempt to read a line that is longer than any line that should be in
+      # the file.
+      sample_line = line_reader.readline(self._maximum_line_length + 1)
+      if len(sample_line) > self._maximum_line_length:
+        file_object.seek(original_file_position)
+        return False
+    file_object.seek(original_file_position)
+    return True
+
   @classmethod
   def GetFormatSpecification(cls):
     """Retrieves the format specification.
@@ -134,35 +203,27 @@ def ParseFileObject(self, parser_mediator, file_object):
     Raises:
       UnableToParseFile: when the file cannot be parsed.
     """
-    file_size = file_object.get_size()
-    # The csv module can consume a lot of memory, 1 GiB for a 100 MiB file.
-    # Hence that the maximum supported file size is restricted.
-    if file_size > self._MAXIMUM_SUPPORTED_FILE_SIZE:
-      display_name = parser_mediator.GetDisplayName()
-      raise errors.UnableToParseFile((
-          '[{0:s}] Unable to parse DSV file: {1:s} size of file exceeds '
-          'maximum supported size').format(self.NAME, display_name))
-
     # TODO: Replace this with detection of the file encoding via byte-order
     # marks. Also see: https://github.com/log2timeline/plaso/issues/1971
     if not self._encoding:
       self._encoding = parser_mediator.codepage
 
-    # The Python 2 csv module reads bytes and the Python 3 csv module Unicode
-    # reads strings.
-    if py2to3.PY_3:
-      line_reader = text_file.TextFile(file_object, encoding=self._encoding)
-    else:
-      line_reader = line_reader_file.BinaryLineReader(file_object)
-
-    # If we specifically define a number of lines we should skip, do that here.
-    for _ in range(0, self.NUMBER_OF_HEADER_LINES):
-      line_reader.readline()
-
-    reader = self._CreateDictReader(line_reader)
+    try:
+      if not self._HasExpectedLineLength(file_object):
+        display_name = parser_mediator.GetDisplayName()
+        raise errors.UnableToParseFile((
+            '[{0:s}] Unable to parse DSV file: {1:s} with error: '
+            'unexpected line length.').format(self.NAME, display_name))
+    except UnicodeDecodeError as exception:
+      display_name = parser_mediator.GetDisplayName()
+      raise errors.UnableToParseFile(
+          '[{0:s}] Unable to parse DSV file: {1:s} with error: {2!s}.'.format(
+              self.NAME, display_name, exception))
 
-    row_offset = line_reader.tell()
     try:
+      line_reader = self._CreateLineReader(file_object)
+      reader = self._CreateDictReader(line_reader)
+      row_offset = line_reader.tell()
       row = next(reader)
     except (StopIteration, csv.Error, UnicodeDecodeError) as exception:
       display_name = parser_mediator.GetDisplayName()

diff --git a/tests/parsers/dsv_parser.py b/tests/parsers/dsv_parser.py
@@ -25,7 +25,7 @@ class TestDSVParser(dsv_parser.DSVParser):
 
   def __init__(self):
     """Initializes a DSV parser."""
-    super(TestDSVParser, self).__init__()
+    super(TestDSVParser, self).__init__(encoding='utf-8')
     self.row_offsets = []
     self.rows = []
 
@@ -107,6 +107,19 @@ def testParseFileObject(self):
     self.assertEqual(row['user'], 'joesmith')
     self.assertEqual(row['password'], 'superrich')
 
+  @shared_test_lib.skipUnlessHasTestFile(['password.csv'])
+  def testHasExpectedLineLength(self):
+    """Tests the _HasExpectedLineLength function."""
+    parser = TestDSVParser()
+    test_file_entry = self._GetTestFileEntry(['password.csv'])
+    test_file_object = test_file_entry.GetFileObject()
+
+    self.assertTrue(parser._HasExpectedLineLength(test_file_object))
+
+    parser._maximum_line_length = 2
+    parser._HasExpectedLineLength(test_file_object)
+    self.assertFalse(parser._HasExpectedLineLength(test_file_object))
+
 
 if __name__ == '__main__':
   unittest.main()