Skip to content

Commit

Permalink
Update search files (mrpowers-io#215)
Browse files Browse the repository at this point in the history
* add results to `search_file()` and `search_files()`

* update dict schema from {path:count} -> {path:{keyword:count}}

* remove duplicate line printing

* update type hint

* add class for keyword search results

* use dataclass instead of typeddict
  • Loading branch information
jeffbrennan authored Feb 27, 2024
1 parent a661769 commit ac07479
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 14 deletions.
38 changes: 28 additions & 10 deletions quinn/keyword_finder.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import os
from dataclasses import dataclass
from glob import iglob

default_keywords = [
Expand Down Expand Up @@ -40,42 +41,59 @@
"sparkContext",
]

@dataclass
class SearchResult:
    """Class to hold the results of a file search.

    :param file_path: The path to the file that was searched.
    :type file_path: str
    :param word_count: A dictionary mapping each keyword to the count recorded for it in the file.
    :type word_count: dict[str, int]
    """

    file_path: str
    word_count: dict[str, int]


def search_file(path: str, keywords: list[str] = default_keywords) -> SearchResult:
    """Searches a file for keywords and prints the line number and line containing the keyword.

    A line is printed at most once, even when it contains several keywords.

    :param path: The path to the file to search.
    :type path: str
    :param keywords: The list of keywords to search for.
    :type keywords: list[str]
    :returns: The file path and, for each keyword in `keywords`, the number of lines containing it.
    :rtype: SearchResult
    """
    match_results = SearchResult(file_path=path, word_count={keyword: 0 for keyword in keywords})

    print(f"\nSearching: {path}")
    with open(path) as f:
        for line_number, line in enumerate(f, 1):
            line_printed = False
            for keyword in keywords:
                if keyword in line:
                    # Counted once per line per keyword (substring test, not occurrence count).
                    match_results.word_count[keyword] += 1

                    if not line_printed:
                        # Forward the caller's keywords; without this argument
                        # keyword_format falls back to default_keywords.
                        print(f"{line_number}: {keyword_format(line, keywords)}", end="")
                        line_printed = True

    return match_results


def search_files(path: str, keywords: list[str] = default_keywords) -> list[SearchResult]:
    """Searches all files in a directory for keywords.

    :param path: The path to the directory to search.
    :type path: str
    :param keywords: The list of keywords to search for.
    :type keywords: list[str]
    :returns: One `SearchResult` per file found under `path`, as returned by `search_file`.
    :rtype: list[SearchResult]
    """
    rootdir_glob = f"{path}/**/*"
    # Recursive glob also yields directories; keep regular files only.
    file_list = [f for f in iglob(rootdir_glob, recursive=True) if os.path.isfile(f)]
    # Each file is searched exactly once; the results are collected in glob order.
    return [search_file(f, keywords) for f in file_list]


def keyword_format(input: str, keywords: list[str] = default_keywords) -> str:
Expand Down
17 changes: 13 additions & 4 deletions tests/test_keyword_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,22 @@


def test_search_file():
    """search_file returns the searched path and per-keyword counts for the sample PySpark file."""
    file_path = "tests/test_files/some_pyspark.py"
    results = search_file(file_path)

    assert results.file_path == file_path
    assert results.word_count["rdd"] == 5
    assert results.word_count["sparkContext"] == 2


def test_search_files():
    """search_files returns one result per file, including files with no keyword matches."""
    results = search_files("tests/test_files")

    # Pick out the two fixture files of interest by their reported path.
    pyspark_file = next(result for result in results if result.file_path == "tests/test_files/some_pyspark.py")
    csv_file = next(result for result in results if result.file_path == "tests/test_files/good_schema1.csv")

    assert pyspark_file.word_count["rdd"] == 5
    assert pyspark_file.word_count["sparkContext"] == 2
    assert csv_file.word_count["rdd"] == 0


def test_keyword_format():
Expand All @@ -21,5 +32,3 @@ def test_surround_substring():
assert "spark **rdd|| stuff" == surround_substring("spark rdd stuff", "rdd", "**", "||")
assert "spark **rdd|| stuff with **rdd||" == surround_substring("spark rdd stuff with rdd", "rdd", "**", "||")
assert "spark **rdd||dd stuff" == surround_substring("spark rdddd stuff", "rdd", "**", "||")


0 comments on commit ac07479

Please sign in to comment.