
refactor hashing/dedupe #299

Merged: 36 commits, merged into master from depended-fix on Nov 25, 2024.
The diff below reflects changes from 14 of the 36 commits.

Commits (36):
0f63401  allow for multiple hash types (datawhores, Nov 16, 2024)
97bedd0  remove add_columns_hash (datawhores, Nov 16, 2024)
581e3c4  changes to hashclient and manager to allow for multiple hash types, a… (datawhores, Nov 17, 2024)
cace26e  return specific hashtype in get_unique_hashes (datawhores, Nov 17, 2024)
7721ae5  support hash_type when getting hashes from file and folder (datawhores, Nov 17, 2024)
025a458  fix: changes to hash_client to support multiple hash types (datawhores, Nov 17, 2024)
c417ce6  refactor: use download_filename for hash_table (datawhores, Nov 17, 2024)
8ad0f3c  refactor: move dupe_cleanup_options to settings and add hashing_options (datawhores, Nov 17, 2024)
d2fa988  docs: update docs for hashing + dedupe settings (datawhores, Nov 17, 2024)
8299b7b  docs: update docs for hashing + dedupe by combining (datawhores, Nov 17, 2024)
50c5e95  refactor: remove hash_options (datawhores, Nov 17, 2024)
94278a6  refactor: only hash if delete after download is on (datawhores, Nov 17, 2024)
a5715ef  refactor: fix docs by explaining settings better, also re-add disable… (datawhores, Nov 17, 2024)
8fcf36d  docs: fix hashing docs again (datawhores, Nov 17, 2024)
755fbb9  split settings into sub dicts (datawhores, Nov 17, 2024)
c76e14a  refactor: fix issues with last commit (datawhores, Nov 18, 2024)
c2fbe15  add folder for storing data and enum classes (datawhores, Nov 18, 2024)
b459691  fix: move paths confirmation loop out of nested loop, as it is repeated (datawhores, Nov 18, 2024)
035cce1  fix: redo last commit, use a tuple set to check for current key as a p… (datawhores, Nov 18, 2024)
c728ac3  refactor: redo dupe cleanup config def, also add enums for dupe (datawhores, Nov 18, 2024)
1810b42  fix: lower case sub keys (datawhores, Nov 18, 2024)
571cf1f  fix: lower case sub keys (datawhores, Nov 18, 2024)
9d50e3f  refactor: rename implace as in_place (datawhores, Nov 18, 2024)
1c46c67  refactor: finish most refactoring of deduping based on discussion (datawhores, Nov 19, 2024)
2b3b76c  merge master (datawhores, Nov 19, 2024)
657dacb  refactor: fix prompts for deduping changes (datawhores, Nov 19, 2024)
1f10ab9  fix: remove prompt files (datawhores, Nov 19, 2024)
06d49f0  fix docs (datawhores, Nov 19, 2024)
f697553  ran ruff (datawhores, Nov 19, 2024)
352785c  refactor: remove keep_newest, keep_oldest; switch to a simple bool (datawhores, Nov 25, 2024)
23a949d  Update cyberdrop_dl/managers/config_manager.py (datawhores, Nov 25, 2024)
e4ed925  refactor: remove unneeded _eq_ function (datawhores, Nov 25, 2024)
ff0f51e  fix: throw error if key is not right (datawhores, Nov 25, 2024)
a84a4fa  sync: remove browser cookie file (datawhores, Nov 25, 2024)
aed6b5b  Merge branch 'depended-fix' of https://github.com/datawhores/CyberDro… (datawhores, Nov 25, 2024)
6298de9  Merge branch 'master' into depended-fix (datawhores, Nov 25, 2024)
54 changes: 35 additions & 19 deletions cyberdrop_dl/clients/hash_client.py
@@ -47,37 +47,49 @@
self.manager = manager
self.hashes = defaultdict(lambda: None)
self.prev_hashes = None
self.xxhash="xxh128"
self.md5="md5"
self.sha256="sha256"

async def startup(self) -> None:
self.prev_hashes = set(await self.manager.db_manager.hash_table.get_all_unique_hashes())
self.prev_hashes = set(await self.manager.db_manager.hash_table.get_all_unique_hashes(self.xxhash))

async def hash_directory(self, path: Path) -> None:
path = Path(path)
async with self.manager.live_manager.get_hash_live(stop=True):
if not path.is_dir():
raise NotADirectoryError
for file in path.rglob("*"):
await self.hash_item(file, None, None)
await self.hash_item_helper(file, None, None)

@staticmethod
def _get_key_from_file(file: Path | str):
return str(Path(file).absolute())
async def hash_item_helper(self, file: Path | str, original_filename: str, referer: URL):
hash=await self.hash_item(file, original_filename,referer,hash_type=self.xxhash)
if self.manager.config_manager.settings_data["Dupe_Cleanup_Options"]["add_md5_hash"]:
await self.hash_item(file, original_filename,referer,hash_type=self.md5)
if self.manager.config_manager.settings_data["Dupe_Cleanup_Options"]["add_sha256_hash"]:
await self.hash_item(file, original_filename, referer, hash_type=self.sha256)
return hash


async def hash_item(self, file: Path | str, original_filename: str, referer: URL) -> str:
async def hash_item(self, file: Path | str, original_filename: str, referer: URL,hash_type=None) -> str:
"""Generates hash of a file."""
key = self._get_key_from_file(file)
file = Path(file)
if not file.is_file():
return None
if self.hashes[key]:
return self.hashes[key]
if self.hashes[(key,hash_type)]:
return self.hashes[(key,hash_type)]
self.manager.progress_manager.hash_progress.update_currently_hashing(file)
hash = await self.manager.db_manager.hash_table.get_file_hash_exists(file)
hash = await self.manager.db_manager.hash_table.get_file_hash_exists(file,hash_type)
try:
if not hash:
hash = await self.manager.hash_manager.hash_file(file)
hash = await self.manager.hash_manager.hash_file(file,hash_type)
await self.manager.db_manager.hash_table.insert_or_update_hash_db(
hash,
hash_type,
file,
original_filename,
referer,
@@ -87,25 +99,31 @@
self.manager.progress_manager.hash_progress.add_prev_hash()
await self.manager.db_manager.hash_table.insert_or_update_hash_db(
hash,
hash_type,
file,
original_filename,
referer,
)
except Exception as e:
log(f"Error hashing {file} : {e}", 40, exc_info=True)
self.hashes[key] = hash
self.hashes[(key,hash_type)] = hash
return hash

async def hash_item_during_download(self, media_item: MediaItem) -> None:
try:
if self.manager.config_manager.global_settings_data["Dupe_Cleanup_Options"]["hash_while_downloading"]:
await self.hash_item(media_item.complete_file, media_item.original_filename, media_item.referer)

[CI failure] Ruff W293: blank line contains whitespace (cyberdrop_dl/clients/hash_client.py:114:1)
if not self.manager.config_manager.settings_data["Dupe_Cleanup_Options"]["delete_after_download"]:
return
if self.manager.config_manager.settings_data["Dupe_Cleanup_Options"]["disable_individual_download_hashing"]:
return

[CI failure] Ruff W293: blank line contains whitespace (cyberdrop_dl/clients/hash_client.py:119:1)
await self.hash_item_helper(media_item.complete_file, media_item.original_filename, media_item.referer)
except Exception as e:
log(f"After hash processing failed: {media_item.complete_file} with error {e}", 40, exc_info=True)

async def cleanup_dupes(self) -> None:
with self.manager.live_manager.get_hash_live(stop=True):
if not self.manager.config_manager.global_settings_data["Dupe_Cleanup_Options"]["delete_after_download"]:
if not self.manager.config_manager.settings_data["Dupe_Cleanup_Options"]["delete_after_download"]:
return
file_hashes_dict = await self.get_file_hashes_dict()
with self.manager.live_manager.get_remove_file_via_hash_live(stop=True):
@@ -117,13 +135,11 @@
for size, data in size_dict.items():
selected_file = Path(data["selected"])
other_files = data["others"]

# Get all matches from the database
all_matches = [
Path(x[0], x[1])
for x in await self.manager.db_manager.hash_table.get_files_with_hash_matches(hash, size)
for x in await self.manager.db_manager.hash_table.get_files_with_hash_matches(hash, size,self.xxhash)
]

# Filter out files with the same path as any file in other_files
other_matches = [match for match in all_matches if str(match) not in other_files]
# Filter files based on if the file exists
@@ -166,7 +182,7 @@
hashes_dict = defaultdict(lambda: defaultdict(list))
# first compare downloads to each other
for media_item in list(self.manager.path_manager.completed_downloads):
hash = await self.hash_item(media_item.complete_file, media_item.original_filename, media_item.referer)
hash = await self.hash_item_helper(media_item.complete_file, media_item.original_filename, media_item.referer)
item = media_item.complete_file.absolute()
try:
size = item.stat().st_size
@@ -205,7 +221,7 @@
return hashes_dict

def send2trash(self, path: Path) -> None:
if self.manager.config_manager.global_settings_data["Dupe_Cleanup_Options"]["delete_off_disk"]:
if self.manager.config_manager.settings_data["Dupe_Cleanup_Options"]["delete_off_disk"]:
Path(path).unlink(missing_ok=True)
else:
send2trash(path)
@@ -218,10 +234,10 @@

def keep_new_download(self, hash: str, selected_file: Path | str) -> bool:
return bool(
self.manager.config_manager.global_settings_data["Dupe_Cleanup_Options"]["keep_new_download"]
self.manager.config_manager.settings_data["Dupe_Cleanup_Options"]["keep_new_download"]
or hash not in self.prev_hashes
or str(selected_file) in self.manager.path_manager.prev_downloads_paths,
or Path(selected_file) in self.manager.path_manager.prev_downloads_paths,
)

def keep_prev_file(self) -> bool:
return self.manager.config_manager.global_settings_data["Dupe_Cleanup_Options"]["keep_prev_download"]
return self.manager.config_manager.settings_data["Dupe_Cleanup_Options"]["keep_prev_download"]
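Taken together, the hash_client changes swap a single per-path cache for one keyed by (path, hash_type): xxh128 is always computed, and md5/sha256 are added only when the corresponding settings are on. Below is a minimal sketch of that caching pattern using hashlib stand-ins; the helper names are illustrative, not the merged API.

```python
import hashlib
from collections import defaultdict
from pathlib import Path

# Mirrors self.hashes = defaultdict(lambda: None), now keyed by (path, hash_type):
# looking up a pair that was never hashed returns None instead of raising KeyError.
_cache = defaultdict(lambda: None)

def compute_hash(file: Path, hash_type: str) -> str | None:
    """Illustrative stand-in for HashClient.hash_item: hash once, then serve from cache."""
    key = (str(file.absolute()), hash_type)
    if _cache[key]:
        return _cache[key]
    if not file.is_file():
        return None
    hasher = hashlib.new(hash_type)  # hashlib covers "md5"/"sha256"; xxh128 needs the xxhash package
    with open(file, "rb") as fp:
        for chunk in iter(lambda: fp.read(1024 * 1024), b""):
            hasher.update(chunk)
    _cache[key] = hasher.hexdigest()
    return _cache[key]

def compute_all_hashes(file: Path, add_md5: bool, add_sha256: bool) -> str | None:
    """Mirrors hash_item_helper: the primary hash is returned, extras are side effects."""
    primary = compute_hash(file, "sha256")  # the real code uses xxh128 as the primary type
    if add_md5:
        compute_hash(file, "md5")
    if add_sha256:
        compute_hash(file, "sha256")
    return primary
```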
29 changes: 23 additions & 6 deletions cyberdrop_dl/managers/hash_manager.py
@@ -8,29 +8,46 @@
from cyberdrop_dl.clients.hash_client import HashClient

try:
from xxhash import xxh128 as hasher
from xxhash import xxh128 as xxhasher
except ImportError:
from hashlib import md5 as hasher
xxhasher = None
from hashlib import md5 as md5hasher
from hashlib import sha256 as sha256hasher

if TYPE_CHECKING:
from cyberdrop_dl.managers.manager import Manager


class HashManager:
def __init__(self, manager: Manager) -> None:
self.hasher = hasher
self.xx_hasher = xxhasher
self.md5_hasher = md5hasher
self.sha_256_hasher = sha256hasher
self.hash_client = HashClient(manager) # Initialize hash client in constructor

async def startup(self) -> None:
await self.hash_client.startup()

async def hash_file(self, filename: str) -> str:

[CI failure] Ruff W293: blank line contains whitespace (cyberdrop_dl/managers/hash_manager.py:30:1)

async def hash_file(self, filename: str,hash_type:str) -> str:
file_path = Path.cwd() / filename
async with aiofiles.open(file_path, "rb") as fp:
CHUNK_SIZE = 1024 * 1024 # 1MB
filedata = await fp.read(CHUNK_SIZE)
current_hasher = hasher() # Use the initialized hasher
current_hasher = self._get_hasher(hash_type) # Use the initialized hasher
while filedata:
current_hasher.update(filedata)
filedata = await fp.read(CHUNK_SIZE)
return current_hasher.hexdigest()

[CI failure] Ruff W293: blank line contains whitespace (cyberdrop_dl/managers/hash_manager.py:42:1)
def _get_hasher(self, hash_type):
if hash_type == "xxh128" and not self.xx_hasher:
raise ImportError("xxhash module is not installed")
elif hash_type == "xxh128":
return self.xx_hasher()
elif hash_type == "md5":
return self.md5_hasher()
elif hash_type == "sha256":
return self.sha_256_hasher()
else:
raise ValueError("Invalid hash type")
33 changes: 10 additions & 23 deletions cyberdrop_dl/ui/prompts/settings_global_prompts.py
@@ -219,60 +219,47 @@ def edit_dupe_settings_prompt(manager: Manager) -> None:

delete_after = inquirer.select(
message="Toggle duplicate files deletion using hashes:",
default=manager.config_manager.global_settings_data["Dupe_Cleanup_Options"]["delete_after_download"],
choices=[Choice(True, "True"), Choice(False, "False")],
vi_mode=manager.vi_mode,
).execute()
hash_while_downloading = inquirer.select(
message="Hash Files during downloading:",
long_instruction="""
Generate file hashes after each download, instead of batched
together during deduplication process
""",
default=manager.config_manager.global_settings_data["Dupe_Cleanup_Options"]["hash_while_downloading"],
default=manager.config_manager.settings_data["Dupe_Cleanup_Options"]["delete_after_download"],
choices=[Choice(True, "True"), Choice(False, "False")],
vi_mode=manager.vi_mode,
).execute()

dedupe_already_downloaded = inquirer.select(
message="How to handle files already on system: ",
default=manager.config_manager.global_settings_data["Dupe_Cleanup_Options"]["dedupe_already_downloaded"],
default=manager.config_manager.settings_data["Dupe_Cleanup_Options"]["dedupe_already_downloaded"],
choices=[Choice(True, "Mark Existing Files as 'new'"), Choice(False, "Skip Existing Files")],
vi_mode=manager.vi_mode,
).execute()

keep_current = inquirer.select(
message="What to do with new file when deduping: ",
long_instruction="Keep a curent file. Current files are files that were either downloaded or a file was skipped for already existing when dedupe_already_downloaded is true",
default=manager.config_manager.global_settings_data["Dupe_Cleanup_Options"]["keep_new_download"],
default=manager.config_manager.settings_data["Dupe_Cleanup_Options"]["keep_new_download"],
choices=[Choice(True, "Keep a newfile"), Choice(False, "Delete all new files")],
vi_mode=manager.vi_mode,
).execute()

keep_prev = inquirer.select(
message="What to do with previous file(s) when deduping:",
default=manager.config_manager.global_settings_data["Dupe_Cleanup_Options"],
default=manager.config_manager.settings_data["Dupe_Cleanup_Options"],
long_instruction="Any file that is not in the list of current files with a matching hash",
choices=[Choice(True, "Keep a previous file "), Choice(False, "Delete all previous files")],
vi_mode=manager.vi_mode,
).execute()

delete_off_disk = inquirer.select(
message="How to handle removal of files: ",
default=manager.config_manager.global_settings_data["Dupe_Cleanup_Options"]["delete_off_disk"],
default=manager.config_manager.settings_data["Dupe_Cleanup_Options"]["delete_off_disk"],
choices=[Choice(True, "Permanently Delete File"), Choice(False, "Send to Trash")],
vi_mode=manager.vi_mode,
).execute()

manager.config_manager.global_settings_data["Dupe_Cleanup_Options"]["delete_after_download"] = delete_after
manager.config_manager.global_settings_data["Dupe_Cleanup_Options"]["hash_while_downloading"] = (
hash_while_downloading
)
manager.config_manager.global_settings_data["Dupe_Cleanup_Options"]["keep_prev_download"] = keep_prev
manager.config_manager.global_settings_data["Dupe_Cleanup_Options"]["keep_new_download"] = keep_current
manager.config_manager.settings_data["Dupe_Cleanup_Options"]["delete_after_download"] = delete_after
manager.config_manager.settings_data["Dupe_Cleanup_Options"]["keep_prev_download"] = keep_prev
manager.config_manager.settings_data["Dupe_Cleanup_Options"]["keep_new_download"] = keep_current

manager.config_manager.global_settings_data["Dupe_Cleanup_Options"]["dedupe_already_downloaded"] = (
manager.config_manager.settings_data["Dupe_Cleanup_Options"]["dedupe_already_downloaded"] = (
dedupe_already_downloaded
)

manager.config_manager.global_settings_data["Dupe_Cleanup_Options"]["delete_off_disk"] = delete_off_disk
manager.config_manager.settings_data["Dupe_Cleanup_Options"]["delete_off_disk"] = delete_off_disk
19 changes: 11 additions & 8 deletions cyberdrop_dl/utils/args/config_definitions.py
@@ -130,6 +130,17 @@
"auto_import": False,
"sites": None,
},
"Dupe_Cleanup_Options": {
"delete_after_download": False,
"keep_prev_download": False,
"keep_new_download": True,
"dedupe_already_downloaded": False,
"delete_off_disk": False,
"add_md5_hash": False,
"disable_individual_download_hashing":False,
"add_sha256_hash": False,
},

[CI failure] Ruff W293: blank line contains whitespace (cyberdrop_dl/utils/args/config_definitions.py:143:1)
}

global_settings: dict = {
@@ -152,14 +163,6 @@
"max_simultaneous_downloads_per_domain": 3,
"download_speed_limit": 0,
},
"Dupe_Cleanup_Options": {
"delete_after_download": False,
"hash_while_downloading": False,
"keep_prev_download": False,
"keep_new_download": True,
"dedupe_already_downloaded": False,
"delete_off_disk": False,
},
"UI_Options": {
"vi_mode": False,
"refresh_rate": 10,
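Since Dupe_Cleanup_Options moves from global_settings into the per-config settings dict, configs written before this PR will lack the new keys (add_md5_hash, add_sha256_hash, disable_individual_download_hashing). A common way to handle that is to merge the defaults underneath the user's values; the sketch below assumes config_manager does something along these lines (merge_dupe_options is a hypothetical name):

```python
DEFAULT_DUPE_OPTIONS = {
    "delete_after_download": False,
    "keep_prev_download": False,
    "keep_new_download": True,
    "dedupe_already_downloaded": False,
    "delete_off_disk": False,
    "add_md5_hash": False,
    "add_sha256_hash": False,
    "disable_individual_download_hashing": False,
}

def merge_dupe_options(user_settings: dict) -> dict:
    """Defaults first, then user overrides, so new keys appear automatically."""
    user = user_settings.get("Dupe_Cleanup_Options", {})
    user_settings["Dupe_Cleanup_Options"] = {**DEFAULT_DUPE_OPTIONS, **user}
    return user_settings

# An old config keeps its explicit choices but gains the new keys:
old_config = {"Dupe_Cleanup_Options": {"delete_after_download": True}}
merged = merge_dupe_options(old_config)
assert merged["Dupe_Cleanup_Options"]["delete_after_download"] is True
assert merged["Dupe_Cleanup_Options"]["add_md5_hash"] is False
```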
30 changes: 27 additions & 3 deletions cyberdrop_dl/utils/database/table_definitions.py
@@ -29,13 +29,37 @@

create_temp_referer = """CREATE TABLE IF NOT EXISTS temp_referer (referer TEXT);"""

create_hash = """CREATE TABLE IF NOT EXISTS hash (
create_files= """
CREATE TABLE IF NOT EXISTS files (
folder TEXT,
download_filename TEXT,
original_filename TEXT,
file_size INT,
referer TEXT,
PRIMARY KEY (folder, download_filename)
);

"""

create_hash= """
CREATE TABLE IF NOT EXISTS hash (
folder TEXT,
download_filename TEXT,
hash_type TEXT,
hash TEXT,
PRIMARY KEY (folder, download_filename, hash_type),
FOREIGN KEY (folder, download_filename) REFERENCES files(folder, download_filename)
);

"""

create_temp_hash= """
CREATE TABLE IF NOT EXISTS temp_hash (
folder TEXT,
download_filename TEXT,
hash_type TEXT,
hash TEXT,
UNIQUE (folder, original_filename)
PRIMARY KEY (folder, original_filename,hash)
PRIMARY KEY (folder, download_filename, hash_type),
FOREIGN KEY (folder, download_filename) REFERENCES files(folder, download_filename)
);
"""