Skip to content

Commit

Permalink
refactor: finish most refactoring of deduping based on discussion
Browse files Browse the repository at this point in the history
  • Loading branch information
datawhores committed Nov 19, 2024
1 parent 9d50e3f commit 1c46c67
Show file tree
Hide file tree
Showing 11 changed files with 130 additions and 148 deletions.
8 changes: 1 addition & 7 deletions cyberdrop_dl/clients/download_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,13 +244,7 @@ async def handle_media_item_completion(self, media_item: MediaItem, downloaded:
"""Sends to hash client to handle hashing and marks as completed/current download."""
try:
await self.manager.hash_manager.hash_client.hash_item_during_download(media_item)
if (
downloaded
or self.manager.config_manager.settings_data["Dupe_Cleanup_Options"]["dedupe_already_downloaded"]
):
self.manager.path_manager.add_completed(media_item)
if not downloaded:
self.manager.path_manager.add_prev(media_item)
self.manager.path_manager.add_completed(media_item)
except Exception:
log(f"Error handling media item completion of: {media_item.complete_file}", 10, exc_info=True)

Expand Down
140 changes: 43 additions & 97 deletions cyberdrop_dl/clients/hash_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,12 @@ class HashClient:
def __init__(self, manager: Manager) -> None:
self.manager = manager
self.hashes = defaultdict(lambda: None)
self.prev_hashes = None
self.xxhash="xxh128"
self.md5="md5"
self.sha256="sha256"

async def startup(self) -> None:
self.prev_hashes = set(await self.manager.db_manager.hash_table.get_all_unique_hashes(self.xxhash))
pass

async def hash_directory(self, path: Path) -> None:
path = Path(path)
Expand All @@ -67,9 +66,9 @@ def _get_key_from_file(file: Path | str):
return str(Path(file).absolute())
async def hash_item_helper(self, file: Path | str, original_filename: str, referer: URL):
hash=await self.hash_item(file, original_filename,referer,hash_type=self.xxhash)
if self.manager.config_manager.settings_data["Dupe_Cleanup_Options"]["Hashing_Modications"]["allow_md5_hash"]:
if self.manager.config_manager.settings_data["Dupe_Cleanup_Options"]["allow_md5_hash"]:
await self.hash_item(file, original_filename,referer,hash_type=self.md5)
if self.manager.config_manager.settings_data["Dupe_Cleanup_Options"]["Hashing_Modications"]["allow_sha256_hash"]:
if self.manager.config_manager.settings_data["Dupe_Cleanup_Options"]["allow_sha256_hash"]:
await self.hash_item(file, original_filename, referer, hash_type=self.sha256)
return hash

Expand Down Expand Up @@ -112,67 +111,50 @@ async def hash_item(self, file: Path | str, original_filename: str, referer: URL
async def hash_item_during_download(self, media_item: MediaItem) -> None:
try:

if self.manager.config_manager.settings_data["Dupe_Cleanup_Options"]["hashing"].IN_PLACE:
await self.hash_item_helper(media_item.complete_file, media_item.original_filename, media_item.referer)
if self.manager.config_manager.settings_data["Dupe_Cleanup_Options"]["hashing"]!="IN_PLACE":
return
await self.hash_item_helper(media_item.complete_file, media_item.original_filename, media_item.referer)
except Exception as e:
log(f"After hash processing failed: {media_item.complete_file} with error {e}", 40, exc_info=True)

async def cleanup_dupes(self) -> None:
async def cleanup_dupes_after_download(self) -> None:
with self.manager.live_manager.get_hash_live(stop=True):
if not self.manager.config_manager.settings_data["Dupe_Cleanup_Options"]["enable_dedupe_settings"]:
if self.manager.config_manager.settings_data["Dupe_Cleanup_Options"]["hashing"]=="OFF":
return
if self.manager.config_manager.settings_data["Dupe_Cleanup_Options"]["dedupe"]=="OFF":
return
file_hashes_dict = await self.get_file_hashes_dict()
with self.manager.live_manager.get_remove_file_via_hash_live(stop=True):
final_candiates_dict = self.get_candiate_per_group(file_hashes_dict)
await self.final_dupe_cleanup(final_candiates_dict)
with self.manager.live_manager.get_remove_file_via_hash_live(stop=True):
await self.final_dupe_cleanup(file_hashes_dict)

async def final_dupe_cleanup(self, final_dict: dict[str, dict]) -> None:
async def final_dupe_cleanup(self, final_dict: dict[str, dict]) -> None:
"""cleanup files based on dedupe setting"""
for hash, size_dict in final_dict.items():
for size, data in size_dict.items():
selected_file = Path(data["selected"])
other_files = data["others"]
for size in size_dict.keys():
# Get all matches from the database
all_matches = [
Path(x[0], x[1])
(Path(x[0], x[1]))
for x in await self.manager.db_manager.hash_table.get_files_with_hash_matches(hash, size,self.xxhash)
]
# Filter out files with the same path as any file in other_files
other_matches = [match for match in all_matches if str(match) not in other_files]
#reverse
if self.keep_newest or self.keep_newest_all:
all_matches=reversed(all_matches)
# Filter files based on if the file exists
existing_other_matches = [file for file in other_matches if file.exists()]

if self.delete_all_prev_downloads():
for ele in existing_other_matches:
if not ele.exists():
continue
try:
if self.send2trash(ele):
log(f"removed prev download: {ele!s} with hash {hash}", 10)
self.manager.progress_manager.hash_progress.add_removed_prev_file()
except OSError:
continue
# keep a previous downloads
else:
for ele in existing_other_matches[1:]:
if not ele.exists():
continue
try:
if self.send2trash(ele):
log(f"removed prev download: {ele!s} with hash {hash}", 10)
self.manager.progress_manager.hash_progress.add_removed_prev_file()
except OSError:
continue
# delete selected current download
if self.delete_selected_current_download(hash, selected_file):
if self.keep_oldest or self.keep_newest:
all_matches = [file for file in all_matches if file.exists()]
for file in all_matches[1:]:
try:
if selected_file.exists():
if self.send2trash(selected_file):
log(f"removed new download:{selected_file} with hash {hash}", 10)
self.manager.progress_manager.hash_progress.add_removed_file()

if not file.exists():
continue
self.send2trash(file)
log(f"Removed new download : {file} with hash {hash}", 10)
self.manager.progress_manager.hash_progress.add_removed_file()
except OSError:
pass




async def get_file_hashes_dict(self) -> dict:
hashes_dict = defaultdict(lambda: defaultdict(list))
# first compare downloads to each other
Expand All @@ -187,61 +169,25 @@ async def get_file_hashes_dict(self) -> dict:
log(f"After hash processing failed: {item} with error {e}", 40, exc_info=True)
return hashes_dict

def get_candiate_per_group(self, hashes_dict: dict[str, dict[int, list[Path]]]) -> dict:
# create dictionary with one selected file, per value and list of other files with matching hashes
for hash, size_dict in hashes_dict.items():
for size, files in size_dict.items():
selected_file = None
for file in files:
if file.is_file():
selected_file = file
if file in self.manager.path_manager.prev_downloads_paths:
break
continue

for file in filter(lambda x: x != selected_file, files):
try:
if self.send2trash(file):
log(f"removed new download : {file} with hash {hash}", 10)
self.manager.progress_manager.hash_progress.add_removed_file()
except OSError:
pass
if selected_file:
size_dict[size] = {
"selected": selected_file,
"others": [str(x.absolute()) for x in files],
}
else:
del size_dict[size]
return hashes_dict

def send2trash(self, path: Path) -> None:
if self.manager.config_manager.settings_data["Dupe_Cleanup_Options"]["Deletion_Settings"]["disable_all_file_deletions"]:
return False
elif not self.manager.config_manager.settings_data["Dupe_Cleanup_Options"]["Deletion_Settings"]["send_deleted_to_trash"]:
if not self.manager.config_manager.settings_data["Dupe_Cleanup_Options"]["send_deleted_to_trash"]:
Path(path).unlink(missing_ok=True)
log(f"permanently deleted file at {path}", 10)
return True
else:
send2trash(path)
log(f"sent file at{path} to trash", 10)
return True

def delete_all_prev_downloads(self) -> bool:
return not self.keep_prev_file()


def delete_selected_current_download(self, hash: str, selected_file: Path | str) -> bool:
return not self.keep_selected_current_download(hash, selected_file)

def keep_selected_current_download(self, hash: str, selected_file: Path | str) -> bool:
return bool(
self.manager.config_manager.settings_data["Dupe_Cleanup_Options"]["Deletion_Settings"]["keep_new_download"]
or hash not in self.prev_hashes
or Path(selected_file) in self.manager.path_manager.prev_downloads_paths,
)

def keep_prev_file(self) -> bool:
return self.manager.config_manager.settings_data["Dupe_Cleanup_Options"]["Deletion_Settings"]["keep_prev_download"]


@property
def keep_oldest_all(self):
return self.manager.config_manager.settings_data["Dupe_Cleanup_Options"]["dedupe"]=="KEEP_OLDEST_ALL"
@property
def keep_newest_all(self):
return self.manager.config_manager.settings_data["Dupe_Cleanup_Options"]["dedupe"]=="KEEP_NEWEST_ALL"
@property
def keep_oldest(self):
return self.manager.config_manager.settings_data["Dupe_Cleanup_Options"]["dedupe"]=="KEEP_OLDEST"
@property
def keep_newest(self):
return self.manager.config_manager.settings_data["Dupe_Cleanup_Options"]["dedupe"]=="KEEP_NEWEST"
2 changes: 1 addition & 1 deletion cyberdrop_dl/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ async def post_runtime(manager: Manager) -> None:
)
# checking and removing dupes
if not manager.args_manager.sort_all_configs:
await manager.hash_manager.hash_client.cleanup_dupes()
await manager.hash_manager.hash_client.cleanup_dupes_after_download()
if (
isinstance(manager.args_manager.sort_downloads, bool)
and manager.args_manager.sort_downloads
Expand Down
20 changes: 17 additions & 3 deletions cyberdrop_dl/managers/config_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,29 @@


def _match_config_dicts(default: dict, existing: dict) -> dict:
"""Matches the keys of two dicts and returns the default dict with the values of the existing dict."""
"""Matches the keys of two dicts and returns the new dict with the values of the existing dict."""
for group in default:
for key in default[group]:
if group in existing and key in existing[group]:
default[group][key] = existing[group][key]
return default
return copy.deepcopy(default)



# Custom representer function for YAML
def _enum_representer(dumper, data):
return dumper.represent_int(data.value)

def _save_yaml(file: Path, data: dict) -> None:
    """Saves a dict to a yaml file, creating parent directories as needed.

    Dedupe and Hashing enum members in *data* are dumped as their plain
    integer values via the custom representer registered below.
    """
    file.parent.mkdir(parents=True, exist_ok=True)
    # Register representers so enum config values serialize as ints instead
    # of failing (PyYAML cannot represent arbitrary Enum objects by default).
    yaml.add_representer(Dedupe, _enum_representer)
    yaml.add_representer(Hashing, _enum_representer)
    with file.open("w") as yaml_file:
        yaml.dump(data, yaml_file)


def _load_yaml(file: Path) -> dict:
Expand Down Expand Up @@ -173,7 +183,11 @@ def _verify_settings_config(self) -> None:
if (key ,subkey) in enums:
enum_value= self.settings_data[key][subkey]
enum_class=enums[(key ,subkey)]
self.settings_data[key][subkey]=enum_class(enum_value)
if enum_value and str(enum_value).isnumeric():
self.settings_data[key][subkey]=enum_class(int(enum_value))
else:
self.settings_data[key][subkey]=enum_class(enum_value)



if get_keys(default_settings_data) == get_keys(existing_settings_data):
Expand Down
2 changes: 2 additions & 0 deletions cyberdrop_dl/managers/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,8 @@ def args_logging(self) -> None:
print_settings["Logs"]["webhook_url"] = bool(print_settings["Logs"]["webhook_url"])
print_settings["Sorting"]["sort_folder"] = str(print_settings["Sorting"]["sort_folder"])
print_settings["Sorting"]["scan_folder"] = str(print_settings["Sorting"]["scan_folder"]) or ""
print_settings["Dupe_Cleanup_Options"]["hashing"]=print_settings["Dupe_Cleanup_Options"]["hashing"].value
print_settings["Dupe_Cleanup_Options"]["dedupe"]=print_settings["Dupe_Cleanup_Options"]["dedupe"].value

log(f"Starting Cyberdrop-DL Process - Config: {self.config_manager.loaded_config}", 10)
log(f"Running version {__version__}", 10)
Expand Down
3 changes: 1 addition & 2 deletions cyberdrop_dl/managers/progress_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,7 @@ def print_stats(self, start_time: timedelta | float) -> None:
log_with_color("Dupe Stats:", "cyan", 20)
log_with_color(f" Previously Hashed: {self.hash_progress.prev_hashed_files} files", "yellow", 20)
log_with_color(f" Newly Hashed: {self.hash_progress.hashed_files} files", "yellow", 20)
log_with_color(f" Removed (Current Downloads): {self.hash_progress.removed_files} files", "yellow", 20)
log_with_color(f" Removed (Previous Downloads): {self.hash_progress.removed_prev_files} files", "yellow", 20)
log_with_color(f" Removed (Downloads): {self.hash_progress.removed_files} files", "yellow", 20)

log_spacer(20, "")
log_with_color("Sort Stats:", "cyan", 20)
Expand Down
18 changes: 4 additions & 14 deletions cyberdrop_dl/ui/progress/hash_progress.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,27 +36,21 @@ def __init__(self, manager: Manager) -> None:

self.current_hashing_text = Progress("{task.description}")


#hashing
self.hashed_files = 0
self.prev_hashed_files = 0

self.hash_progress_group = Group(self.current_hashing_text, self.hash_progress)

self.hashed_files_task_id = self.hash_progress.add_task("[green]Hashed", total=None)
self.prev_hashed_files_task_id = self.hash_progress.add_task("[green]Previously Hashed", total=None)

self.currently_hashing_task_id = self.current_hashing_text.add_task("")

self.currently_hashing_size_task_id = self.current_hashing_text.add_task("")

#remove
self.removed_files = 0
self.removed_prev_files = 0
self.removed_progress_group = Group(self.match_progress, self.remove_progress)
self.removed_files_task_id = self.remove_progress.add_task(
"[green]Removed From Currently Downloaded Files",
total=None,
)
self.removed_prev_files_task_id = self.remove_progress.add_task(
"[green]Removed From Previously Downloaded Files",
"[green]Removed From Downloaded Files",
total=None,
)

Expand Down Expand Up @@ -96,7 +90,3 @@ def add_removed_file(self) -> None:
self.remove_progress.advance(self.removed_files_task_id, 1)
self.removed_files += 1

def add_removed_prev_file(self) -> None:
"""Adds a completed file to the progress bar."""
self.remove_progress.advance(self.removed_prev_files_task_id, 1)
self.removed_prev_files += 1
5 changes: 2 additions & 3 deletions cyberdrop_dl/utils/args/config_definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,12 +131,11 @@
"sites": None,
},
"Dupe_Cleanup_Options": {
"hashing":0,
"dedupe": 0,
"hashing":"IN_PLACE",
"dedupe": "KEEP_OLDEST_ALL",
"allow_md5_hash": False,
"allow_sha256_hash": False,
"send_deleted_to_trash": True,
"delete_if_seen_before":False,
},

}
Expand Down
24 changes: 24 additions & 0 deletions cyberdrop_dl/utils/data_enums_classes/hash.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,36 @@ class Hashing(Enum):
IN_PLACE = 1
POST_DOWNLOAD = 2


@classmethod
def _missing_(cls, value):
try:
return cls[str(value.upper())]
except KeyError:
return cls.OFF

def __eq__(self,value):
return self.value == value or self.name==value or super().__eq__(value)

class Dedupe(Enum):
    """Deduplication mode selected via the 'dedupe' config setting."""

    OFF = 0
    KEEP_OLDEST = 1
    KEEP_NEWEST = 2
    KEEP_OLDEST_ALL = 3
    KEEP_NEWEST_ALL = 4

    @classmethod
    def _missing_(cls, value):
        # Allow lookup by case-insensitive name string; any unknown value
        # falls back to OFF. str() before upper() so non-string inputs
        # (e.g. an out-of-range int) cannot raise AttributeError here.
        try:
            return cls[str(value).upper()]
        except KeyError:
            return cls.OFF

    def compare(self, value):
        """Return True if *value* matches this member's int value or name."""
        return self.value == value or self.name == value

    def __eq__(self, value):
        # Members compare equal to their raw int value, their name string,
        # or another member (via Enum's identity-based equality).
        return self.value == value or self.name == value or super().__eq__(value)

    # Defining __eq__ implicitly sets __hash__ to None, which would make
    # members unhashable (breaking dict/set usage such as yaml representer
    # registration keyed by member). Restore Enum's hash explicitly.
    __hash__ = Enum.__hash__





1 change: 1 addition & 0 deletions cyberdrop_dl/utils/database/table_definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
original_filename TEXT,
file_size INT,
referer TEXT,
date INT,
PRIMARY KEY (folder, download_filename)
);
Expand Down
Loading

0 comments on commit 1c46c67

Please sign in to comment.