Skip to content

Commit

Permalink
Merge pull request #95 from Ljzd-PRO/devel
Browse files Browse the repository at this point in the history
Bump to v0.5.2
  • Loading branch information
Ljzd-PRO authored Apr 15, 2024
2 parents a98f920 + d54bb31 commit ed8e8f1
Show file tree
Hide file tree
Showing 11 changed files with 203 additions and 177 deletions.
54 changes: 18 additions & 36 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,41 +1,23 @@
## Changes

### 💡 Feature

- Added support for downloading works within a specified range of quantity.
- Added `--offset`, `--length` options in `sync-creator` command
- `--offset`: Posts result offset (or start offset)
- `--length`: The number of posts to fetch, defaults to fetching all posts

```bash
# Download latest 10 posts of the creator/artist
ktoolbox sync-creator https://kemono.su/fanbox/user/xxxx --length=10

# Download latest No.11-No.15 posts of the creator/artist
ktoolbox sync-creator https://kemono.su/fanbox/user/xxxx --offset=10 --length=5

# Download all posts of the creator/artist
ktoolbox sync-creator https://kemono.su/fanbox/user/xxxx
```
[//]: # (### 💡 Feature)

### 🪲 Fix

- Fix `FileNotFoundError` occurred when filename contains special characters (#94)
- Fix `TypeError` occurred when using `--start-time`, `--end-time` options and posts had no `published` property (#93)
- Fixed incorrect argument order when using bucket storage (#89 - @Nacosia)
- Duplicate file check after HTTP connection started (#88)

- - -

### 💡 新特性

- 增加下载指定数量范围作品的支持
- `sync-creator` 命令中增加了 `--offset`, `--length` 选项
- `--offset`:作品结果偏移量(或起始偏移量)
- `--length`:要获取的作品数量,默认获取所有作品

```bash
# 下载作者/画师最新的 10 个作品
ktoolbox sync-creator https://kemono.su/fanbox/user/xxxx --length=10

# 下载作者/画师最新的第 11 至 15 个作品
ktoolbox sync-creator https://kemono.su/fanbox/user/xxxx --offset=10 --length=5

# 下载作者/画师的所有作品
ktoolbox sync-creator https://kemono.su/fanbox/user/xxxx
```

**Full Changelog**: https://github.com/Ljzd-PRO/KToolBox/compare/v0.5.0...v0.5.1
[//]: # (### 💡 新特性)

### 🪲 修复

- 修复当文件名包含特殊字符时会出现 `FileNotFoundError` 错误的问题 (#94)
- 修复当使用 `--start-time`, `--end-time` 参数且作品 `published` 属性不存在的情况下会出现 `TypeError` 错误的问题 (#93)
- 修复当使用桶储存时参数顺序不正确的问题 (#89 - @Nacosia)
- 在建立 HTTP 连接后进行重复文件检查 (#88)

**Full Changelog**: https://github.com/Ljzd-PRO/KToolBox/compare/v0.5.1...v0.5.2
2 changes: 1 addition & 1 deletion ktoolbox/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__title__ = "KToolBox"
# noinspection SpellCheckingInspection
__description__ = "A useful CLI tool for downloading posts in Kemono.party / .su"
__version__ = "0.5.1"
__version__ = "0.5.2"
8 changes: 4 additions & 4 deletions ktoolbox/action/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from ktoolbox._enum import PostFileTypeEnum, DataStorageNameEnum
from ktoolbox.action import ActionRet, fetch_creator_posts, FetchInterruptError
from ktoolbox.action.utils import generate_post_path_name, filter_posts_by_time
from ktoolbox.action.utils import generate_post_path_name, filter_posts_by_date
from ktoolbox.api.model import Post, Attachment
from ktoolbox.configuration import config, PostStructureConfiguration
from ktoolbox.job import Job, CreatorIndices
Expand All @@ -29,7 +29,7 @@ async def create_job_from_post(
Create a list of download job from a post data
:param post: post data
:param post_path: Path of the post directory
:param post_path: Path of the post directory, which needs to be sanitized
:param post_structure: post path structure, ``False`` -> disable, \
``True`` & ``None`` -> ``config.job.post_structure``
:param dump_post_data: Whether to dump post data (post.json) in post directory
Expand Down Expand Up @@ -109,7 +109,7 @@ async def create_job_from_creator(
:param service: The service where the post is located
:param creator_id: The ID of the creator
:param path: The path for posts to download
:param path: The path for downloading posts, which needs to be sanitized
:param all_pages: Fetch all posts, ``offset`` and ``length`` will be ignored if enabled
:param offset: Result offset (or start offset)
:param length: The number of posts to fetch
Expand Down Expand Up @@ -147,7 +147,7 @@ async def create_job_from_creator(

# Filter posts by publish time
if start_time or end_time:
post_list = list(filter_posts_by_time(post_list, start_time, end_time))
post_list = list(filter_posts_by_date(post_list, start_time, end_time))
logger.info(f"Get {len(post_list)} posts, start creating jobs")

# Filter posts and generate ``CreatorIndices``
Expand Down
33 changes: 17 additions & 16 deletions ktoolbox/action/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from ktoolbox.configuration import config
from ktoolbox.job import CreatorIndices

__all__ = ["generate_post_path_name", "filter_posts_by_time", "filter_posts_by_indices"]
__all__ = ["generate_post_path_name", "filter_posts_by_date", "filter_posts_by_indices"]


def generate_post_path_name(post: Post) -> str:
Expand All @@ -34,39 +34,40 @@ def generate_post_path_name(post: Post) -> str:
exit(1)


def _match_post_date(
        post: Post,
        start_date: Optional[datetime],
        end_date: Optional[datetime]
) -> bool:
    """
    Check whether a post's date falls within the given date range.

    The post's ``published`` date is used when available, otherwise its
    ``added`` date. If neither date exists the post always matches, since
    it cannot be excluded by a date comparison (fixes the ``TypeError``
    from comparing ``None`` with ``datetime`` — #93).

    :param post: Target post object
    :param start_date: Start of the date range (inclusive); ``None`` means no lower bound
    :param end_date: End of the date range (inclusive); ``None`` means no upper bound
    :return: ``True`` if the post date matches the range
    """
    # Fall back to ``added`` when ``published`` is missing.
    post_date = post.published or post.added
    if start_date and post_date and post_date < start_date:
        return False
    if end_date and post_date and post_date > end_date:
        return False
    return True


def filter_posts_by_date(
        post_list: List[Post],
        start_date: Optional[datetime],
        end_date: Optional[datetime]
) -> Generator[Post, Any, Any]:
    """
    Filter posts by publish date range.

    :param post_list: List of posts to filter
    :param start_date: Start of the date range; ``None`` means no lower bound
    :param end_date: End of the date range; ``None`` means no upper bound
    :return: Generator yielding only the posts whose date matches the range
    """
    post_filter = filter(lambda x: _match_post_date(x, start_date, end_date), post_list)
    yield from post_filter


Expand Down
1 change: 1 addition & 0 deletions ktoolbox/downloader/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from .base import *
from .downloader import *
from .utils import *
96 changes: 54 additions & 42 deletions ktoolbox/downloader/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,55 +11,60 @@
import tenacity
import tqdm.asyncio
from loguru import logger
from pathvalidate import sanitize_filename
from tenacity import wait_fixed, retry_if_result, retry_if_exception
from tenacity.stop import stop_after_attempt, stop_never
from tqdm import tqdm as std_tqdm

from ktoolbox._enum import RetCodeEnum
from ktoolbox.configuration import config
from ktoolbox.downloader import DownloaderRet
from ktoolbox.utils import filename_from_headers, generate_msg
from ktoolbox.downloader.base import DownloaderRet
from ktoolbox.downloader.utils import filename_from_headers, duplicate_file_check
from ktoolbox.utils import generate_msg

__all__ = ["Downloader"]


class Downloader:
"""
:ivar _save_filename: The actual filename for saving.
"""

def __init__(
self,
url: str,
path: Path,
*,
buffer_size: int = None,
chunk_size: int = None,
alt_filename: str = None,
designated_filename: str = None,
server_path: str = None
):
# noinspection GrazieInspection
"""
Initialize a file downloader
- About filename:
* If ``alt_filename`` parameter is set, use it.
* Else if ``Content-Disposition`` is set in headers, use filename from it.
* Else use filename from URL 'path' part.
1. If ``designated_filename`` parameter is set, use it.
2. Else if ``Content-Disposition`` is set in headers, use filename from it.
3. Else use filename from 'file' part of ``server_path``.
:param url: Download URL
:param path: Directory path to save the file
:param path: Directory path to save the file, which needs to be sanitized
:param buffer_size: Number of bytes for file I/O buffer
:param chunk_size: Number of bytes for chunk of download stream
:param alt_filename: Use this name if no filename given by the server
:param server_path: Server path of the file. if config.use_bucket is True, \
it will be used as save the path to the file
:param designated_filename: Manually specify the filename for saving, which needs to be sanitized
:param server_path: Server path of the file. if ``DownloaderConfiguration.use_bucket`` enabled, \
it will be used as the save path.
"""

self._url = url
self._path = path
self._buffer_size = buffer_size or config.downloader.buffer_size
self._chunk_size = chunk_size or config.downloader.chunk_size
# _alt_filename 是用于下载的文件名
self._alt_filename = alt_filename # 用于下载的文件名
self._server_path = server_path # 服务器文件路径 /hash[:1]/hash2[1:3]/hash
self._filename = alt_filename # 保留用做实际文件名
self._designated_filename = designated_filename
self._server_path = server_path # /hash[:1]/hash2[1:3]/hash
self._save_filename = designated_filename # Prioritize the manually specified filename

self._lock = asyncio.Lock()
self._stop: bool = False
Expand Down Expand Up @@ -87,7 +92,7 @@ def chunk_size(self) -> int:
@property
def filename(self) -> Optional[str]:
    """
    Actual filename used for saving the downloaded file.

    :return: The resolved save filename, or ``None`` if it has not been
        determined yet (it may only be known after response headers arrive).
    """
    return self._save_filename

@property
def finished(self) -> bool:
Expand Down Expand Up @@ -141,34 +146,27 @@ async def run(
:return: ``DownloaderRet`` which contain the actual output filename
:raise CancelledError
"""
# Get filename to check if file exists
# Get filename to check if file exists (First-time duplicate file check)
# Check it before request to make progress more efficiency
server_relpath = self._server_path[1:]
server_relpath_without_params = urlparse(server_relpath).path
server_path_filename = unquote(Path(server_relpath_without_params).name)
art_file_path = self._path / (self._filename or server_path_filename)
check_path = art_file_path
# Priority order can be referenced from the constructor's documentation
save_filepath = self._path / (self._save_filename or server_path_filename)

# Get bucket file path
art_bucket_file_path: Optional[Path] = None
bucket_file_path: Optional[Path] = None
if config.downloader.use_bucket:
art_bucket_file_path = config.downloader.bucket_path / server_relpath
check_path = art_bucket_file_path
bucket_file_path = config.downloader.bucket_path / server_relpath

# Check if the file exists
if check_path.is_file():
if config.downloader.use_bucket:
ret_msg = "Download file already exists in both bucket and local, skipping"
if not art_file_path.is_file():
ret_msg = "Download file already exists in bucket, linking to target path"
check_path.hardlink_to(art_file_path)
else:
ret_msg = "Download file already exists, skipping"
file_existed, ret_msg = duplicate_file_check(save_filepath, bucket_file_path)
if file_existed:
return DownloaderRet(
code=RetCodeEnum.FileExisted,
message=generate_msg(
ret_msg,
path=art_file_path
path=save_filepath
)
)

Expand All @@ -187,21 +185,33 @@ async def run(
message=generate_msg(
"Download failed",
status_code=res.status_code,
filename=art_file_path
filename=save_filepath
)
)

# Get filename
filename = self._alt_filename or filename_from_headers(res.headers) or server_path_filename
self._filename = filename
# Get filename for saving and check if file exists (Second-time duplicate file check)
# Priority order can be referenced from the constructor's documentation
self._save_filename = self._designated_filename or sanitize_filename(
filename_from_headers(res.headers)
) or server_path_filename
save_filepath = self._path / self._save_filename
file_existed, ret_msg = duplicate_file_check(save_filepath, bucket_file_path)
if file_existed:
return DownloaderRet(
code=RetCodeEnum.FileExisted,
message=generate_msg(
ret_msg,
path=save_filepath
)
)

# Download
temp_filepath = Path(f"{(self._path / server_path_filename)}.{config.downloader.temp_suffix}")
temp_filepath = Path(f"{save_filepath}.{config.downloader.temp_suffix}")
total_size = int(length_str) if (length_str := res.headers.get("Content-Length")) else None
async with aiofiles.open(str(temp_filepath), "wb", self._buffer_size) as f:
chunk_iterator = res.aiter_bytes(self._chunk_size)
t = tqdm_class(
desc=filename,
desc=self._save_filename,
total=total_size,
disable=not progress,
unit="iB",
Expand All @@ -216,21 +226,23 @@ async def run(

# Download finished
if config.downloader.use_bucket:
art_bucket_file_path.parent.mkdir(parents=True, exist_ok=True)
os.link(temp_filepath, art_bucket_file_path)
bucket_file_path.parent.mkdir(parents=True, exist_ok=True)
os.link(temp_filepath, bucket_file_path)
temp_filepath.rename(self._path / self._save_filename)

temp_filepath.rename(self._path / filename)
# Callbacks
if sync_callable:
sync_callable(self)
if async_callable:
await async_callable(self)

return DownloaderRet(
data=filename
) if filename else DownloaderRet(
data=self._save_filename
) if self._save_filename else DownloaderRet(
code=RetCodeEnum.GeneralFailure,
message=generate_msg(
"Download failed",
filename=self._alt_filename
filename=self._designated_filename
)
)

Expand Down
Loading

0 comments on commit ed8e8f1

Please sign in to comment.