Skip to content

Commit

Permalink
Merge pull request #95 from Ljzd-PRO/devel
Browse files Browse the repository at this point in the history
Bump to v0.5.2
  • Loading branch information
Ljzd-PRO authored Apr 15, 2024
2 parents a98f920 + d54bb31 commit ed8e8f1
Show file tree
Hide file tree
Showing 11 changed files with 203 additions and 177 deletions.
54 changes: 18 additions & 36 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,41 +1,23 @@
## Changes

### 💡 Feature

- Added support for downloading works within a specified range of quantity.
- Added `--offset`, `--length` options in `sync-creator` command
- `--offset`: Posts result offset (or start offset)
- `--length`: The number of posts to fetch, defaults to fetching all posts

```bash
# Download latest 10 posts of the creator/artist
ktoolbox sync-creator https://kemono.su/fanbox/user/xxxx --length=10

# Download latest No.11-No.15 posts of the creator/artist
ktoolbox sync-creator https://kemono.su/fanbox/user/xxxx --offset=10 --length=5

# Download all posts of the creator/artist
ktoolbox sync-creator https://kemono.su/fanbox/user/xxxx
```
[//]: # (### 💡 Feature)

### 🪲 Fix

- Fix `FileNotFoundError` occurred when filename contains special characters (#94)
- Fix `TypeError` occurred when using `--start-time`, `--end-time` options and posts had no `published` property (#93)
- Fixed incorrect argument order when using bucket storage (#89 - @Nacosia)
- Duplicate file check after HTTP connection started (#88)

- - -

### 💡 新特性

- 增加下载指定数量范围作品的支持
- `sync-creator` 命令中增加了 `--offset`, `--length` 选项
- `--offset`:作品结果偏移量(或起始偏移量)
- `--length`:要获取的作品数量,默认获取所有作品

```bash
# 下载作者/画师最新的 10 个作品
ktoolbox sync-creator https://kemono.su/fanbox/user/xxxx --length=10

# 下载作者/画师最新的第 11 至 15 个作品
ktoolbox sync-creator https://kemono.su/fanbox/user/xxxx --offset=10 --length=5

# 下载作者/画师的所有作品
ktoolbox sync-creator https://kemono.su/fanbox/user/xxxx
```

**Full Changelog**: https://github.com/Ljzd-PRO/KToolBox/compare/v0.5.0...v0.5.1
[//]: # (### 💡 新特性)

### 🪲 修复

- 修复当文件名包含特殊字符时会出现 `FileNotFoundError` 错误的问题 (#94)
- 修复当使用 `--start-time`, `--end-time` 参数且作品 `published` 属性不存在的情况下会出现 `TypeError` 错误的问题 (#93)
- 修复当使用桶储存时参数顺序不正确的问题 (#89 - @Nacosia)
- 在建立 HTTP 连接后进行重复文件检查 (#88)

**Full Changelog**: https://github.com/Ljzd-PRO/KToolBox/compare/v0.5.1...v0.5.2
2 changes: 1 addition & 1 deletion ktoolbox/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__title__ = "KToolBox"
# noinspection SpellCheckingInspection
__description__ = "A useful CLI tool for downloading posts in Kemono.party / .su"
__version__ = "0.5.1"
__version__ = "0.5.2"
8 changes: 4 additions & 4 deletions ktoolbox/action/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from ktoolbox._enum import PostFileTypeEnum, DataStorageNameEnum
from ktoolbox.action import ActionRet, fetch_creator_posts, FetchInterruptError
from ktoolbox.action.utils import generate_post_path_name, filter_posts_by_time
from ktoolbox.action.utils import generate_post_path_name, filter_posts_by_date
from ktoolbox.api.model import Post, Attachment
from ktoolbox.configuration import config, PostStructureConfiguration
from ktoolbox.job import Job, CreatorIndices
Expand All @@ -29,7 +29,7 @@ async def create_job_from_post(
Create a list of download job from a post data
:param post: post data
:param post_path: Path of the post directory
:param post_path: Path of the post directory, which needs to be sanitized
:param post_structure: post path structure, ``False`` -> disable, \
``True`` & ``None`` -> ``config.job.post_structure``
:param dump_post_data: Whether to dump post data (post.json) in post directory
Expand Down Expand Up @@ -109,7 +109,7 @@ async def create_job_from_creator(
:param service: The service where the post is located
:param creator_id: The ID of the creator
:param path: The path for posts to download
:param path: The path for downloading posts, which needs to be sanitized
:param all_pages: Fetch all posts, ``offset`` and ``length`` will be ignored if enabled
:param offset: Result offset (or start offset)
:param length: The number of posts to fetch
Expand Down Expand Up @@ -147,7 +147,7 @@ async def create_job_from_creator(

# Filter posts by publish time
if start_time or end_time:
post_list = list(filter_posts_by_time(post_list, start_time, end_time))
post_list = list(filter_posts_by_date(post_list, start_time, end_time))
logger.info(f"Get {len(post_list)} posts, start creating jobs")

# Filter posts and generate ``CreatorIndices``
Expand Down
33 changes: 17 additions & 16 deletions ktoolbox/action/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from ktoolbox.configuration import config
from ktoolbox.job import CreatorIndices

__all__ = ["generate_post_path_name", "filter_posts_by_time", "filter_posts_by_indices"]
__all__ = ["generate_post_path_name", "filter_posts_by_date", "filter_posts_by_indices"]


def generate_post_path_name(post: Post) -> str:
Expand All @@ -34,39 +34,40 @@ def generate_post_path_name(post: Post) -> str:
exit(1)


def _match_post_date(
        post: Post,
        start_date: Optional[datetime],
        end_date: Optional[datetime]
) -> bool:
    """
    Check whether a post's date falls within the given date range.

    The post's ``published`` date is used when available, otherwise its
    ``added`` date. If neither date exists the post always matches, since
    it cannot be excluded by a date comparison (fixes the ``TypeError``
    from comparing ``None`` with ``datetime`` — #93).

    :param post: Target post object
    :param start_date: Start of the date range (inclusive); ``None`` means no lower bound
    :param end_date: End of the date range (inclusive); ``None`` means no upper bound
    :return: ``True`` if the post date matches the range
    """
    # Fall back to ``added`` when ``published`` is missing.
    post_date = post.published or post.added
    if start_date and post_date and post_date < start_date:
        return False
    if end_date and post_date and post_date > end_date:
        return False
    return True


def filter_posts_by_date(
        post_list: List[Post],
        start_date: Optional[datetime],
        end_date: Optional[datetime]
) -> Generator[Post, Any, Any]:
    """
    Filter posts by publish date range.

    :param post_list: List of posts to filter
    :param start_date: Start of the date range; ``None`` means no lower bound
    :param end_date: End of the date range; ``None`` means no upper bound
    :return: Generator yielding only the posts whose date matches the range
    """
    post_filter = filter(lambda x: _match_post_date(x, start_date, end_date), post_list)
    yield from post_filter


Expand Down
1 change: 1 addition & 0 deletions ktoolbox/downloader/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from .base import *
from .downloader import *
from .utils import *
96 changes: 54 additions & 42 deletions ktoolbox/downloader/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,55 +11,60 @@
import tenacity
import tqdm.asyncio
from loguru import logger
from pathvalidate import sanitize_filename
from tenacity import wait_fixed, retry_if_result, retry_if_exception
from tenacity.stop import stop_after_attempt, stop_never
from tqdm import tqdm as std_tqdm

from ktoolbox._enum import RetCodeEnum
from ktoolbox.configuration import config
from ktoolbox.downloader import DownloaderRet
from ktoolbox.utils import filename_from_headers, generate_msg
from ktoolbox.downloader.base import DownloaderRet
from ktoolbox.downloader.utils import filename_from_headers, duplicate_file_check
from ktoolbox.utils import generate_msg

__all__ = ["Downloader"]


class Downloader:
"""
:ivar _save_filename: The actual filename for saving.
"""

def __init__(
self,
url: str,
path: Path,
*,
buffer_size: int = None,
chunk_size: int = None,
alt_filename: str = None,
designated_filename: str = None,
server_path: str = None
):
# noinspection GrazieInspection
"""
Initialize a file downloader
- About filename:
* If ``alt_filename`` parameter is set, use it.
* Else if ``Content-Disposition`` is set in headers, use filename from it.
* Else use filename from URL 'path' part.
1. If ``designated_filename`` parameter is set, use it.
2. Else if ``Content-Disposition`` is set in headers, use filename from it.
3. Else use filename from 'file' part of ``server_path``.
:param url: Download URL
:param path: Directory path to save the file
:param path: Directory path to save the file, which needs to be sanitized
:param buffer_size: Number of bytes for file I/O buffer
:param chunk_size: Number of bytes for chunk of download stream
:param alt_filename: Use this name if no filename given by the server
:param server_path: Server path of the file. if config.use_bucket is True, \
it will be used as save the path to the file
:param designated_filename: Manually specify the filename for saving, which needs to be sanitized
:param server_path: Server path of the file. if ``DownloaderConfiguration.use_bucket`` enabled, \
it will be used as the save path.
"""

self._url = url
self._path = path
self._buffer_size = buffer_size or config.downloader.buffer_size
self._chunk_size = chunk_size or config.downloader.chunk_size
# _alt_filename 是用于下载的文件名
self._alt_filename = alt_filename # 用于下载的文件名
self._server_path = server_path # 服务器文件路径 /hash[:1]/hash2[1:3]/hash
self._filename = alt_filename # 保留用做实际文件名
self._designated_filename = designated_filename
self._server_path = server_path # /hash[:1]/hash2[1:3]/hash
self._save_filename = designated_filename # Prioritize the manually specified filename

self._lock = asyncio.Lock()
self._stop: bool = False
Expand Down Expand Up @@ -87,7 +92,7 @@ def chunk_size(self) -> int:
@property
def filename(self) -> Optional[str]:
    """
    Actual filename used for saving the downloaded file.

    :return: The resolved save filename, or ``None`` if it has not been
        determined yet (it may only be known after response headers arrive).
    """
    return self._save_filename

@property
def finished(self) -> bool:
Expand Down Expand Up @@ -141,34 +146,27 @@ async def run(
:return: ``DownloaderRet`` which contain the actual output filename
:raise CancelledError
"""
# Get filename to check if file exists
# Get filename to check if file exists (First-time duplicate file check)
# Check it before request to make progress more efficiency
server_relpath = self._server_path[1:]
server_relpath_without_params = urlparse(server_relpath).path
server_path_filename = unquote(Path(server_relpath_without_params).name)
art_file_path = self._path / (self._filename or server_path_filename)
check_path = art_file_path
# Priority order can be referenced from the constructor's documentation
save_filepath = self._path / (self._save_filename or server_path_filename)

# Get bucket file path
art_bucket_file_path: Optional[Path] = None
bucket_file_path: Optional[Path] = None
if config.downloader.use_bucket:
art_bucket_file_path = config.downloader.bucket_path / server_relpath
check_path = art_bucket_file_path
bucket_file_path = config.downloader.bucket_path / server_relpath

# Check if the file exists
if check_path.is_file():
if config.downloader.use_bucket:
ret_msg = "Download file already exists in both bucket and local, skipping"
if not art_file_path.is_file():
ret_msg = "Download file already exists in bucket, linking to target path"
check_path.hardlink_to(art_file_path)
else:
ret_msg = "Download file already exists, skipping"
file_existed, ret_msg = duplicate_file_check(save_filepath, bucket_file_path)
if file_existed:
return DownloaderRet(
code=RetCodeEnum.FileExisted,
message=generate_msg(
ret_msg,
path=art_file_path
path=save_filepath
)
)

Expand All @@ -187,21 +185,33 @@ async def run(
message=generate_msg(
"Download failed",
status_code=res.status_code,
filename=art_file_path
filename=save_filepath
)
)

# Get filename
filename = self._alt_filename or filename_from_headers(res.headers) or server_path_filename
self._filename = filename
# Get filename for saving and check if file exists (Second-time duplicate file check)
# Priority order can be referenced from the constructor's documentation
self._save_filename = self._designated_filename or sanitize_filename(
filename_from_headers(res.headers)
) or server_path_filename
save_filepath = self._path / self._save_filename
file_existed, ret_msg = duplicate_file_check(save_filepath, bucket_file_path)
if file_existed:
return DownloaderRet(
code=RetCodeEnum.FileExisted,
message=generate_msg(
ret_msg,
path=save_filepath
)
)

# Download
temp_filepath = Path(f"{(self._path / server_path_filename)}.{config.downloader.temp_suffix}")
temp_filepath = Path(f"{save_filepath}.{config.downloader.temp_suffix}")
total_size = int(length_str) if (length_str := res.headers.get("Content-Length")) else None
async with aiofiles.open(str(temp_filepath), "wb", self._buffer_size) as f:
chunk_iterator = res.aiter_bytes(self._chunk_size)
t = tqdm_class(
desc=filename,
desc=self._save_filename,
total=total_size,
disable=not progress,
unit="iB",
Expand All @@ -216,21 +226,23 @@ async def run(

# Download finished
if config.downloader.use_bucket:
art_bucket_file_path.parent.mkdir(parents=True, exist_ok=True)
os.link(temp_filepath, art_bucket_file_path)
bucket_file_path.parent.mkdir(parents=True, exist_ok=True)
os.link(temp_filepath, bucket_file_path)
temp_filepath.rename(self._path / self._save_filename)

temp_filepath.rename(self._path / filename)
# Callbacks
if sync_callable:
sync_callable(self)
if async_callable:
await async_callable(self)

return DownloaderRet(
data=filename
) if filename else DownloaderRet(
data=self._save_filename
) if self._save_filename else DownloaderRet(
code=RetCodeEnum.GeneralFailure,
message=generate_msg(
"Download failed",
filename=self._alt_filename
filename=self._designated_filename
)
)

Expand Down
Loading

0 comments on commit ed8e8f1

Please sign in to comment.