From fa5ddd8add43679c18e3608ee1c142777855fd4c Mon Sep 17 00:00:00 2001 From: Muggle Wei Date: Tue, 9 Jan 2024 23:43:34 +0800 Subject: [PATCH 1/6] fix missing sub-package --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e4e4f2b..6ca5f2e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ bdchecker = "bdchecker.main:main" [tool.setuptools] include-package-data = true -packages = ["bdchecker"] +packages = ["bdchecker", "bdchecker.command", "bdchecker.utils"] [tool.setuptools.dynamic] dependencies = {file = ["requirements.txt"]} From e4efb1ca4465937fd52f956ac42c8cfe56c2ebf0 Mon Sep 17 00:00:00 2001 From: Muggle Wei Date: Tue, 9 Jan 2024 23:44:36 +0800 Subject: [PATCH 2/6] update version to 0.0.2-alpha.1 --- bdchecker/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdchecker/__version__.py b/bdchecker/__version__.py index f102a9c..c2f2170 100644 --- a/bdchecker/__version__.py +++ b/bdchecker/__version__.py @@ -1 +1 @@ -__version__ = "0.0.1" +__version__ = "0.0.2-alpha.1" From 80f29514f5f8eed0be66b7ba0a3bdb741e02fee3 Mon Sep 17 00:00:00 2001 From: Muggle Wei Date: Wed, 10 Jan 2024 00:13:44 +0800 Subject: [PATCH 3/6] update check, output check pass info --- bdchecker/__version__.py | 2 +- bdchecker/checker.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/bdchecker/__version__.py b/bdchecker/__version__.py index c2f2170..3b93d0b 100644 --- a/bdchecker/__version__.py +++ b/bdchecker/__version__.py @@ -1 +1 @@ -__version__ = "0.0.2-alpha.1" +__version__ = "0.0.2" diff --git a/bdchecker/checker.py b/bdchecker/checker.py index aa947cc..85c2d9b 100644 --- a/bdchecker/checker.py +++ b/bdchecker/checker.py @@ -162,19 +162,26 @@ def check(self, dst_dir): logging.debug( "calculate hash value: {}, {}".format(k, hash_val_hex)) + is_all_pass = True for k, v in old_meta_dict.items(): if k not in meta_dict: logging.warning("missing file: 
{}".format(k)) + is_all_pass = False for k, v in meta_dict.items(): if k not in old_meta_dict: logging.warning("new file: {}".format(k)) + is_all_pass = False old_v = old_meta_dict[k] if v != old_v: logging.error( "check failed: {}, old hash: {}, cur hash: {}".format( k, old_v, v)) + is_all_pass = False + + if is_all_pass is True: + logging.info("all check pass") def _dump_meta(self, meta_filepath, meta_dict): """ From bd5b652f8c78b7cda89593fccbc2b6b9800f8d0e Mon Sep 17 00:00:00 2001 From: Muggle Wei Date: Wed, 10 Jan 2024 00:19:54 +0800 Subject: [PATCH 4/6] update reamd --- README.md | 2 ++ README_cn.md | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) diff --git a/README.md b/README.md index e69de29..ee19948 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,2 @@ +* [readme 中文](./README_cn.md) +* [readme EN](./README.md) \ No newline at end of file diff --git a/README_cn.md b/README_cn.md index e69de29..bac3538 100644 --- a/README_cn.md +++ b/README_cn.md @@ -0,0 +1,77 @@ +## bdchecker + +* [readme 中文](./README_cn.md) +* [readme EN](./README.md) + +## 概述 +bdchecker (**B**ackup **D**ata Checker) 是用于个人冷备数据检查的工具, 帮助你及时发现数据的损坏 + +## 为什么需要它 +想象一下, 我们有一些数据需要进行冷备, 可能是每天压缩后的某个金融市场的原始行情数据; 或是一些个人拥有的经典电影的电子版本; 抑或是一些常年用不到的密钥;我们先列出一些方案: + +| 方案 | 单个存储期限级别 | +| ---- | ---- | +| 固态盘 | 几年至十几年 | +| 机械盘 | 10年+ | +| 磁带 | 30年+ | +| 打孔纸带 | 千年 | +| 刻在石头上 (罗辑把拐杖高举过头, 庄严地喊道) | 百万年 | + +毫无疑问, 若是有足够的财力, 把信息刻在石头上并妥当存储, 除非遭到了二向箔攻击, 否则应该十分安全;但是对于个人而言, 从石头上读取信息带来的成本应该是远大于我们需要保存的数据的价值的 +所以当考虑数据的易于读写性的时候, 那么毫无疑问, 硬盘是最为方便的;但是这带来了额外的要求, 那便是我们需要定期的检查数据是否出现了损坏, 这便可以通过 **bdchecker** 来实现 + +## 安装 +* 使用 pip 安装 +``` +pip install bdchecker +``` +* 直接从 [Releases](https://github.com/MuggleWei/bdchecker/releases) 中获取, 解压并使用 + +## 使用 +**bdchecker** 包含三个命令, 分别为 +* gen: 扫描目录, 并递归遍历生成该目录下所有**新增**文件的 hash 信息, 放置在目录中的 `.bdchecker.meta` 文件夹中 +* clean: 扫描目录, 从 hash 信息中清理掉已删除的文件 +* check: 扫描目录, 查找出现损坏的文件 + +### 示例目录 +假设当前有如下目录结构 +``` +data +├──── a.txt +├──── b.txt +└──── c 
+ ├──── c1.txt + └──── c2.txt +``` + +### gen 示例 +生成信息 +``` +bdchecker gen -d data -v 1 +``` +* `-d`: 表示要生成信息的目录 +* `-v`: 表示日志输出级别, 越高输出越详细 + +完成后, 可以看到屏幕上日志输出: `dump meta info to data/.bdchecker.meta/sha256.csv` +当目录中没有新增文件时, 重复执行 `gen` 命令并不会真正的去生成文件的 hash 信息 + +### clean 示例 +删除 `data/c/c2.txt`, 并运行 +``` +bdchecker clean -d data -v 1 +``` + +可以在日志倒数几行看到: `clean missing file's meta info: c/c2.txt`, 表示当前我们已经成功清理了文件对应的 hash 信息 + +### check 示例 +运行 +``` +bdchecker check -d data -v 1 +``` +日志的最后一行出现: `all check pass`, 代表没有新增/删除的文件, 且所有的文件都没有损坏 + +现在让我们稍微更改一下文件 `a.txt`,随便更改一下其中的内容,再次运行 +``` +bdchecker check -d data -v 1 +``` +此时,日志出现错误信息: `check failed: a.txt, old hash: ..., cur hash: ...`,表示 `a.txt` 的内容出现了改变 \ No newline at end of file From acc360ef50562a79c2fb8c847375c357929b8d0b Mon Sep 17 00:00:00 2001 From: Muggle Wei Date: Wed, 10 Jan 2024 01:01:58 +0800 Subject: [PATCH 5/6] update README_cn --- README_cn.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README_cn.md b/README_cn.md index bac3538..9299329 100644 --- a/README_cn.md +++ b/README_cn.md @@ -31,7 +31,7 @@ pip install bdchecker **bdchecker** 包含三个命令, 分别为 * gen: 扫描目录, 并递归遍历生成该目录下所有**新增**文件的 hash 信息, 放置在目录中的 `.bdchecker.meta` 文件夹中 * clean: 扫描目录, 从 hash 信息中清理掉已删除的文件 -* check: 扫描目录, 查找出现损坏的文件 +* check: 扫描目录, 查找出现损坏的文件 (注意, 此操作会计算所有文件的 hash 值, 每次都会较为耗费时间) ### 示例目录 假设当前有如下目录结构 From feae9f099bc00bb0eb01640bda6d20d3ace1b0c5 Mon Sep 17 00:00:00 2001 From: Muggle Wei Date: Wed, 10 Jan 2024 14:09:01 +0800 Subject: [PATCH 6/6] update readme --- README.md | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++- README_cn.md | 10 +++++-- 2 files changed, 84 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index ee19948..d8e7835 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,78 @@ * [readme 中文](./README_cn.md) -* [readme EN](./README.md) \ No newline at end of file +* [readme EN](./README.md) + +## Overview +bdchecker (**B**ackup **D**ata Checker) is a tool for checking personal 
cold backup data, helping you discover data corruption in time + +## Why use it +Imagine that we have some data that needs to be cold backed up, which may be the raw market data of a certain financial market that is compressed every day; or some electronic versions of classic movies owned by individuals; or some keys that are not used all year round; let’s first list some options: + +| Storage plan | Storage life span | +| ---- | ---- | +| SSD | several years to more than ten years | +| HDD | 10+ years | +| tape drive | 30+ years | +| punched paper | thousands of years | +| Carved in stone (Luo Ji raised his crutch above his head and shouted solemnly) | millions of years | + +There is no doubt that if you have enough financial resources to engrave the information on stone and store it properly, it should be very safe unless you are hit by a dual-vector foil attack; but for individuals, the cost of reading information from stones should be far greater than the value of the data we need to save. +So when considering the ease of reading and writing of data, there is no doubt that the hard disk is the most convenient; but this brings an additional requirement: we need to regularly check whether the data is corrupted, and this is exactly what **bdchecker** is for + +## Install +* use pip +``` +pip install bdchecker +``` +* download from the project's [Releases](https://github.com/MuggleWei/bdchecker/releases), and decompress it + +## Usage +**bdchecker** includes 3 sub-commands +* gen: scan directory, recursively traverse to generate the hash information of all **new** files in the directory, and place them in the `.bdchecker.meta` folder. 
+* clean: scan directory, clean deleted files from hash information +* check: scan directory, find corrupted files (note that this operation will calculate the hash value of all files, which is relatively time-consuming) + +### Example directory +Assume that we currently have the following directory structure +``` +data +├──── a.txt +├──── b.txt +└──── c + ├──── c1.txt + └──── c2.txt +``` + +### Command: gen +Generate hash information +``` +bdchecker gen -d data -v 1 +``` +* `-d`: directory for which information needs to be generated +* `-v`: verbose level + +After the command completes, you can see the console output: `dump meta info to data/.bdchecker.meta/sha256.csv` +When there are no new files in the directory, repeatedly executing the `gen` command will not actually generate the hash information of the file. + +### Command: clean +remove `data/c/c2.txt`, then run +``` +bdchecker clean -d data -v 1 +``` +You can see in the last few lines of the log: `clean missing file's meta info: c/c2.txt`, which means that we have successfully cleaned the hash information corresponding to the file. + +### Command: check +run +``` +bdchecker check -d data -v 1 +``` +The last line of the log shows: `all check pass`, which means there are no new/deleted files and no files are corrupted. + +Now, let's modify `a.txt`, write something random into it, and then run again +``` +bdchecker check -d data -v 1 +``` +At this time, an error message appears in the log: `check failed: a.txt, old hash: ..., cur hash: ...`, indicating that the content of `a.txt` has changed. + +### Migration and comparison +The hash information generated by `bdchecker` will be saved in the `.bdchecker.meta` folder inside the directory, so you can directly migrate the entire folder during migration. +When multiple backup copies already exist and no hash values have been generated yet, you can use the `bdchecker gen` command to generate hash values for each backup copy, and then compare the two files. 
Since the generated file lines are already sorted, you can directly use commands such as `diff` for comparison. diff --git a/README_cn.md b/README_cn.md index 9299329..fd988e4 100644 --- a/README_cn.md +++ b/README_cn.md @@ -17,7 +17,7 @@ bdchecker (**B**ackup **D**ata Checker) 是用于个人冷备数据检查的工 | 打孔纸带 | 千年 | | 刻在石头上 (罗辑把拐杖高举过头, 庄严地喊道) | 百万年 | -毫无疑问, 若是有足够的财力, 把信息刻在石头上并妥当存储, 除非遭到了二向箔攻击, 否则应该十分安全;但是对于个人而言, 从石头上读取信息带来的成本应该是远大于我们需要保存的数据的价值的 +毫无疑问, 若是有足够的财力, 把信息刻在石头上并妥当存储, 除非遭到了二向箔攻击, 否则应该十分安全;但是对于个人而言, 从石头上读取信息带来的成本应该是远大于我们需要保存的数据的价值 所以当考虑数据的易于读写性的时候, 那么毫无疑问, 硬盘是最为方便的;但是这带来了额外的要求, 那便是我们需要定期的检查数据是否出现了损坏, 这便可以通过 **bdchecker** 来实现 ## 安装 @@ -31,7 +31,7 @@ pip install bdchecker **bdchecker** 包含三个命令, 分别为 * gen: 扫描目录, 并递归遍历生成该目录下所有**新增**文件的 hash 信息, 放置在目录中的 `.bdchecker.meta` 文件夹中 * clean: 扫描目录, 从 hash 信息中清理掉已删除的文件 -* check: 扫描目录, 查找出现损坏的文件 (注意, 此操作会计算所有文件的 hash 值, 每次都会较为耗费时间) +* check: 扫描目录, 查找出现损坏的文件 (注意, 此操作会计算所有文件的 hash 值, 较为耗费时间) ### 示例目录 假设当前有如下目录结构 @@ -74,4 +74,8 @@ bdchecker check -d data -v 1 ``` bdchecker check -d data -v 1 ``` -此时,日志出现错误信息: `check failed: a.txt, old hash: ..., cur hash: ...`,表示 `a.txt` 的内容出现了改变 \ No newline at end of file +此时,日志出现错误信息: `check failed: a.txt, old hash: ..., cur hash: ...`,表示 `a.txt` 的内容出现了改变 + +## 迁移与对比 +由 `bdchecker` 生成的 hash 信息会保存在目录中的 `.bdchecker.meta` 目录中, 所以迁移时直接整个文件夹迁移即可 +当已经有多份冷备数据存在, 且并没有生成过 hash 值时; 此时可以对每份冷备数据都使用 `bdchecker gen` 命令来生成 hash 值, 接着对比两份文件即可. 由于生成文件行是已经排序的, 所以可以直接使用 `diff` 之类的命令进行对比