From 91ad4ade4e338747a6106217a408e062d4806e62 Mon Sep 17 00:00:00 2001 From: WINEZERO Date: Wed, 26 Jul 2023 05:46:55 +0800 Subject: [PATCH] =?UTF-8?q?0.6.0=20=E6=9B=B4=E6=96=B0request=5Fplus?= =?UTF-8?q?=E4=BE=9D=E8=B5=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 0.6.0 更新request_plus依赖, 1、实现主动重试等 2、响应体、相应头数据返回(可选) --- DynaScan.py | 10 +- libs/lib_args/input_parse.py | 2 +- libs/lib_requests/check_protocol.py | 2 +- libs/lib_requests/requests_const.py | 162 ++++--- libs/lib_requests/requests_plus.py | 406 ++++++------------ libs/lib_requests/requests_thread.py | 52 ++- .../{requests_tools.py => requests_utils.py} | 71 +-- libs/lib_requests/response_handle.py | 215 ++++++++++ libs/lib_url_analysis/parse_words.py | 18 +- setting_com.py | 2 +- 10 files changed, 554 insertions(+), 386 deletions(-) rename libs/lib_requests/{requests_tools.py => requests_utils.py} (84%) create mode 100644 libs/lib_requests/response_handle.py diff --git a/DynaScan.py b/DynaScan.py index 7861f9b..3160abd 100644 --- a/DynaScan.py +++ b/DynaScan.py @@ -9,9 +9,9 @@ from libs.lib_dyna_rule.set_basic_var import set_base_var_dict_with_freq from libs.lib_dyna_rule.set_depend_var import set_dependent_var_dict from libs.lib_file_operate.rw_json_file import load_json_to_dict, dump_dict_to_json -from libs.lib_requests.requests_const import HTTP_FILTER_VALUE_DICT, HTTP_FILTER_IGNORE_KEYS, HTTP_CONST_SIGN +from libs.lib_requests.requests_const import FILTER_HTTP_VALUE_DICT, FILTER_DYNA_IGNORE_KEYS, HTTP_CONST_SIGN from libs.lib_requests.requests_thread import multi_thread_requests_url, multi_thread_requests_url_sign -from libs.lib_requests.requests_tools import get_random_str, analysis_dict_same_keys, access_result_handle +from libs.lib_requests.requests_utils import get_random_str, analysis_dict_same_keys, access_result_handle from libs.lib_url_analysis.parse_path import get_curr_dir_url from libs.lib_url_analysis.url_utils import combine_urls_and_paths, get_segment_urls from libs.lib_url_analysis.parse_host import get_proto, get_host_port @@ -111,7 +111,7 @@ def init_load_dict(config_dict): # 循环读取每个文件夹下的规则字典 for rule_dir in cur_rule_dir_list: - # 1、获取基本变量替换字典 # 只获取目标文件的下的依赖 + # 1、获取基本变量替换字典 # 只获取目标文件下的依赖 base_replace_dict = set_base_var_dict_with_freq( base_var_dir=config_dict[GB_BASE_VAR_DIR].joinpath(rule_dir), ext_list=config_dict[GB_DICT_SUFFIX], @@ -192,8 +192,8 @@ def gen_dynamic_exclude_dict(target_url, config_dict): # 分析测试结果 dynamic_exclude_dict = analysis_dict_same_keys(test_result_dict_list, - HTTP_FILTER_VALUE_DICT, - HTTP_FILTER_IGNORE_KEYS) + FILTER_HTTP_VALUE_DICT, + FILTER_DYNA_IGNORE_KEYS) return dynamic_exclude_dict diff --git a/libs/lib_args/input_parse.py b/libs/lib_args/input_parse.py index 284c953..55a0796 100644 --- a/libs/lib_args/input_parse.py +++ b/libs/lib_args/input_parse.py @@ -7,7 +7,7 @@ from libs.lib_log_print.logger_printer import output, LOG_ERROR from libs.lib_args.input_const import * from libs.lib_requests.requests_const import HTTP_USER_AGENTS -from libs.lib_requests.requests_tools import random_useragent, random_x_forwarded_for +from libs.lib_requests.requests_utils import random_useragent, random_x_forwarded_for def args_parser(config_dict): diff --git a/libs/lib_requests/check_protocol.py b/libs/lib_requests/check_protocol.py index 5d6f34c..5f3e226 100644 --- a/libs/lib_requests/check_protocol.py +++ b/libs/lib_requests/check_protocol.py @@ -100,7 +100,7 @@ def check_url_list_access(target_list, ) # 分析多线程检测结果 for access_result_dict in access_result_dict_list: - req_url = access_result_dict[HTTP_REQ_URL] + req_url = access_result_dict[HTTP_REQ_TARGET] resp_status = access_result_dict[HTTP_RESP_STATUS] if resp_status > 0: output(f"[*] 当前目标 {req_url} 将被添加 响应结果:{access_result_dict}", level=LOG_INFO) diff --git a/libs/lib_requests/requests_const.py b/libs/lib_requests/requests_const.py index 5f681ec..83318ac 100644 --- a/libs/lib_requests/requests_const.py +++ b/libs/lib_requests/requests_const.py @@ -2,70 +2,130 @@ # encoding: utf-8 ############################################################ -# 响应需要的内容 -HTTP_CONST_SIGN = "HTTP_CONST_SIGN" -HTTP_REQ_URL = "HTTP_REQ_URL" - -HTTP_RESP_STATUS = "HTTP_RESP_STATUS" -HTTP_RESP_REDIRECT_URL = "HTTP_RESP_REDIRECT_URL" -HTTP_RESP_TEXT_HASH = "HTTP_RESP_TEXT_HASH" -HTTP_RESP_TEXT_TITLE = "HTTP_RESP_TEXT_TITLE" -HTTP_RESP_TEXT_SIZE = "HTTP_RESP_TEXT_SIZE" -HTTP_RESP_CONTENT_LENGTH = "HTTP_RESP_CONTENT_LENGTH" -HTTP_RESP_BYTES_HEAD = "HTTP_RESP_BYTES_HEAD" +# 需要返回的响应内容 +HTTP_REQ_TARGET = "HTTP_REQ_TARGET" # 用户输入的请求地址 +HTTP_CONST_SIGN = "HTTP_CONST_SIGN" # 用户输入的自定义标记 + +HTTP_RESP_STATUS = "HTTP_RESP_STATUS" # 响应状态码 + +HTTP_RESP_LENGTH = "HTTP_RESP_LENGTH" # 响应头中的CL头部 +HTTP_RESP_HEADERS_CRC = "HTTP_RESP_HEADERS_CRC" # 响应实际头部 HASH标记 +HTTP_RESP_HEADERS_OPT = "HTTP_RESP_HEADERS_OPT" # 响应实际头部 (OP=可选) + +HTTP_RESP_REDIRECT = "HTTP_RESP_REDIRECT" # 响应中的请求URL,302时不一定相同 + +HTTP_RESP_CONTENT_CRC = "HTTP_RESP_CONTENT_CRC" # 响应实际内容 HASH标记 +HTTP_RESP_CONTENT_OPT = "HTTP_RESP_CONTENT_OPT" # 响应实际内容 (OP=可选) +HTTP_RESP_SIZE = "HTTP_RESP_SIZE" # 响应内容标记 大小标记 +HTTP_RESP_TITLE = "HTTP_RESP_TITLE" # 响应文本的标题 ############################################################ # 一些响应值的常量 -HTTP_NONE = None +HTTP_MAXIMUM_READ = 1024000 # 设置最大读取的响应内容(字数) 一般网页有150字X3000行 +NONE = None +NULL = "" + # 状态码常量 -HTTP_STATUS_MINUS = -1 -HTTP_STATUS_ZERO = 0 -HTTP_STATUS_ONE = 1 +RESP_STATUS_DEFAULT = "RESP_STATUS_DEFAULT" # 0 # 没有任何操作时候的 默认值 +RESP_STATUS_IGNORE = "RESP_STATUS_IGNORE" # 1 # 已知 错误 情况的 标记赋值 # 不需要手动处理 +RESP_STATUS_ERROR = "RESP_STATUS_ERROR" # -1 # 未知 错误 情况的 标记赋值 # 需要手动处理 + +# 响应头长度常量 +RESP_LENGTH_DEFAULT = "RESP_LENGTH_DEFAULT" # 没有任何操作时候的 默认值 +RESP_LENGTH_BLANK = "RESP_LENGTH_BLANK" # 获取结果为空白 +RESP_LENGTH_ERROR = "RESP_LENGTH_ERROR" # 未知 错误 情况的 标记赋值 + +# 响应实际头部HASH +RESP_HEADERS_CRC_DEFAULT = "RESP_HEADERS_CRC_DEFAULT" # 没有任何操作时候的 默认值 +RESP_HEADERS_CRC_ERROR = "RESP_HEADERS_CRC_ERROR" # 未知 错误 情况的 标记赋值 +RESP_HEADERS_CRC_BLANK = "RESP_HEADERS_CRC_BLANK" # 获取结果为空白 + +# 响应实际头部 +RESP_HEADERS_DEFAULT = "RESP_HEADERS_DEFAULT" # 没有任何操作时候的 默认值 +RESP_HEADERS_ERROR = "RESP_HEADERS_ERROR" # 未知 错误 情况的 标记赋值 +RESP_HEADERS_BLANK = "RESP_HEADERS_BLANK" # 获取结果为空白 +RESP_HEADERS_IGNORE = "RESP_HEADERS_IGNORE" # 已知 错误 情况的 标记赋值 + # 重定向常量 -HTTP_NULL_REDIRECT_URL = "HTTP_NULL_REDIRECT_URL" -HTTP_RAW_REDIRECT_URL = "HTTP_RAW_REDIRECT_URL" -# 响应内容常量 -HTTP_BLANK_BYTES = "HTTP_BLANK_BYTES" -HTTP_NULL_BYTES = "HTTP_NULL_BYTES" -# 文本HASH常量 -HTTP_IGNORE_TEXT_HASH = "HTTP_IGNORE_TEXT_HASH" -HTTP_NULL_TEXT_HASH = "HTTP_NULL_TEXT_HASH" +RESP_REDIRECT_DEFAULT = "RESP_REDIRECT_DEFAULT" # 没有任何操作时候的 默认值 +RESP_REDIRECT_ORIGIN = "RESP_REDIRECT_ORIGIN" # 获取结果为原始情况 +RESP_REDIRECT_ERROR = "RESP_REDIRECT_ERROR" # 未知 错误 情况的 标记赋值 + +# 响应实际内容HASH +RESP_CONTENT_CRC_DEFAULT = "RESP_CONTENT_CRC_DEFAULT" # 没有任何操作时候的 默认值 +RESP_CONTENT_CRC_ERROR = "RESP_CONTENT_CRC_ERROR" # 未知 错误 情况的 标记赋值 +RESP_CONTENT_CRC_BLANK = "RESP_CONTENT_CRC_BLANK" # 获取结果为空白 +RESP_CONTENT_CRC_LARGE = "RESP_CONTENT_CRC_LARGE" # 获取结果为空白 超限 + +# 响应实际内容 +RESP_CONTENT_DEFAULT = "RESP_CONTENT_DEFAULT" # 没有任何操作时候的 默认值 +RESP_CONTENT_LARGE = "RESP_CONTENT_LARGE" # 获取结果为空白 +RESP_CONTENT_ERROR = "RESP_CONTENT_ERROR" # 未知 错误 情况的 标记赋值 +RESP_CONTENT_IGNORE = "RESP_CONTENT_IGNORE" # 已知 情况的 标记赋值 +RESP_CONTENT_BLANK = "RESP_CONTENT_BLANK" # 已知 情况的 标记赋值 + +# 文本大小常量 +RESP_SIZE_DEFAULT = "RESP_SIZE_DEFAULT" # 没有任何操作时候的 默认值 +RESP_SIZE_ERROR = "RESP_SIZE_ERROR" # 未知 错误 情况的 标记赋值 +RESP_SIZE_LARGE = "RESP_SIZE_LARGE" # 已知 情况的 内容太大 +RESP_SIZE_BLANK = "RESP_SIZE_BLANK" # 获取结果为空白 + # 文本标题常量 -HTTP_BLANK_TITLE = "HTTP_BLANK_TITLE" -HTTP_NULL_TITLE = "HTTP_NULL_TITLE" -HTTP_IGNORE_TITLE = "HTTP_IGNORE_TITLE" +RESP_TITLE_DEFAULT = "RESP_TITLE_DEFAULT" # 没有任何操作时候的 默认值 +RESP_TITLE_ERROR = "RESP_TITLE_ERROR" # 未知 错误 情况的 标记赋值 +RESP_TITLE_LARGE = "RESP_TITLE_LARGE" # 已知 情况的 内容太大 +RESP_TITLE_BLANK = "RESP_TITLE_BLANK" # 获取结果为空白 ############################################################ # 默认的响应字典,使用前被copy一份 -HTTP_DEFAULT_RESP_DICT = { - HTTP_REQ_URL: HTTP_NONE, # 请求的URL 必须在请求时填充 - HTTP_CONST_SIGN: HTTP_NONE, # 请求自定义的标记, 必须在请求时填充 原样返回 - HTTP_RESP_STATUS: HTTP_STATUS_MINUS, # 响应状态码 赋值默认值 - HTTP_RESP_BYTES_HEAD: HTTP_NULL_BYTES, # 响应头字节 赋值默认值 - HTTP_RESP_CONTENT_LENGTH: HTTP_STATUS_MINUS, # 响应内容长度 赋值默认值 - HTTP_RESP_TEXT_SIZE: HTTP_STATUS_MINUS, # 响应内容大小 赋值默认值 - HTTP_RESP_TEXT_TITLE: HTTP_NULL_TITLE, # 响应文本标题 赋值默认值 - HTTP_RESP_TEXT_HASH: HTTP_NULL_TEXT_HASH, # 响应文本HASH 赋值默认值 - HTTP_RESP_REDIRECT_URL: HTTP_NULL_REDIRECT_URL, # 响应重定向URL 赋值默认值 +DEFAULT_HTTP_RESP_DICT = { + HTTP_REQ_TARGET: NONE, # 请求的URL, 必须在请求时填充 + HTTP_CONST_SIGN: NONE, # 请求自定义的标记, 必须在请求时填充 原样返回 + + HTTP_RESP_STATUS: RESP_STATUS_DEFAULT, # 响应状态码 赋值默认值 + HTTP_RESP_LENGTH: RESP_LENGTH_DEFAULT, # 响应CL长度 赋值默认值 + + HTTP_RESP_SIZE: RESP_SIZE_DEFAULT, # 响应文本大小 赋值默认值 + HTTP_RESP_TITLE: RESP_TITLE_DEFAULT, # 响应文本标题 赋值默认值 + + HTTP_RESP_REDIRECT: RESP_REDIRECT_DEFAULT, # 响应重定向URL 赋值默认值 + + HTTP_RESP_HEADERS_CRC: RESP_HEADERS_CRC_DEFAULT, # 响应头部HASH 赋值默认值 + HTTP_RESP_CONTENT_CRC: RESP_CONTENT_CRC_DEFAULT, # 响应文本HASH 赋值默认值 + + HTTP_RESP_HEADERS_OPT: RESP_HEADERS_DEFAULT, # 响应头部信息 赋值默认值 + HTTP_RESP_CONTENT_OPT: RESP_CONTENT_DEFAULT, # 响应内容信息 赋值默认值 } ############################################################ # 每个响应键的默认值或空值,在动态筛选时被忽略 -HTTP_FILTER_VALUE_DICT = { - HTTP_REQ_URL: [HTTP_NONE, ""], - HTTP_CONST_SIGN: [HTTP_NONE, ""], - HTTP_RESP_STATUS: [HTTP_STATUS_MINUS, HTTP_STATUS_ZERO, HTTP_STATUS_ONE], - HTTP_RESP_BYTES_HEAD: [HTTP_NULL_BYTES, HTTP_BLANK_BYTES], - HTTP_RESP_CONTENT_LENGTH: [HTTP_STATUS_MINUS, HTTP_STATUS_ZERO], - HTTP_RESP_TEXT_TITLE: [HTTP_NULL_TITLE, HTTP_IGNORE_TITLE, HTTP_BLANK_TITLE], - HTTP_RESP_TEXT_HASH: [HTTP_NULL_TEXT_HASH, HTTP_IGNORE_TEXT_HASH], - HTTP_RESP_TEXT_SIZE: [HTTP_STATUS_MINUS, HTTP_STATUS_ZERO], - HTTP_RESP_REDIRECT_URL: [HTTP_NULL_REDIRECT_URL, HTTP_RAW_REDIRECT_URL], +FILTER_HTTP_VALUE_DICT = { + HTTP_REQ_TARGET: [NONE, NULL], + HTTP_CONST_SIGN: [NONE, NULL], + + HTTP_RESP_STATUS: [RESP_STATUS_DEFAULT, RESP_STATUS_ERROR, RESP_STATUS_IGNORE, NONE, NULL], + HTTP_RESP_LENGTH: [RESP_LENGTH_DEFAULT, RESP_LENGTH_ERROR, RESP_LENGTH_BLANK, NONE, NULL], + HTTP_RESP_SIZE: [RESP_SIZE_DEFAULT, RESP_SIZE_ERROR, RESP_SIZE_LARGE, RESP_SIZE_BLANK, NONE, NULL], + HTTP_RESP_TITLE: [RESP_TITLE_DEFAULT, RESP_TITLE_ERROR, RESP_TITLE_LARGE, RESP_TITLE_BLANK , NONE, NULL], + + HTTP_RESP_REDIRECT: [RESP_REDIRECT_DEFAULT, RESP_REDIRECT_ERROR, RESP_REDIRECT_ORIGIN, NONE, NULL], + + HTTP_RESP_HEADERS_CRC: [RESP_HEADERS_CRC_DEFAULT, RESP_HEADERS_CRC_ERROR, RESP_HEADERS_CRC_BLANK, NONE, NULL], + HTTP_RESP_HEADERS_OPT: [RESP_HEADERS_DEFAULT, RESP_HEADERS_ERROR, RESP_HEADERS_BLANK, RESP_HEADERS_IGNORE, NONE, NULL], + + HTTP_RESP_CONTENT_CRC: [RESP_CONTENT_CRC_DEFAULT, RESP_CONTENT_CRC_ERROR, RESP_CONTENT_CRC_BLANK, RESP_CONTENT_CRC_LARGE, NONE, NULL], + HTTP_RESP_CONTENT_OPT: [RESP_CONTENT_DEFAULT, RESP_CONTENT_ERROR, RESP_CONTENT_BLANK, RESP_CONTENT_LARGE, RESP_CONTENT_IGNORE, NONE, NULL], } -# 分析动态排除字典时,需要被忽略的键列表 -HTTP_FILTER_IGNORE_KEYS = [HTTP_CONST_SIGN, HTTP_REQ_URL] +# 分析动态排除字典时,需要被忽略的键列表, 应该动态性强的(用户输入的、响应头的时间戳、) +FILTER_DYNA_IGNORE_KEYS = [HTTP_CONST_SIGN, + HTTP_REQ_TARGET, + HTTP_RESP_CONTENT_OPT, + HTTP_RESP_HEADERS_OPT, + ] ############################################################ -# 记录由于代理服务器导致的协议判断不正确响应关键字 -HTTP_ERROR_PAGE_KEY = ["burp suite"] -# burpsuite中可通过 [勾选抑制错误消息] 修复该问题 +# 默认请求头 +HTTP_HEADERS = { + 'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)', + 'Accept-Encoding': '' +} ############################################################ # 随机HTTP头 HTTP_USER_AGENTS = [ diff --git a/libs/lib_requests/requests_plus.py b/libs/lib_requests/requests_plus.py index e1fbe59..1b1d8c5 100644 --- a/libs/lib_requests/requests_plus.py +++ b/libs/lib_requests/requests_plus.py @@ -1,56 +1,33 @@ #!/usr/bin/env python # encoding: utf-8 -import hashlib -import re import sys -import time -from binascii import b2a_hex -from urllib.parse import urlparse, urljoin, quote +from datetime import datetime +from urllib.parse import urlparse, urljoin import requests -from requests.adapters import HTTPAdapter from requests.packages.urllib3.util.retry import Retry -from libs.lib_log_print.logger_printer import output, LOG_DEBUG, LOG_ERROR +from libs.lib_log_print.logger_printer import output, LOG_ERROR from libs.lib_requests.requests_const import * -from libs.lib_requests.requests_tools import list_ele_in_str, content_encode - +from libs.lib_requests.requests_utils import get_random_str +from libs.lib_requests.response_handle import show_requests_error, handle_common_error, get_resp_header_info, \ + get_resp_redirect_url, get_resp_text_info, retry_action_check requests.packages.urllib3.disable_warnings() -sys.dont_write_bytecode = True # 设置不生成pyc文件 - - -# 处理错误消息 -def show_requests_error(url_info, common_error_list, module_name, error_info): - # 把常规错误的关键字加入列表common_error_list内,列表为空时都作为非常规错误处理 - common_error_flag = list_ele_in_str(common_error_list, str(error_info), default=False) - if common_error_flag: - output(f"[-] 当前目标 {url_info} COMMON ERROR ON Acquire [{module_name}]: [{error_info}]", level=LOG_DEBUG) - else: - output(f"[-] 当前目标 {url_info} OTHERS ERROR ON Acquire [{module_name}]: [{error_info}]", level=LOG_ERROR) # 支持重试等操作的请求库 -def requests_plus(req_url, - req_method='GET', - req_headers=None, - req_data=None, - req_proxies=None, - req_timeout=10, - verify_ssl=False, - req_allow_redirects=False, - req_stream=False, - retry_times=0, - const_sign=None, - add_host_header=None, - add_refer_header=None, - ignore_encode_error=None): +def requests_plus(req_url, req_method='GET', req_headers=None, req_data=None, req_proxies=None, req_timeout=10, + verify_ssl=False, req_allow_redirects=False, req_stream=False, retry_times=3, const_sign=None, + add_host_header=None, add_refer_header=None, ignore_encode_error=None, + resp_headers_need=False, resp_content_need=False, active_retry_dict=None, + ): # const_sign # 设置本请求的标记 - const_sign = const_sign or str(time.time()) + random = get_random_str(length=5, has_num=True, has_char=False, has_capital=False) + const_sign = const_sign or datetime.now().strftime(f'%Y%m%d-%H%M%S-{random}') # 设置默认请求头 if not req_headers: - req_headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)', - 'Accept-Encoding': ''} + req_headers = HTTP_HEADERS # 需要动态添加host字段 if add_host_header: @@ -61,267 +38,146 @@ def requests_plus(req_url, req_headers["Referer"] = urljoin(req_url, "./") # 设置需要接受的参数的默认值 #如果返回结果是默认值,说明程序异常没有获取到 - resp_status = HTTP_DEFAULT_RESP_DICT[HTTP_RESP_STATUS] # 响应状态码 赋值默认值 NUM_MINUS - resp_bytes_head = HTTP_DEFAULT_RESP_DICT[HTTP_RESP_BYTES_HEAD] # 响应头字节 赋值默认值 NULL_BYTES - resp_content_length = HTTP_DEFAULT_RESP_DICT[HTTP_RESP_CONTENT_LENGTH] # 响应内容长度 赋值默认值 NUM_MINUS - resp_text_size = HTTP_DEFAULT_RESP_DICT[HTTP_RESP_TEXT_SIZE] # 响应内容大小 赋值默认值 NUM_MINUS - resp_text_title = HTTP_DEFAULT_RESP_DICT[HTTP_RESP_TEXT_TITLE] # 响应文本标题 赋值默认值 NULL_TITLE - resp_text_hash = HTTP_DEFAULT_RESP_DICT[HTTP_RESP_TEXT_HASH] # 响应文本HASH 赋值默认值 NULL_TEXT_HASH - resp_redirect_url = HTTP_DEFAULT_RESP_DICT[HTTP_RESP_REDIRECT_URL] # 响应重定向URL 赋值默认值 NULL_REDIRECT_URL + resp_status = DEFAULT_HTTP_RESP_DICT[HTTP_RESP_STATUS] # 响应状态码 # 完成 + resp_headers_opt = DEFAULT_HTTP_RESP_DICT[HTTP_RESP_HEADERS_OPT] # 响应实际头部 (OP=可选) # 完成 + resp_hash_headers = DEFAULT_HTTP_RESP_DICT[HTTP_RESP_HEADERS_CRC] # 响应实际头部 HASH标记 # 完成 + resp_length = DEFAULT_HTTP_RESP_DICT[HTTP_RESP_LENGTH] # 响应头中的CL头部 # 完成 + + resp_content_opt = DEFAULT_HTTP_RESP_DICT[HTTP_RESP_CONTENT_OPT] # 响应实际内容 (OP=可选) + resp_text_size = DEFAULT_HTTP_RESP_DICT[HTTP_RESP_SIZE] # 响应内容 大小标记 + resp_text_title = DEFAULT_HTTP_RESP_DICT[HTTP_RESP_TITLE] # 响应内容 网页标题 + resp_hash_content = DEFAULT_HTTP_RESP_DICT[HTTP_RESP_CONTENT_CRC] # 响应实际内容 HASH标记 + + resp_redirect_url = DEFAULT_HTTP_RESP_DICT[HTTP_RESP_REDIRECT] # 从响应中获取302的请求URL 应该有别的办法 try: - resp = request_retry(req_url=req_url, - req_method=req_method, - req_headers=req_headers, - req_data=req_data, - req_proxies=req_proxies, - req_timeout=req_timeout, - verify_ssl=verify_ssl, - req_allow_redirects=req_allow_redirects, - req_stream=req_stream) + resp = request_base(target=req_url, + method=req_method, + headers=req_headers, + data=req_data, + proxies=req_proxies, + timeout=req_timeout, + verify=verify_ssl, + allow_redirects=req_allow_redirects, + stream=req_stream) resp_status = resp.status_code except Exception as error: - # 当错误原因时一般需要重试的错误时,直接忽略输出,进行访问重试 - current_module = HTTP_RESP_STATUS # 把常规错误的关键字加入列表内,列表为空时都作为非常规错误处理 - module_common_error_list = ["without response", - "retries", - "Read timed out", - "codec can't encode", - "No host supplied", - "Exceeded 30 redirects", - 'WSAECONNRESET'] + current_module = HTTP_RESP_STATUS + module_common_error_list = ["without response", "retries", "Read timed out", + "codec can't encode", "No host supplied", + "Exceeded 30 redirects", 'WSAECONNRESET'] show_requests_error(req_url, module_common_error_list, current_module, error) - # 如果是数据编码错误,需要进行判断处理 - if "codec can't encode" in str(error): - # 如果是数据编码错误,就不再进行尝试 ,返回固定结果状态码 - # 'latin-1' codec can't encode characters in position 17-18: ordinal not in range(256) - if ignore_encode_error: - # 不需要重试的结果 设置resp_status标记为1, - resp_status = HTTP_STATUS_ONE - output(f"[-] 当前目标 {req_url} 中文数据编码错误,但是已经开启中文编码处理功能,忽略本次错误!!!", level=LOG_DEBUG) - else: - # 需要手动访问重试的结果 - output(f"[-] 当前目标 {req_url} 中文数据编码错误,需要针对中文编码进行额外处理,返回固定结果!!!", level=LOG_ERROR) - elif "No host supplied" in str(error): - # 不需要重试的结果 设置resp_status标记为1, - resp_status = HTTP_STATUS_ONE - output(f"[-] 当前目标 {req_url} 格式输入错误,忽略本次结果!!!", level=LOG_ERROR) + + if any(key in str(error) for key in ["codec can't encode", "No host supplied"]): + # 不常见错误中需要重试的类型 + resp_status = handle_common_error(req_url, error, ignore_encode_error) else: - # 如果服务器没有响应,但是也有可能存在能访问的URL,因此不能简单以状态码判断结果 # 如果是其他访问错误,就进程访问重试 - if retry_times > 0: + if retry_times <= 0: + resp_status = RESP_STATUS_ERROR + output(f"[-] 当前目标 {req_url} 剩余重试次数为0, 返回错误状态!", level=LOG_ERROR) + else: + # 处理一种需要额外修改请求头的情况 if "Exceeded 30 redirects" in str(error): - req_headers = None - output(f"[-] 当前目标 {req_url} 即将修改请求头为默认头后进行重试!!!", level=LOG_ERROR) + req_headers = HTTP_HEADERS + output(f"[-] 当前目标 {req_url} 将自动进行请求头修改重试操作", level=LOG_ERROR) output(f"[-] 当前目标 {req_url} 开始进行倒数第 {retry_times} 次重试, TIMEOUT * 1.5...", level=LOG_ERROR) - return requests_plus(req_url=req_url, - req_method=req_method, - req_headers=req_headers, - req_data=req_data, - req_proxies=req_proxies, - req_timeout=req_timeout * 1.5, - verify_ssl=verify_ssl, - req_allow_redirects=req_allow_redirects, - retry_times=retry_times - 1, - const_sign=const_sign, - add_host_header=add_host_header, - add_refer_header=add_refer_header, - ignore_encode_error=ignore_encode_error + return requests_plus(req_url=req_url, req_method=req_method, req_headers=req_headers, req_data=req_data, + req_proxies=req_proxies, req_timeout=req_timeout * 1.5, verify_ssl=verify_ssl, + req_allow_redirects=req_allow_redirects, retry_times=retry_times - 1, + const_sign=const_sign, add_host_header=add_host_header, + add_refer_header=add_refer_header, ignore_encode_error=ignore_encode_error, + resp_headers_need=resp_headers_need, resp_content_need=resp_content_need, + active_retry_dict=active_retry_dict, ) - - else: - # 如果重试次数为小于0,返回固定结果-1 - output(f"[-] 当前目标 {req_url} 剩余重试次数为0,返回固定结果,需要后续手动进行验证...", level=LOG_ERROR) else: - # 当获取到响应结果时,获取三个响应关键匹配项目 + # ############################################################# + # 当获取到响应结果时,获取响应关键匹配项目 + # ############################################################# + # 1 获取响应头相关的数据 resp_headers_opt | resp_hash_headers | resp_length # 流模式|普通模式都可以获取 + resp_headers_opt, resp_hash_headers, resp_length = get_resp_header_info(req_url, resp, resp_headers_need) + # ############################################################# + # 2、获取响应内容相关的信息 # resp_content_opt | resp_text_title | resp_hash_content | resp_text_size + text_info = get_resp_text_info(req_url, resp, req_stream, resp_content_need, resp_length, HTTP_MAXIMUM_READ) + resp_content_opt, resp_hash_content, resp_text_title, resp_text_size = text_info ############################################################# - # 排除由于代理服务器导致的访问BUG - if list_ele_in_str(HTTP_ERROR_PAGE_KEY, str(resp.text).lower(), False): - output("[!] 当前由于代理服务器问题导致响应状态码错误...Fixed...", level=LOG_ERROR) - resp_status = HTTP_STATUS_MINUS + # 3 获取重定向后的URL 通过判断请求的URL是不是响应的URL #需要跟随重定向才行 + resp_redirect_url = get_resp_redirect_url(req_url, resp) ############################################################# - # 1、resp_bytes_head 获取响应内容的前十字节 # 需要流模式才能获取 - current_module = HTTP_RESP_BYTES_HEAD - try: - resp_bytes_head = b2a_hex(resp.raw.read(10)).decode() - if resp_bytes_head.strip() == "": - resp_bytes_head = HTTP_BLANK_BYTES - else: - pass - # output(RESP_BYTES_HEAD, resp_bytes_head) #需要流模式才能获取resp_bytes_head - except Exception as error: - # 当错误原因时一般需要重试的错误时,直接忽略输出,进行访问重试 - module_common_error_list = [] # 把常规错误的关键字加入列表内,列表为空时都作为非常规错误处理 - show_requests_error(req_url, module_common_error_list, current_module, error) - ############################################################# - # 2、resp_content_length 获取响应的content_length头部 - current_module = HTTP_RESP_CONTENT_LENGTH - try: - resp_content_length = int(str(resp.headers.get('Content-Length'))) - except Exception as error: - module_common_error_list = ["invalid literal for int()"] # 把常规错误的关键字加入列表内,列表为空时都作为非常规错误处理 - show_requests_error(req_url, module_common_error_list, current_module, error) - ############################################################# - # 3、resp_text_size 获取响应内容实际长度,如果响应长度过大就放弃读取,从resp_content_length进行读取 - current_module = HTTP_RESP_TEXT_SIZE - - if resp_content_length >= 1024000 * 5: - # 结果文本长度太大,不进行实际获取 - resp_text_size = resp_content_length - else: - try: - resp_text_size = len(resp.text) - except Exception as error: - module_common_error_list = ["content-encoding: gzip", "Connection broken: IncompleteRead"] - # 把常规错误的关键字加入列表内,列表为空时都作为非常规错误处理 - # Received response with content-encoding: gzip, but failed to decode it. # 返回gzip不解压报错 - # ('Connection broken: IncompleteRead(22 bytes read)', IncompleteRead(22 bytes read)) # 使用流模式导致不完全读取报错 - show_requests_error(req_url, module_common_error_list, current_module, error) - ############################################################# - # 4、resp_text_title 获取网页标题,如果resp_text_size获取到了就直接获取 - current_module = HTTP_RESP_TEXT_TITLE - # 如果resp_text_size没有获取到,说明没有resp_text 不参考上级处理结果 - encode_content = "" - try: - if resp_content_length >= 1024000 * 5: - # 如果返回值太大,就忽略获取结果 - resp_text_title = HTTP_IGNORE_TITLE - else: - # 解决响应解码问题 - encode_content = content_encode(resp.content) # type(resp.content) # bytes类型 - re_find_result_list = re.findall(r"(.+?)", encode_content) - resp_text_title = ",".join(re_find_result_list) - # 解决所有系统下字符串无法编码输出的问题,比如windows下控制台gbk的情况下,不能gbk解码就是BUG - # output(f"当前控制台输出编码为:{sys.stdout.encoding}", level=SHOW_ERROR) - # 解决windows下韩文无法输出的问题,如果不能gbk解码就是window BUG - # if sys.platform.lower().startswith('win'): - try: - resp_text_title.encode(sys.stdout.encoding) - except Exception as error: - resp_text_title = quote(resp_text_title.encode('utf-8')) - output(f"[!] 字符串使用当前控制台编码 {sys.stdout.encoding} 编码失败," - f"自动转换为UTF-8型URL编码 {resp_text_title}, " - f"ERROR:{error}", - level=LOG_ERROR) - if resp_text_title.strip() == "": - resp_text_title = HTTP_BLANK_TITLE - except Exception as error: - module_common_error_list = [] # 把常规错误的关键字加入列表内,列表为空时都作为非常规错误处理 - show_requests_error(req_url, module_common_error_list, current_module, error) - ############################################################# - # 5、resp_text_hash 获取网页内容hash - current_module = HTTP_RESP_TEXT_HASH - # 如果resp_text_title是空值,说明结果存在问题 - if resp_text_title != HTTP_NULL_TITLE and encode_content != "": - try: - if resp_content_length >= 1024000 * 5: - # 如果返回值太大,就忽略获取结果 - resp_text_hash = HTTP_IGNORE_TEXT_HASH - else: - resp_text_hash = hashlib.md5(resp.content).hexdigest() - except Exception as error: - module_common_error_list = [] # 把常规错误的关键字加入列表内,列表为空时都作为非常规错误处理 - show_requests_error(req_url, module_common_error_list, current_module, error) - ############################################################# - # 6、resp_redirect_url 获取重定向后的URL 通过判断请求的URL是不是响应的URL - current_module = HTTP_RESP_REDIRECT_URL - try: - if req_url.strip() == resp.url.strip(): - resp_redirect_url = HTTP_RAW_REDIRECT_URL - else: - resp_redirect_url = resp.url.strip() - except Exception as error: - module_common_error_list = [] # 把常规错误的关键字加入列表内,列表为空时都作为非常规错误处理 - show_requests_error(req_url, module_common_error_list, current_module, error) finally: # 最终合并所有获取到的结果 current_resp_dict = { - HTTP_REQ_URL: req_url, # 请求的URL + HTTP_REQ_TARGET: req_url, # 请求的URL HTTP_CONST_SIGN: const_sign, # 请求的标记,自定义标记,原样返回 + HTTP_RESP_STATUS: resp_status, # 响应状态码 - HTTP_RESP_BYTES_HEAD: resp_bytes_head, # 响应头字节 - HTTP_RESP_CONTENT_LENGTH: resp_content_length, # 响应内容长度 - HTTP_RESP_TEXT_SIZE: resp_text_size, # 响应内容大小 - HTTP_RESP_TEXT_TITLE: resp_text_title, # 响应文本标题 - HTTP_RESP_TEXT_HASH: resp_text_hash, # 响应文本HASH - HTTP_RESP_REDIRECT_URL: resp_redirect_url, # 响应重定向URL + + HTTP_RESP_HEADERS_CRC: resp_hash_headers, # 响应头HASH + HTTP_RESP_LENGTH: resp_length, # 响应头中的长度 + + HTTP_RESP_TITLE: resp_text_title, # 响应文本标题 + HTTP_RESP_CONTENT_CRC: resp_hash_content, # 响应内容HASH + HTTP_RESP_SIZE: resp_text_size, # 响应内容大小 + + HTTP_RESP_REDIRECT: resp_redirect_url, # 响应重定向URL + + HTTP_RESP_HEADERS_OPT: resp_headers_opt, # 实际响应头 + HTTP_RESP_CONTENT_OPT: resp_content_opt, # 实际响应内容 } + ############################################################# + # active_retry_dict 主动重试动作 当满足条件时,进行主动请求重试 + if retry_times and retry_action_check(active_retry_dict, current_resp_dict): + output(f"[!] 主动重试 {req_url} retry_times: {retry_times}") + return requests_plus(req_url=req_url, req_method=req_method, req_headers=req_headers, req_data=req_data, + req_proxies=req_proxies, req_timeout=req_timeout * 1.5, verify_ssl=verify_ssl, + req_allow_redirects=req_allow_redirects, retry_times=retry_times - 1, + const_sign=const_sign, add_host_header=add_host_header, + add_refer_header=add_refer_header, ignore_encode_error=ignore_encode_error, + resp_headers_need=resp_headers_need, resp_content_need=resp_content_need, + active_retry_dict=active_retry_dict, ) + ############################################################# output(f"[*] 当前目标 {req_url} 请求返回结果集合:{current_resp_dict}") return current_resp_dict # 支持基本重试的请求操作 -def request_retry(req_url, - req_method='GET', - req_headers=None, - req_data=None, - req_proxies=None, - req_timeout=10, - verify_ssl=False, - req_allow_redirects=False, - retry_times=0, - req_stream=False, - clear_cookies=True - ): - if not retry_times > 0: - # 使用常规请求模式 - response = requests.request(url=req_url, - method=req_method, - headers=req_headers, - data=req_data, - proxies=req_proxies, - timeout=req_timeout, - verify=verify_ssl, - allow_redirects=req_allow_redirects, - stream=req_stream) - return response - else: - # 使用session回话模式 - retry_strategy = Retry( - total=retry_times, # 重试的最大次数 - backoff_factor=1, # 重试的延迟时间因子 默认值为0,表示不延迟。 - status_forcelist=[429, 500, 503, 504], # 需要强制重试的HTTP状态码列表。 - allowed_methods=["HEAD", "GET", "POST"], # 允许进行重试的HTTP请求方法列表。 - connect=5, # 连接超时时间 - read=5, # 读取超时时间 - ) - - adapter = HTTPAdapter(max_retries=retry_strategy) - session = requests.Session() - if clear_cookies: - session.cookies.clear() # 清除 session 中的 cookies - session.mount("https://", adapter) - session.mount("http://", adapter) - response = session.request(url=req_url, - method=req_method, - headers=req_headers, - data=req_data, - proxies=req_proxies, - timeout=req_timeout, - verify=verify_ssl, - allow_redirects=req_allow_redirects, - stream=req_stream) - return response - -# if __name__ == '__main__': -# target_url_path_list = ['http://www.baidu.com/201902.iso', -# 'http://www.baidu.com/%%path%%_4_.gz', -# 'http://www.baidu.com/2013.7z', -# 'http://www.baidu.com/201804.rar', -# 'http://www.baidu.com/201706.z'] -# -# # 导入PY3多线程池模块 -# from concurrent.futures import ThreadPoolExecutor, as_completed -# -# threads_count = 3 # 线程池线程数 -# with ThreadPoolExecutor(max_workers=threads_count) as pool: -# all_task = [] -# for url in target_url_path_list: -# # 把请求任务加入线程池 -# task = pool.submit(requests_plus, target_url=url) -# all_task.append(task) -# # 输出线程返回的结果 -# for future in as_completed(all_task): -# output(future, future.result()) +def request_base(target, method='GET', headers=None, data=None, proxies=None, + timeout=10, verify=False, allow_redirects=False, stream=False): + response = requests.request(url=target, method=method, headers=headers, data=data, proxies=proxies, + timeout=timeout, verify=verify, allow_redirects=allow_redirects, stream=stream) + return response + + +if __name__ == '__main__': + target_url_path_list = ['http://www.baidu.com/201902.iso', + 'http://www.baidu.com/%%path%%_4_.gz', + 'http://www.baidu.com/2013.7z', + 'http://www.baidu.com/201804.rar', + 'http://www.baidu.com/201706.z'] + + action_dict = { + HTTP_RESP_STATUS: [429, 500, 503, 504], # 当状态码处于其中时,需要主动重试 + HTTP_RESP_TITLE: ["浏览器安全检查"], # 当 标题 包含关键字时,需要重试 + HTTP_RESP_CONTENT_OPT: ["浏览器安全检查"], # 当 请求体 包含关键字时,需要重试 + HTTP_RESP_HEADERS_OPT: ["浏览器安全检查"], # 当 请求头 包含关键字时,需要重试 + } + + # 导入PY3多线程池模块 + from concurrent.futures import ThreadPoolExecutor, as_completed + + threads_count = 1 # 线程池线程数 + with ThreadPoolExecutor(max_workers=threads_count) as pool: + all_task = [] + for url in target_url_path_list: + # 把请求任务加入线程池 + task = pool.submit(requests_plus, + req_url=url, + req_stream=False, + active_retry_dict=action_dict + ) + all_task.append(task) + # 输出线程返回的结果 + for future in as_completed(all_task): + output(future, future.result()) diff --git a/libs/lib_requests/requests_thread.py b/libs/lib_requests/requests_thread.py index d163c5b..d25f777 100644 --- a/libs/lib_requests/requests_thread.py +++ b/libs/lib_requests/requests_thread.py @@ -24,7 +24,10 @@ def multi_thread_requests_url(task_list, const_sign, add_host_header, add_refer_header, - ignore_encode_error + ignore_encode_error, + resp_headers_need=False, + resp_content_need=False, + active_retry_dict=None, ): """ # 对URL列表进行访问测试,输出返回响应结果 @@ -50,7 +53,11 @@ def multi_thread_requests_url(task_list, const_sign=const_sign, add_host_header=add_host_header, add_refer_header=add_refer_header, - ignore_encode_error=ignore_encode_error) + ignore_encode_error=ignore_encode_error, + resp_headers_need=resp_headers_need, + resp_content_need=resp_content_need, + active_retry_dict=active_retry_dict, + ) time.sleep(thread_sleep) all_task.append(task) output(f"[*] 当前进度 {task_index + 1}/{len(task_list)} {req_url}", level=LOG_DEBUG) @@ -79,8 +86,10 @@ def multi_thread_requests_url_sign(task_list, # const_sign, add_host_header, add_refer_header, - ignore_encode_error - ): + ignore_encode_error, + resp_headers_need=None, + resp_content_need=None, + active_retry_dict=None): # 存储所有响应结果 access_result_dict_list = [] with ThreadPoolExecutor(max_workers=threads_count) as pool: @@ -100,7 +109,11 @@ def multi_thread_requests_url_sign(task_list, const_sign=const_sign, add_host_header=add_host_header, add_refer_header=add_refer_header, - ignore_encode_error=ignore_encode_error) + ignore_encode_error=ignore_encode_error, + resp_headers_need=resp_headers_need, + resp_content_need=resp_content_need, + active_retry_dict=active_retry_dict, + ) time.sleep(thread_sleep) all_task.append(task) output(f"[*] 当前进度 {task_index + 1}/{len(task_list)} {const_sign}", level=LOG_DEBUG) @@ -119,7 +132,7 @@ def multi_thread_requests_url_body_sign(task_list, # target_url, req_method, req_headers, - # req_data, + # data, req_proxies, req_timeout, verify_ssl, @@ -129,8 +142,10 @@ def multi_thread_requests_url_body_sign(task_list, # const_sign, add_host_header, add_refer_header, - ignore_encode_error - ): + ignore_encode_error, + resp_headers_need=False, + resp_content_need=False, + active_retry_dict=None): # 存储所有响应结果 access_result_dict_list = [] with ThreadPoolExecutor(max_workers=threads_count) as pool: @@ -149,7 +164,11 @@ def multi_thread_requests_url_body_sign(task_list, const_sign=const_sign, add_host_header=add_host_header, add_refer_header=add_refer_header, - ignore_encode_error=ignore_encode_error) + ignore_encode_error=ignore_encode_error, + resp_headers_need=resp_headers_need, + resp_content_need=resp_content_need, + active_retry_dict=active_retry_dict, + ) time.sleep(thread_sleep) access_result_dict_list.append(task) output(f"[*] 当前进度 {task_index + 1}/{len(task_list)} {const_sign}", level=LOG_DEBUG) @@ -163,8 +182,8 @@ def multi_thread_requests_url_body_headers_sign(task_list, thread_sleep, # target_url, req_method, - # req_headers, - # req_data, + # headers, + # data, req_proxies, req_timeout, verify_ssl, @@ -174,7 +193,10 @@ def multi_thread_requests_url_body_headers_sign(task_list, # const_sign, add_host_header, add_refer_header, - ignore_encode_error + ignore_encode_error, + resp_headers_need=False, + resp_content_need=False, + active_retry_dict=None, ): # 存储所有响应结果 access_result_dict_list = [] @@ -194,7 +216,11 @@ def multi_thread_requests_url_body_headers_sign(task_list, const_sign=const_sign, add_host_header=add_host_header, add_refer_header=add_refer_header, - ignore_encode_error=ignore_encode_error) + ignore_encode_error=ignore_encode_error, + resp_headers_need=resp_headers_need, + resp_content_need=resp_content_need, + active_retry_dict=active_retry_dict, + ) time.sleep(thread_sleep) access_result_dict_list.append(task) output(f"[*] 当前进度 {task_index + 1}/{len(task_list)} {const_sign}", level=LOG_DEBUG) diff --git a/libs/lib_requests/requests_tools.py b/libs/lib_requests/requests_utils.py similarity index 84% rename from libs/lib_requests/requests_tools.py rename to libs/lib_requests/requests_utils.py index c3b74d1..e82b6e5 100644 --- a/libs/lib_requests/requests_tools.py +++ b/libs/lib_requests/requests_utils.py @@ -13,6 +13,16 @@ from libs.lib_requests.requests_const import * +def replace_content(content): + # 替换 \n 为空格 + content = content.replace('\n', ' ') + # 替换双引号为单引号 + content = content.replace('"', "'") + # 替换 \ 为 / + content = content.replace('\\', '/') + return content + + def content_encode(content): # 自动分析响应编码 # 1、使用import chardet @@ -39,23 +49,25 @@ def content_encode(content): # 判断列表内的元素是否存在有包含在字符串内的 def list_ele_in_str(list_=None, str_=None, default=False): - if list_ is None: - list_ = [] - - flag = False - if list_: - for ele in list_: - if ele in str_: - flag = True - break - else: + if not list_: flag = default + else: + # flag = False + # for ele in list_: + # if ele in str_: + # flag = True + # break + # 在 lists为空列表时,any(key in string for key in lists) 会返回 False。 + flag = any(key in str(str_) for key in list_) return flag # 获得随机字符串 -def get_random_str(length=12, has_num=True, has_capital=True, has_symbols=False, has_dot=False, with_slash=False): - base_str = 'abcdefghigklmnopqrstuvwxyz' +def get_random_str(length=12, has_char=True, has_num=True, has_capital=True, + has_symbols=False, has_dot=False, with_slash=False): + base_str = "" + if has_char: + base_str += 'abcdefghigklmnopqrstuvwxyz' if has_num: base_str += '0123456789' if has_capital: @@ -115,33 +127,42 @@ def analysis_dict_same_keys(result_dict_list, default_value_dict, filter_ignore_ return same_key_value_dict -def calc_dict_info_hash(resp_dict, crc_mode=True): +def sorted_data_dict(data_dict): + # 快速将响应头字典固定为字符串 + sorted_items = sorted(data_dict.items()) + stores_string = ', '.join([f'{key}: {value}' for key, value in sorted_items]) + return stores_string + + +def calc_dict_info_hash(data_dict, crc_mode=True): # 计算响应结果的特征值 - # 对字典的键值对进行排序 - str_sorted_items = str(sorted(resp_dict.items())) + + # 对字典的键值对进行固定和排序 + str_sorted_items = data_dict if isinstance(data_dict, str) else sorted_data_dict(data_dict) + if crc_mode: # 计算crc32的值,比md5更快 mark_value = binascii.crc32(str_sorted_items.encode()) - mark_value = f"crc32_{mark_value}" + mark_value = f"CRC32_{mark_value}" else: mark_value = hashlib.md5(str_sorted_items.encode()).hexdigest() - mark_value = f"md5_{mark_value}" + mark_value = f"MD5_{mark_value}" return mark_value def copy_dict_remove_keys(resp_dict, remove_keys=None): # 移除响应字典中和URL相关的选项, 仅保留响应部分 - # {'HTTP_REQ_URL': 'https://www.baidu.com/home.rar', # 需要排除 + # {'HTTP_REQ_TARGET': 'https://www.baidu.com/home.rar', # 需要排除 # 'HTTP_CONST_SIGN': 'https://www.baidu.com/home.rar', # 需要排除 - # 'HTTP_RESP_REDIRECT_URL': 'HTTP_RAW_REDIRECT_URL'} # 可选排除 + # 'HTTP_RESP_REDIRECT': 'RESP_REDIRECT_ORIGIN'} # 可选排除 # 保留原始dict数据 copy_resp_dict = copy.copy(resp_dict) if remove_keys is None: - remove_keys = [HTTP_REQ_URL, HTTP_CONST_SIGN] + remove_keys = [HTTP_REQ_TARGET, HTTP_CONST_SIGN] for remove_key in remove_keys: # copy_resp_dict[remove_key] = "" # 清空指定键的值 copy_resp_dict.pop(remove_key, "") # 删除指定键并返回其对应的值 # 删除不存在的键时,指定默认值,不会引发异常 - # output(f"[*] 新的字典键数量:{len(copy_resp_dict.keys())}, 原始字典键数量:{len(resp_dict.keys())}", level=LOG_DEBUG) + # output(f"[*] 新的字典键数量:{len(copy_resp_dict.keys())}, 原始字典键数量:{len(data_dict.keys())}", level=LOG_DEBUG) return copy_resp_dict @@ -162,7 +183,7 @@ def access_result_handle(result_dict_list, should_stop_run = False # 访问失败的结果 # 就是除去URL和SING之外都是默认值 - access_fail_resp_dict = copy_dict_remove_keys(HTTP_DEFAULT_RESP_DICT) + access_fail_resp_dict = copy_dict_remove_keys(DEFAULT_HTTP_RESP_DICT) # 本次扫描的所有命中结果 默认保存的是 请求响应的 CONST_SIGN 属性 hit_result_list = [] @@ -184,7 +205,7 @@ def access_result_handle(result_dict_list, IGNORE_RESP = True # 排除标题被匹配的情况 - resp_text_title = access_resp_dict[HTTP_RESP_TEXT_TITLE] + resp_text_title = access_resp_dict[HTTP_RESP_TITLE] if not IGNORE_RESP and exclude_title_regexp and re.match(exclude_title_regexp, resp_text_title, re.IGNORECASE): IGNORE_RESP = True @@ -193,7 +214,7 @@ def access_result_handle(result_dict_list, for filter_key in list(dynamic_exclude_dict.keys()): filter_value = dynamic_exclude_dict[filter_key] # 被排除的值 access_resp_value = access_resp_dict[filter_key] - ignore_value_list = HTTP_FILTER_VALUE_DICT[filter_key] + ignore_value_list = FILTER_HTTP_VALUE_DICT[filter_key] if access_resp_value != filter_value and access_resp_value not in ignore_value_list: # 存在和排除关键字不同的项, 并且 这个值不是被忽略的值时 写入结果文件 break @@ -204,7 +225,7 @@ def access_result_handle(result_dict_list, if not IGNORE_RESP and isinstance(hit_info_hashes, list): hit_info_hash = calc_dict_info_hash(copy_dict_remove_keys(access_resp_dict)) if hit_info_hash in hit_info_hashes: - output(f"[!] 忽略命中 [{hit_info_hash}] <--> {access_resp_dict[HTTP_REQ_URL]}", level=LOG_ERROR) + output(f"[!] 忽略命中 [{hit_info_hash}] <--> {access_resp_dict[HTTP_REQ_TARGET]}", level=LOG_ERROR) IGNORE_RESP = True else: # output(f"[!] 保留命中 [{hit_info_hash}]", level=LOG_INFO) diff --git a/libs/lib_requests/response_handle.py b/libs/lib_requests/response_handle.py new file mode 100644 index 0000000..bf26290 --- /dev/null +++ b/libs/lib_requests/response_handle.py @@ -0,0 +1,215 @@ +#!/usr/bin/env python +# encoding: utf-8 +import re +import sys +from urllib.parse import quote + +from libs.lib_log_print.logger_printer import output, LOG_DEBUG, LOG_ERROR +from libs.lib_requests.requests_const import * +from libs.lib_requests.requests_const import HTTP_RESP_REDIRECT, RESP_REDIRECT_ORIGIN, RESP_REDIRECT_ERROR, \ + RESP_CONTENT_BLANK, RESP_CONTENT_LARGE, RESP_CONTENT_ERROR, HTTP_RESP_CONTENT_CRC, RESP_CONTENT_CRC_BLANK, \ + RESP_CONTENT_CRC_LARGE, RESP_CONTENT_CRC_ERROR, HTTP_RESP_TITLE, RESP_TITLE_BLANK, RESP_TITLE_LARGE, \ + RESP_TITLE_ERROR, HTTP_RESP_SIZE, RESP_SIZE_BLANK, RESP_SIZE_LARGE, RESP_SIZE_ERROR, HTTP_RESP_CONTENT_OPT, \ + RESP_CONTENT_IGNORE +from libs.lib_requests.requests_utils import sorted_data_dict, calc_dict_info_hash, content_encode, replace_content + + +def show_requests_error(url_info, common_error_list, module_name, error_info): + # 把常规错误的关键字加入列表common_error_list内,列表为空时都作为非常规错误处理 + if any(key in str(error_info) for key in common_error_list): + output(f"[-] 当前目标 {url_info} COMMON ERROR ON Acquire [{module_name}]: [{error_info}]", level=LOG_DEBUG) + else: + output(f"[-] 当前目标 {url_info} OTHERS ERROR ON Acquire [{module_name}]: [{error_info}]", level=LOG_ERROR) + + +def handle_common_error(req_url, error, ignore_encode_error): + if "codec can't encode" in str(error): + # 数据编码错误处理 + if ignore_encode_error: + # 不需要重试的结果 设置resp_status标记为1, + resp_status = RESP_STATUS_IGNORE + output(f"[-] 当前目标 {req_url} 中文数据编码错误, 忽略本次错误...", level=LOG_DEBUG) + else: + # 需要手动访问重试的结果 + resp_status = RESP_STATUS_ERROR + output(f"[-] 当前目标 {req_url} 中文数据编码错误, 返回错误状态!!!", level=LOG_ERROR) + elif "No host supplied" in str(error): + # 目标格式不正确处理 + resp_status = RESP_STATUS_IGNORE + output(f"[-] 当前目标 {req_url} 格式输入错误,忽略本次结果!!!", level=LOG_ERROR) + else: + resp_status = RESP_STATUS_ERROR + output(f"[-] 当前目标 {req_url} 发生未知错误 {error}!!!", level=LOG_ERROR) + return resp_status + + +def get_resp_header_info(req_url, resp, resp_headers_need): + # 获取响应头相关的内容, resp_hash_headers|resp_headers_opt|resp_length + + # 获取原始响应头部 + raw_resp_headers = resp.headers + + current_module = HTTP_RESP_HEADERS_CRC + try: + # 响应头 字符串 HASH + if raw_resp_headers: + resp_hash_headers = calc_dict_info_hash(raw_resp_headers) + else: + resp_hash_headers = RESP_HEADERS_CRC_BLANK + except Exception as error: + show_requests_error(req_url, [], current_module, error) + resp_hash_headers = RESP_HEADERS_CRC_ERROR + + current_module = HTTP_RESP_HEADERS_OPT + try: + # 如果用户需要返回响应头,就进行返回 + if isinstance(resp_headers_need, bool) and resp_headers_need: + resp_headers_opt = sorted_data_dict(raw_resp_headers) + elif isinstance(resp_headers_need, str): + value = raw_resp_headers.get(resp_headers_need) + resp_headers_opt = str(value) if value else RESP_HEADERS_BLANK + elif isinstance(resp_headers_need, list): + # 获取自定义的响应头 + resp_headers_opt = sorted_data_dict({key: str(raw_resp_headers.get(key)) for key in resp_headers_need}) + else: + # 设置为忽略获取 + resp_headers_opt = RESP_HEADERS_IGNORE + except Exception as error: + show_requests_error(req_url, [], current_module, error) + resp_headers_opt = RESP_HEADERS_ERROR + + current_module = HTTP_RESP_LENGTH + try: + if 'Content-Length' in raw_resp_headers.keys(): + resp_length = int(str(raw_resp_headers.get('Content-Length'))) + else: + resp_length = RESP_LENGTH_BLANK + except Exception as error: + module_common_error_list = ["invalid literal for int()"] + show_requests_error(req_url, module_common_error_list, current_module, error) + resp_length = RESP_LENGTH_ERROR + + return resp_headers_opt, resp_hash_headers, resp_length + + +def get_resp_redirect_url(req_url, resp): + # 获取重定向后的URL信息 + current_module = HTTP_RESP_REDIRECT + try: + resp_redirect_url = RESP_REDIRECT_ORIGIN if req_url == resp.url else resp.url + except Exception as error: + show_requests_error(req_url, [], current_module, error) + resp_redirect_url = RESP_REDIRECT_ERROR + return resp_redirect_url + + +def get_resp_text_info(req_url, resp, req_stream, resp_content_need, resp_length, http_maximum_read): + # 1, 先获取原始响应内容 + current_module = "GET_RAW_RESP_CONTENT" + try: + # 1、正常获取到了响应头长度, 判断当前结果大小是否超出限制 + if isinstance(resp_length, int) and resp_length == 0: + encode_content = RESP_CONTENT_BLANK + elif isinstance(resp_length, int) and 0 < resp_length < http_maximum_read: + # 大小没有超出限制, 可以进行正常读取 + encode_content = content_encode(resp.content) # bytes类型 + else: + # 大小超出限制|或者没有发现大小数据,只读取部分数据 + # 如果是流模式,使用raw读取 http_maximum_read + if req_stream: + bytes_content = resp.raw.read(http_maximum_read) + encode_content = content_encode(bytes_content) + else: + encode_content = RESP_CONTENT_LARGE + except Exception as error: + show_requests_error(req_url, [], current_module, error) + encode_content = RESP_CONTENT_ERROR + + # 2、获取响应HASH数据 + current_module = HTTP_RESP_CONTENT_CRC + try: + if current_module in [RESP_CONTENT_BLANK, RESP_CONTENT_ERROR]: + resp_hash_content = RESP_CONTENT_CRC_BLANK + elif current_module in [RESP_CONTENT_LARGE]: + resp_hash_content = RESP_CONTENT_CRC_LARGE + else: + resp_hash_content = calc_dict_info_hash(encode_content, crc_mode=True) + except Exception as error: + show_requests_error(req_url, [], current_module, error) + resp_hash_content = RESP_CONTENT_CRC_ERROR + + # 3、获取响应title + current_module = HTTP_RESP_TITLE + try: + if current_module in [RESP_CONTENT_BLANK, RESP_CONTENT_ERROR]: + resp_text_title = RESP_TITLE_BLANK + elif current_module in [RESP_CONTENT_LARGE]: + resp_text_title = RESP_TITLE_LARGE + else: + try: + re_find = re.findall(r"(.+?)", encode_content, re.IGNORECASE) + resp_text_title = ",".join(re_find) + resp_text_title.encode(sys.stdout.encoding) + except re.error as regex_error: + # 正则表达式匹配错误 + output(f"[!] 正则提取标题失败 ERROR:{regex_error}", level=LOG_ERROR) + resp_text_title = RESP_TITLE_ERROR + except UnicodeEncodeError as encode_error: + resp_text_title = quote(resp_text_title.encode('utf-8')) + output(f"[!] 使用URL编码当前标题 URL标题:{resp_text_title}", level=LOG_ERROR) + except Exception as error: + show_requests_error(req_url, [], current_module, error) + resp_text_title = RESP_TITLE_ERROR + + # 4、获取响应实际大小 + current_module = HTTP_RESP_SIZE + try: + if current_module in [RESP_CONTENT_BLANK, RESP_CONTENT_ERROR]: + resp_text_size = RESP_SIZE_BLANK + elif current_module in [RESP_CONTENT_LARGE]: + resp_text_size = RESP_SIZE_LARGE + else: + resp_text_size = len(encode_content) + except Exception as error: + show_requests_error(req_url, [], current_module, error) + resp_text_size = RESP_SIZE_ERROR + + # 5、提取响应内容 + current_module = HTTP_RESP_CONTENT_OPT + try: + if current_module in [RESP_CONTENT_BLANK, RESP_CONTENT_ERROR, RESP_CONTENT_LARGE]: + resp_content_opt = current_module + else: + # 根据用户输入获取指定的数据 + if isinstance(resp_content_need, bool) and resp_content_need: + resp_content_opt = replace_content(encode_content) + elif isinstance(resp_content_need, str): + try: + re_find = re.findall(resp_content_need, encode_content, re.IGNORECASE) + resp_content_opt = ",".join(re_find) + except re.error as regex_error: + # 正则表达式匹配错误 + output(f"[!] 正则提取数据失败 ERROR:{regex_error}", level=LOG_ERROR) + resp_content_opt = RESP_CONTENT_ERROR + else: + resp_content_opt = RESP_CONTENT_IGNORE + except Exception as error: + show_requests_error(req_url, [], current_module, error) + resp_content_opt = RESP_CONTENT_ERROR + + return resp_content_opt, resp_hash_content, resp_text_title, resp_text_size + + +def retry_action_check(actions_dict, response_dict): + # 声明所有动作的优先级 + if actions_dict and response_dict: + # priority = [HTTP_RESP_STATUS, HTTP_RESP_TITLE, HTTP_RESP_SIZE, HTTP_RESP_LENGTH, HTTP_RESP_REDIRECT, HTTP_RESP_HEADERS_CRC, HTTP_RESP_CONTENT_CRC, HTTP_RESP_HEADERS_OPT, HTTP_RESP_CONTENT_OPT] + priority = list(response_dict.keys()) + # 根据priority列表中元素的索引进行排序 使用lambda函数来提供排序依据, + sorted_actions = sorted(actions_dict.keys(), key=lambda x: priority.index(x)) + print(f"sorted_actions:{sorted_actions}") + for ac_type in sorted_actions: + if ac_type in response_dict.keys(): + if any(str(keyword) in str(response_dict[ac_type]) for keyword in actions_dict[ac_type]): + return True + return False \ No newline at end of file diff --git a/libs/lib_url_analysis/parse_words.py b/libs/lib_url_analysis/parse_words.py index 1538f8b..8d8b12d 100644 --- a/libs/lib_url_analysis/parse_words.py +++ b/libs/lib_url_analysis/parse_words.py @@ -7,18 +7,6 @@ from libs.lib_log_print.logger_printer import output, LOG_ERROR -def list_ele_in_str(list_=None, str_=None, default=True): - flag = False - if list_: - for ele in list_: - if ele in str_: - flag = True - break - else: - flag = default - return flag - - def get_path_words(url, symbol_replace_dict=None, not_allowed_symbol=None): # 获取URL目录中的单词 if not_allowed_symbol is None: @@ -46,7 +34,8 @@ def get_path_words(url, symbol_replace_dict=None, not_allowed_symbol=None): if not_allowed_symbol: tmp_words_list = [] for word in path_words_list: - if not list_ele_in_str(not_allowed_symbol, word): + # if not list_ele_in_str(not_allowed_symbol, word): + if not any(key in word for key in not_allowed_symbol): tmp_words_list.append(word) path_words_list = tmp_words_list @@ -102,7 +91,8 @@ def get_domain_words(url, ignore_ip_format=True, symbol_replace_dict={}, not_all if not_allowed_symbol: tmp_list = [] for domain_val in real_domain_val_list: - if not list_ele_in_str(not_allowed_symbol, domain_val, default=False): + # if not list_ele_in_str(not_allowed_symbol, domain_val, default=False): + if not any(key in domain_val for key in not_allowed_symbol): tmp_list.append(domain_val) real_domain_val_list = tmp_list diff --git a/setting_com.py b/setting_com.py index 52ffeef..08d2d94 100644 --- a/setting_com.py +++ b/setting_com.py @@ -20,7 +20,7 @@ def init_common(config): config[GB_RUN_TIME] = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) ################################################################## # 版本号配置 - config[GB_VERSION] = "Ver 0.5.11 2023-07-25 15:30" + config[GB_VERSION] = "Ver 0.6.0 2023-07-26 05:30" ################################################################## # 是否显示DEBUG级别信息,默认False config[GB_DEBUG_FLAG] = False