From 4f5c12b80a0ba0ce99e9d42e5823ee627c459186 Mon Sep 17 00:00:00 2001 From: "guorong.zheng" <360996299@qq.com> Date: Fri, 17 May 2024 15:19:22 +0800 Subject: [PATCH] release: v1.1.5 --- CHANGELOG.md | 10 +++ README-EN.md | 41 +++++---- README.md | 41 +++++---- config.py | 1 - docs/tutorial-EN.md | 41 +++++---- docs/tutorial.md | 5 +- main.py | 118 +++++++------------------ utils.py | 211 ++++++++++++++++++++++++++++++-------------- version.json | 2 +- 9 files changed, 251 insertions(+), 219 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d2cbbc4f9e..ded3746b18f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,15 @@ # 更新日志(Changelog) +## v1.1.5 + +### 2024/5/17 + +- 增加模糊匹配规则,适配在线检索、订阅源、组播源(Add fuzzy matching rules for online search, subscription sources, and multicast sources) +- 增加订阅源、组播源更新进度条(Added the update progress bar for subscription sources and multicast sources) +- 优化组播源更新可能出现的无匹配结果情况(Optimize the possible situation of no match results in multicast source updates) +- 移除部分错误日志打印(Removes some error log prints) +- 移除严格匹配配置(Removes strict matching configurations) + ## v1.1.4 ### 2024/5/15 diff --git a/README-EN.md b/README-EN.md index 4a59d60fd46..5e3e5e3068b 100644 --- a/README-EN.md +++ b/README-EN.md @@ -20,27 +20,26 @@ Customize channel menus and automatically obtain and update the latest live sour ## Config -| Configuration Item | Default Value | Description | -| ---------------------- | --------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| source_file | "demo.txt" | Template file name | -| final_file | "result.txt" | Generated file name | -| favorite_list | ["广东珠江","CCTV-1","CCTV-5","CCTV-5+","CCTV-13","广东体育","广东卫视","大湾区卫视","浙江卫视","湖南卫视","翡翠台"] | List of favorite channel names (used only to distinguish from regular channels, custom page retrieval quantity) | -| open_online_search | True | Enable online search source feature | -| favorite_page_num | 5 | Page retrieval quantity for favorite channels | -| default_page_num | 3 | Page retrieval quantity for regular channels | -| urls_limit | 10 | Number of interfaces per channel | -| open_sort | True | Enable the sorting test function, it is recommended to turn it off if you are not using online search | -| response_time_weight | 0.5 | Response time weight value (the sum of all weight values should be 1) | -| resolution_weight | 0.5 | Resolution weight value (the sum of all weight values should be 1) | -| recent_days | 30 | Retrieve interfaces updated within a recent time range (in days), reducing appropriately can avoid matching issues | -| ipv_type | "ipv4" | The type of interface in the generated result, optional values: "ipv4", "ipv6", "all" | -| domain_blacklist | ["epg.pw"] | Interface domain blacklist, used to filter out interfaces with low-quality, ad-inclusive domains | -| url_keywords_blacklist | [] | Interface keyword blacklist, used to filter out interfaces containing specific characters | -| open_subscribe | True | Enable subscription source feature | -| subscribe_urls | ["https://m3u.ibert.me/txt/fmml_dv6.txt",
"https://m3u.ibert.me/txt/o_cn.txt",
"https://m3u.ibert.me/txt/j_iptv.txt"] | Subscription source list | -| open_multicast | True | Enable multicast source function | -| region_list | ["广东"] | Multicast source region list, for more regions please see the fofa_map.py file | -| strict_match | False | Strict matching, when enabled, can minimize the issue of channel interface mismatch to the greatest extent, but at the same time, some fuzzy matching results may be lost | +| Configuration Item | Default Value | Description | +| ---------------------- | --------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------ | +| source_file | "demo.txt" | Template file name | +| final_file | "result.txt" | Generated file name | +| favorite_list | ["广东珠江","CCTV-1","CCTV-5","CCTV-5+","CCTV-13","广东体育","广东卫视","大湾区卫视","浙江卫视","湖南卫视","翡翠台"] | List of favorite channel names (used only to distinguish from regular channels, custom page retrieval quantity) | +| open_online_search | True | Enable online search source feature | +| favorite_page_num | 5 | Page retrieval quantity for favorite channels | +| default_page_num | 3 | Page retrieval quantity for regular channels | +| urls_limit | 10 | Number of interfaces per channel | +| open_sort | True | Enable the sorting function (response speed, date, resolution), or turn it off if it takes a long time to execute | +| response_time_weight | 0.5 | Response time weight value (the sum of all weight values should be 1) | +| resolution_weight | 0.5 | Resolution weight value (the sum of all weight values should be 1) | +| recent_days | 30 | Retrieve interfaces updated within a recent time range (in days), reducing appropriately can avoid matching issues | +| ipv_type | "ipv4" | The type of interface in the generated result, optional values: "ipv4", "ipv6", "all" | +| domain_blacklist | ["epg.pw"] | Interface domain blacklist, used to filter out interfaces with low-quality, ad-inclusive domains | +| url_keywords_blacklist | [] | Interface keyword blacklist, used to filter out interfaces containing specific characters | +| open_subscribe | True | Enable subscription source feature | +| subscribe_urls | ["https://m3u.ibert.me/txt/fmml_dv6.txt",
"https://m3u.ibert.me/txt/o_cn.txt",
"https://m3u.ibert.me/txt/j_iptv.txt"] | Subscription source list | +| open_multicast | True | Enable multicast source function | +| region_list | ["广东"] | Multicast source region list, [more regions](./fofa_map.py) | ## Quick Start diff --git a/README.md b/README.md index 4e30db5e95e..70ff3c28be5 100644 --- a/README.md +++ b/README.md @@ -20,27 +20,26 @@ ## 配置 -| 配置项 | 默认值 | 描述 | -| ---------------------- | --------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------- | -| source_file | "demo.txt" | 模板文件名称 | -| final_file | "result.txt" | 生成文件名称 | -| favorite_list | ["广东珠江","CCTV-1","CCTV-5","CCTV-5+","CCTV-13","广东体育","广东卫视","大湾区卫视","浙江卫视","湖南卫视","翡翠台"] | 关注频道名称列表(仅用于与常规频道区分,自定义获取分页数量) | -| open_online_search | True | 开启线上检索源功能 | -| favorite_page_num | 5 | 关注频道获取分页数量 | -| default_page_num | 3 | 常规频道获取分页数量 | -| urls_limit | 10 | 单个频道接口数量 | -| open_sort | True | 开启排序测试功能,若没有使用线上检索建议关闭 | -| response_time_weight | 0.5 | 响应时间权重值(所有权重值总和应为 1) | -| resolution_weight | 0.5 | 分辨率权重值 (所有权重值总和应为 1) | -| recent_days | 30 | 获取最近时间范围内更新的接口(单位天),适当减小可避免出现匹配问题 | -| ipv_type | "ipv4" | 生成结果中接口的类型,可选值:"ipv4"、"ipv6"、"all" | -| domain_blacklist | ["epg.pw"] | 接口域名黑名单,用于过滤低质量含广告类域名的接口 | -| url_keywords_blacklist | [] | 接口关键字黑名单,用于过滤含特定字符的接口 | -| open_subscribe | True | 开启订阅源功能 | -| subscribe_urls | ["https://m3u.ibert.me/txt/fmml_dv6.txt",
"https://m3u.ibert.me/txt/o_cn.txt",
"https://m3u.ibert.me/txt/j_iptv.txt"] | 订阅源列表 | -| open_multicast | True | 开启组播源功能 | -| region_list | ["广东"] | 组播源地区列表,更多地区请见 fofa_map.py 文件 | -| strict_match | False | 严格匹配,开启可最大程度减少频道接口不匹配问题,同时会丢失部分模糊匹配结果 | +| 配置项 | 默认值 | 描述 | +| ---------------------- | --------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------- | +| source_file | "demo.txt" | 模板文件名称 | +| final_file | "result.txt" | 生成文件名称 | +| favorite_list | ["广东珠江","CCTV-1","CCTV-5","CCTV-5+","CCTV-13","广东体育","广东卫视","大湾区卫视","浙江卫视","湖南卫视","翡翠台"] | 关注频道名称列表(仅用于与常规频道区分,自定义获取分页数量) | +| open_online_search | True | 开启线上检索源功能 | +| favorite_page_num | 5 | 关注频道获取分页数量 | +| default_page_num | 3 | 常规频道获取分页数量 | +| urls_limit | 10 | 单个频道接口数量 | +| open_sort | True | 开启排序功能(响应速度、日期、分辨率),若更执行时间较长可关闭此功能 | +| response_time_weight | 0.5 | 响应时间权重值(所有权重值总和应为 1) | +| resolution_weight | 0.5 | 分辨率权重值 (所有权重值总和应为 1) | +| recent_days | 30 | 获取最近时间范围内更新的接口(单位天),适当减小可避免出现匹配问题 | +| ipv_type | "ipv4" | 生成结果中接口的类型,可选值:"ipv4"、"ipv6"、"all" | +| domain_blacklist | ["epg.pw"] | 接口域名黑名单,用于过滤低质量含广告类域名的接口 | +| url_keywords_blacklist | [] | 接口关键字黑名单,用于过滤含特定字符的接口 | +| open_subscribe | True | 开启订阅源功能 | +| subscribe_urls | ["https://m3u.ibert.me/txt/fmml_dv6.txt",
"https://m3u.ibert.me/txt/o_cn.txt",
"https://m3u.ibert.me/txt/j_iptv.txt"] | 订阅源列表 | +| open_multicast | True | 开启组播源功能 | +| region_list | ["广东"] | 组播源地区列表,[更多地区](./fofa_map.py) | ## 快速上手 diff --git a/config.py b/config.py index 00fbfb71c96..d9b745d1a55 100644 --- a/config.py +++ b/config.py @@ -32,4 +32,3 @@ ] open_multicast = True region_list = ["广东"] -strict_match = False diff --git a/docs/tutorial-EN.md b/docs/tutorial-EN.md index ba6aa53c0f6..f5ed0287c00 100644 --- a/docs/tutorial-EN.md +++ b/docs/tutorial-EN.md @@ -57,27 +57,26 @@ Similar to editing the template, modify the running configuration Adjust the configuration as needed. Below is the default configuration explanation: -| Configuration Item | Default Value | Description | -| ---------------------- | --------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| source_file | "demo.txt" | Template file name | -| final_file | "result.txt" | Generated file name | -| favorite_list | ["广东珠江","CCTV-1","CCTV-5","CCTV-5+","CCTV-13","广东体育","广东卫视","大湾区卫视","浙江卫视","湖南卫视","翡翠台"] | List of favorite channel names (used only to distinguish from regular channels, custom page retrieval quantity) | -| open_online_search | True | Enable online search source feature | -| favorite_page_num | 5 | Page retrieval quantity for favorite channels | -| default_page_num | 3 | Page retrieval quantity for regular channels | -| urls_limit | 10 | Number of interfaces per channel | -| open_sort | True | Enable the sorting test function, it is recommended to turn it off if you are not using online search | -| response_time_weight | 0.5 | Response time weight value (the sum of all weight values should be 1) | -| resolution_weight | 0.5 | Resolution weight value (the sum of all weight values should be 1) | -| recent_days | 30 | Retrieve interfaces updated within a recent time range (in days), reducing appropriately can avoid matching issues | -| ipv_type | "ipv4" | The type of interface in the generated result, optional values: "ipv4", "ipv6", "all" | -| domain_blacklist | ["epg.pw"] | Interface domain blacklist, used to filter out interfaces with low-quality, ad-inclusive domains | -| url_keywords_blacklist | [] | Interface keyword blacklist, used to filter out interfaces containing specific characters | -| open_subscribe | True | Enable subscription source feature | -| subscribe_urls | ["https://m3u.ibert.me/txt/fmml_dv6.txt",
"https://m3u.ibert.me/txt/o_cn.txt",
"https://m3u.ibert.me/txt/j_iptv.txt"] | Subscription source list | -| open_multicast | True | Enable multicast source function | -| region_list | ["广东"] | Multicast source region list, for more regions please see the fofa_map.py file | -| strict_match | False | Strict matching, when enabled, can minimize the issue of channel interface mismatch to the greatest extent, but at the same time, some fuzzy matching results may be lost | +| Configuration Item | Default Value | Description | +| ---------------------- | --------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------ | +| source_file | "demo.txt" | Template file name | +| final_file | "result.txt" | Generated file name | +| favorite_list | ["广东珠江","CCTV-1","CCTV-5","CCTV-5+","CCTV-13","广东体育","广东卫视","大湾区卫视","浙江卫视","湖南卫视","翡翠台"] | List of favorite channel names (used only to distinguish from regular channels, custom page retrieval quantity) | +| open_online_search | True | Enable online search source feature | +| favorite_page_num | 5 | Page retrieval quantity for favorite channels | +| default_page_num | 3 | Page retrieval quantity for regular channels | +| urls_limit | 10 | Number of interfaces per channel | +| open_sort | True | Enable the sorting function (response speed, date, resolution), or turn it off if it takes a long time to execute | +| response_time_weight | 0.5 | Response time weight value (the sum of all weight values should be 1) | +| resolution_weight | 0.5 | Resolution weight value (the sum of all weight values should be 1) | +| recent_days | 30 | Retrieve interfaces updated within a recent time range (in days), reducing appropriately can avoid matching issues | +| ipv_type | "ipv4" | The type of interface in the generated result, optional values: "ipv4", "ipv6", "all" | +| domain_blacklist | ["epg.pw"] | Interface domain blacklist, used to filter out interfaces with low-quality, ad-inclusive domains | +| url_keywords_blacklist | [] | Interface keyword blacklist, used to filter out interfaces containing specific characters | +| open_subscribe | True | Enable subscription source feature | +| subscribe_urls | ["https://m3u.ibert.me/txt/fmml_dv6.txt",
"https://m3u.ibert.me/txt/o_cn.txt",
"https://m3u.ibert.me/txt/j_iptv.txt"] | Subscription source list | +| open_multicast | True | Enable multicast source function | +| region_list | ["广东"] | Multicast source region list, [more regions](./fofa_map.py) | ## Step 4: Run Updates Locally (Recommended, Stable, Supports a large number of channel updates) diff --git a/docs/tutorial.md b/docs/tutorial.md index 821db14a104..3166003ac76 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -65,7 +65,7 @@ | favorite_page_num | 5 | 关注频道获取分页数量 | | default_page_num | 3 | 常规频道获取分页数量 | | urls_limit | 10 | 单个频道接口数量 | -| open_sort | True | 开启排序测试功能,若没有使用线上检索建议关闭 | +| open_sort | True | 开启排序功能(响应速度、日期、分辨率),若更执行时间较长可关闭此功能 | | response_time_weight | 0.5 | 响应时间权重值(所有权重值总和应为 1) | | resolution_weight | 0.5 | 分辨率权重值 (所有权重值总和应为 1) | | recent_days | 30 | 获取最近时间范围内更新的接口(单位天),适当减小可避免出现匹配问题 | @@ -75,8 +75,7 @@ | open_subscribe | True | 开启订阅源功能 | | subscribe_urls | ["https://m3u.ibert.me/txt/fmml_dv6.txt",
"https://m3u.ibert.me/txt/o_cn.txt",
"https://m3u.ibert.me/txt/j_iptv.txt"] | 订阅源列表 | | open_multicast | True | 开启组播源功能 | -| region_list | ["广东"] | 组播源地区列表,更多地区请见 fofa_map.py 文件 | -| strict_match | False | 严格匹配,开启可最大程度减少频道接口不匹配问题,同时会丢失部分模糊匹配结果 | +| region_list | ["广东"] | 组播源地区列表,[更多地区](./fofa_map.py) | ## 步骤四:本地运行更新(推荐,稳定,支持大量频道更新) diff --git a/main.py b/main.py index b7cdd7c86f5..d18855c4a24 100644 --- a/main.py +++ b/main.py @@ -3,28 +3,25 @@ except ImportError: import config from selenium import webdriver -from selenium.webdriver.common.by import By -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC # from selenium_stealth import stealth import asyncio -from bs4 import BeautifulSoup from utils import ( getChannelItems, updateChannelUrlsTxt, updateFile, - getResultsFromSoup, sortUrlsBySpeedAndResolution, - getTotalUrls, + getTotalUrlsFromInfoList, + getTotalUrlsFromSortedData, filterUrlsByPatterns, useAccessibleUrl, - getChannelsByExtendBaseUrls, + getChannelsBySubscribeUrls, checkUrlByPatterns, getFOFAUrlsFromRegionList, getChannelsByFOFA, mergeObjects, - getTotalUrlsFromInfoList, + getChannelsInfoListByOnlineSearch, + formatChannelName, ) import logging from logging.handlers import RotatingFileHandler @@ -71,9 +68,8 @@ async def visitPage(self, channelItems): name for _, channelObj in channelItems.items() for name in channelObj.keys() ] if config.open_subscribe: - extendResults = await getChannelsByExtendBaseUrls(channelNames) + extendResults = await getChannelsBySubscribeUrls(channelNames) if config.open_multicast: - print(f"Getting channels by FOFA...") fofa_urls = getFOFAUrlsFromRegionList() fofa_results = {} for url in fofa_urls: @@ -88,7 +84,6 @@ async def visitPage(self, channelItems): total_channels = len(channelNames) pbar = tqdm(total=total_channels) pageUrl = await useAccessibleUrl() if config.open_online_search else None - wait = WebDriverWait(self.driver, 10) for cate, channelObj in channelItems.items(): channelUrls = {} channelObjKeys = channelObj.keys() @@ -96,95 +91,46 @@ async def visitPage(self, channelItems): pbar.set_description( f"Processing {name}, {total_channels - pbar.n} channels remaining" ) + format_name = formatChannelName(name) info_list = [] if config.open_subscribe: - for url, date, resolution in extendResults.get(name, []): + for url, date, resolution in extendResults.get(format_name, []): if url and checkUrlByPatterns(url): info_list.append((url, None, resolution)) if config.open_multicast: - for url in fofa_results.get(name, []): + for url in fofa_results.get(format_name, []): if url and checkUrlByPatterns(url): info_list.append((url, None, None)) if config.open_online_search and pageUrl: - self.driver.get(pageUrl) - search_box = wait.until( - EC.presence_of_element_located( - (By.XPATH, '//input[@type="text"]') - ) - ) - search_box.clear() - search_box.send_keys(name) - submit_button = wait.until( - EC.element_to_be_clickable( - (By.XPATH, '//input[@type="submit"]') - ) - ) - self.driver.execute_script("arguments[0].click();", submit_button) - isFavorite = name in config.favorite_list - pageNum = ( - config.favorite_page_num - if isFavorite - else config.default_page_num + online_info_list = getChannelsInfoListByOnlineSearch( + self.driver, pageUrl, format_name ) - for page in range(1, pageNum + 1): - try: - if page > 1: - page_link = wait.until( - EC.element_to_be_clickable( - ( - By.XPATH, - f'//a[contains(@href, "={page}") and contains(@href, "{name}")]', - ) - ) - ) - self.driver.execute_script( - "arguments[0].click();", page_link - ) - source = re.sub( - r"", - "", - self.driver.page_source, - flags=re.DOTALL, - ) - soup = BeautifulSoup(source, "html.parser") - if soup: - results = getResultsFromSoup(soup, name) - for result in results: - url, date, resolution = result - if url and checkUrlByPatterns(url): - info_list.append((url, date, resolution)) - except Exception as e: - print(f"Error on page {page}: {e}") - continue + if online_info_list: + info_list.extend(online_info_list) try: + channelUrls[name] = filterUrlsByPatterns( + getTotalUrlsFromInfoList(info_list) + ) github_actions = os.environ.get("GITHUB_ACTIONS") - if not github_actions or ( - pbar.n <= 200 and github_actions == "true" + if ( + config.open_sort + and not github_actions + or (pbar.n <= 200 and github_actions == "true") ): - if config.open_sort: - sorted_data = await sortUrlsBySpeedAndResolution(info_list) - if sorted_data: - channelUrls[name] = getTotalUrls(sorted_data) - for ( - url, - date, - resolution, - ), response_time in sorted_data: - logging.info( - f"Name: {name}, URL: {url}, Date: {date}, Resolution: {resolution}, Response Time: {response_time}ms" - ) - else: - channelUrls[name] = filterUrlsByPatterns( - channelObj[name] + sorted_data = await sortUrlsBySpeedAndResolution(info_list) + if sorted_data: + channelUrls[name] = getTotalUrlsFromSortedData(sorted_data) + for ( + url, + date, + resolution, + ), response_time in sorted_data: + logging.info( + f"Name: {name}, URL: {url}, Date: {date}, Resolution: {resolution}, Response Time: {response_time}ms" ) - else: - channelUrls[name] = filterUrlsByPatterns( - getTotalUrlsFromInfoList(info_list) - ) - else: + if len(channelUrls[name]) == 0: channelUrls[name] = filterUrlsByPatterns(channelObj[name]) - except Exception as e: - print(f"Error on sorting: {e}") + except: continue finally: pbar.update() diff --git a/utils.py b/utils.py index 782098078ce..2b7fa3410d4 100644 --- a/utils.py +++ b/utils.py @@ -13,9 +13,56 @@ from urllib.parse import urlparse import requests import re +from bs4 import BeautifulSoup from bs4 import NavigableString import fofa_map from collections import defaultdict +from tqdm import tqdm +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC + + +def formatChannelName(name): + """ + Format the channel name with sub and replace and lower + """ + sub_pattern = ( + r"-|_|\((.*?)\)|\[(.*?)\]| |频道|标清|高清|HD|hd|超清|超高|超高清|中央|央视|台" + ) + name = re.sub(sub_pattern, "", name) + name = name.replace("plus", "+") + name = name.replace("PLUS", "+") + name = name.replace("+", "+") + name = name.replace("CCTV1综合", "CCTV1") + name = name.replace("CCTV2财经", "CCTV2") + name = name.replace("CCTV3综艺", "CCTV3") + name = name.replace("CCTV4国际", "CCTV4") + name = name.replace("CCTV4中文国际", "CCTV4") + name = name.replace("CCTV4欧洲", "CCTV4") + name = name.replace("CCTV5体育", "CCTV5") + name = name.replace("CCTV5+体育赛视", "CCTV5+") + name = name.replace("CCTV5+体育赛事", "CCTV5+") + name = name.replace("CCTV5+体育", "CCTV5+") + name = name.replace("CCTV6电影", "CCTV6") + name = name.replace("CCTV7军事", "CCTV7") + name = name.replace("CCTV7军农", "CCTV7") + name = name.replace("CCTV7农业", "CCTV7") + name = name.replace("CCTV7国防军事", "CCTV7") + name = name.replace("CCTV8电视剧", "CCTV8") + name = name.replace("CCTV9记录", "CCTV9") + name = name.replace("CCTV9纪录", "CCTV9") + name = name.replace("CCTV10科教", "CCTV10") + name = name.replace("CCTV11戏曲", "CCTV11") + name = name.replace("CCTV12社会与法", "CCTV12") + name = name.replace("CCTV13新闻", "CCTV13") + name = name.replace("CCTV新闻", "CCTV13") + name = name.replace("CCTV14少儿", "CCTV14") + name = name.replace("CCTV15音乐", "CCTV15") + name = name.replace("CCTV16奥林匹克", "CCTV16") + name = name.replace("CCTV17农业农村", "CCTV17") + name = name.replace("CCTV17农业", "CCTV17") + return name.lower() def getChannelItems(): @@ -52,16 +99,19 @@ def getChannelItems(): return channels -async def getChannelsByExtendBaseUrls(channel_names): +async def getChannelsBySubscribeUrls(channel_names): """ - Get the channels by extending the base urls + Get the channels by subscribe urls """ channels = {} pattern = r"^(.*?),(?!#genre#)(.*?)$" - sub_pattern = r"_\((.*?)\)|_\[(.*?)\]|频道" + subscribe_urls_len = len(config.subscribe_urls) + pbar = tqdm(total=subscribe_urls_len) for base_url in config.subscribe_urls: try: - print(f"Processing extend base url: {base_url}") + pbar.set_description( + f"Processing subscribe {base_url}, {subscribe_urls_len - pbar.n} urls remaining" + ) try: response = requests.get(base_url, timeout=30) except requests.exceptions.Timeout: @@ -70,7 +120,6 @@ async def getChannelsByExtendBaseUrls(channel_names): content = response.text if content: lines = content.split("\n") - link_dict = {} for line in lines: if re.match(pattern, line) is not None: key = re.match(pattern, line).group(1) @@ -80,37 +129,73 @@ async def getChannelsByExtendBaseUrls(channel_names): if resolution_match is not None else None ) - key = re.sub(sub_pattern, "", key).lower() + key = formatChannelName(key) url = re.match(pattern, line).group(2) value = (url, None, resolution) - if key in link_dict: - if value not in link_dict[key]: - link_dict[key].append(value) + if key in channels: + if value not in channels[key]: + channels[key].append(value) else: - link_dict[key] = [value] - found_channels = [] - for channel_name in channel_names: - sub_channel_name = ( - channel_name.lower() - if config.strict_match - else re.sub(sub_pattern, "", channel_name).lower() - ) - values = link_dict.get(sub_channel_name) - if values: - if channel_name in channels: - channels[channel_name] += values - else: - channels[channel_name] = values - found_channels.append(channel_name) - if found_channels: - print(f"{base_url} found channels: {','.join(found_channels)}") + channels[key] = [value] except Exception as e: print(f"Error on {base_url}: {e}") continue - print("Finished processing extend base urls") + finally: + pbar.update() + print("Finished processing subscribe urls") + pbar.close() return channels +def getChannelsInfoListByOnlineSearch(driver, pageUrl, name): + """ + Get the channels info list by online search + """ + wait = WebDriverWait(driver, 10) + driver.get(pageUrl) + search_box = wait.until( + EC.presence_of_element_located((By.XPATH, '//input[@type="text"]')) + ) + search_box.clear() + search_box.send_keys(name) + submit_button = wait.until( + EC.element_to_be_clickable((By.XPATH, '//input[@type="submit"]')) + ) + driver.execute_script("arguments[0].click();", submit_button) + isFavorite = name in config.favorite_list + pageNum = config.favorite_page_num if isFavorite else config.default_page_num + info_list = [] + for page in range(1, pageNum + 1): + try: + if page > 1: + page_link = wait.until( + EC.element_to_be_clickable( + ( + By.XPATH, + f'//a[contains(@href, "={page}") and contains(@href, "{name}")]', + ) + ) + ) + driver.execute_script("arguments[0].click();", page_link) + source = re.sub( + r"", + "", + driver.page_source, + flags=re.DOTALL, + ) + soup = BeautifulSoup(source, "html.parser") + if soup: + results = getResultsFromSoup(soup, name) + for result in results: + url, date, resolution = result + if url and checkUrlByPatterns(url): + info_list.append((url, date, resolution)) + except Exception as e: + # print(f"Error on page {page}: {e}") + continue + return info_list + + def updateChannelUrlsTxt(cate, channelUrls): """ Update the category and channel urls to the final file @@ -168,18 +253,6 @@ def getChannelInfo(element): return date, resolution -def checkNameMatch(name, result_name): - pattern = r"[a-zA-Z]+[_\-+]|cctv" - if re.search( - pattern, - result_name, - re.IGNORECASE, - ): - return name.lower() == result_name.lower() - else: - return True - - def getResultsFromSoup(soup, name): """ Get the results from the soup @@ -194,7 +267,7 @@ def getResultsFromSoup(soup, name): name_element = url_element.find_previous_sibling() if name_element: channel_name = name_element.get_text(strip=True) - if checkNameMatch(name, channel_name): + if name == formatChannelName(channel_name): info_element = url_element.find_next_sibling() date, resolution = getChannelInfo(info_element) results.append((url, date, resolution)) @@ -267,49 +340,49 @@ def filterByDate(data): """ Filter by date and limit """ - default_recent_days = 60 - use_recent_days = getattr(config, "recent_days", 60) - if ( - not isinstance(use_recent_days, int) - or use_recent_days <= 0 - or use_recent_days > 365 - ): + default_recent_days = 30 + use_recent_days = getattr(config, "recent_days", 30) + if not isinstance(use_recent_days, int) or use_recent_days <= 0: use_recent_days = default_recent_days start_date = datetime.datetime.now() - datetime.timedelta(days=use_recent_days) recent_data = [] unrecent_data = [] for (url, date, resolution), response_time in data: + item = ((url, date, resolution), response_time) if date: date = datetime.datetime.strptime(date, "%m-%d-%Y") if date >= start_date: - recent_data.append(((url, date, resolution), response_time)) + recent_data.append(item) else: - unrecent_data.append(((url, date, resolution), response_time)) - if len(recent_data) < config.urls_limit: + unrecent_data.append(item) + else: + unrecent_data.append(item) + recent_data_len = len(recent_data) + if recent_data_len == 0: + recent_data = unrecent_data + elif recent_data_len < config.urls_limit: recent_data.extend(unrecent_data[: config.urls_limit - len(recent_data)]) - return recent_data[: config.urls_limit] + return recent_data -def getTotalUrls(data): +def getTotalUrlsFromInfoList(infoList): """ - Get the total urls with filter by date and depulicate + Get the total urls from info list + """ + total_urls = [url for url, _, _ in infoList] + return list(dict.fromkeys(total_urls))[: config.urls_limit] + + +def getTotalUrlsFromSortedData(data): + """ + Get the total urls with filter by date and depulicate from sorted data """ total_urls = [] if len(data) > config.urls_limit: total_urls = [url for (url, _, _), _ in filterByDate(data)] else: total_urls = [url for (url, _, _), _ in data] - return list(dict.fromkeys(total_urls)) - - -def getTotalUrlsFromInfoList(infoList): - """ - Get the total urls from info list - """ - total_urls = [ - url for url, _, _ in infoList[: min(len(infoList), config.urls_limit)] - ] - return list(dict.fromkeys(total_urls)) + return list(dict.fromkeys(total_urls))[: config.urls_limit] def is_ipv6(url): @@ -412,8 +485,13 @@ def getChannelsByFOFA(source): """ urls = set(re.findall(r"https?://[\w\.-]+:\d+", source)) channels = {} + urls_len = len(urls) + pbar = tqdm(total=urls_len) for url in urls: try: + pbar.set_description( + f"Processing multicast {url}, {urls_len - pbar.n} urls remaining" + ) response = requests.get(url + "/iptv/live/1000.json?key=txiptv", timeout=2) try: json_data = response.json() @@ -421,7 +499,7 @@ def getChannelsByFOFA(source): try: for item in json_data["data"]: if isinstance(item, dict): - item_name = item.get("name").strip() + item_name = formatChannelName(item.get("name")) item_url = item.get("url").strip() if item_name and item_url: total_url = url + item_url @@ -438,6 +516,9 @@ def getChannelsByFOFA(source): except Exception as e: # print(f"{url}: {e}") continue + finally: + pbar.update() + pbar.close() return channels diff --git a/version.json b/version.json index 8c0f63aecc1..e613e8adaa6 100644 --- a/version.json +++ b/version.json @@ -1,3 +1,3 @@ { - "version": "1.1.4" + "version": "1.1.5" } \ No newline at end of file