From 2f584f55e505d2228a3ae2751b39c6e7dc78d30a Mon Sep 17 00:00:00 2001 From: SHELA Date: Tue, 29 Aug 2023 15:52:09 +0300 Subject: [PATCH] Add Photos, video thumbs and views to posts --- snscrape/modules/telegram.py | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py index 191a63f..9316b36 100644 --- a/snscrape/modules/telegram.py +++ b/snscrape/modules/telegram.py @@ -14,7 +14,7 @@ _logger = logging.getLogger(__name__) _SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$') - +_TELEGRAM_IMG_RE = re.compile(r"background-image:url\(\'(.*?)\'\)") @dataclasses.dataclass class LinkPreview: @@ -31,6 +31,9 @@ class TelegramPost(snscrape.base.Item): date: datetime.datetime content: str outlinks: list + photos: list + videoThumbs: list + views: str linkPreview: typing.Optional[LinkPreview] = None outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(self.outlinks), 'outlinks') @@ -91,6 +94,7 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False): _logger.warning(f'Possibly incorrect URL: {rawUrl!r}') url = rawUrl.replace('//t.me/', '//t.me/s/') date = datetime.datetime.strptime(dateDiv.find('time', datetime = True)['datetime'].replace('-', '', 2).replace(':', ''), '%Y%m%dT%H%M%S%z') + views = post.find("span", {"class": "tgme_widget_message_views"}).text if (message := post.find('div', class_ = 'tgme_widget_message_text')): content = message.text outlinks = [] @@ -107,6 +111,29 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False): href = urllib.parse.urljoin(pageUrl, link['href']) if href not in outlinks: outlinks.append(href) + + photos = [] + photosHtml = post.select( + "a[class='tgme_widget_message_photo_wrap grouped_media_wrap blured js-message_photo']" + ) + + if photosHtml: + for photo in photosHtml: + photos.append(_TELEGRAM_IMG_RE.search(photo.get("style")).group(1)) + else: + photosBlock = post.select_one("a[class*='tgme_widget_message_photo_wrap']") + if photosBlock: + photos.append(_TELEGRAM_IMG_RE.search(photosBlock.get("style")).group(1)) + + VideoThumbsHtml = post.select( + "i[class='tgme_widget_message_video_thumb']" + ) + + videoThumbs = [] + if VideoThumbsHtml: + for thumb in VideoThumbsHtml: + videoThumbs.append(_TELEGRAM_IMG_RE.search(thumb.get("style")).group(1)) + else: content = None outlinks = [] @@ -126,7 +153,7 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False): else: _logger.warning(f'Could not process link preview image on {url}') linkPreview = LinkPreview(**kwargs) - yield TelegramPost(url = url, date = date, content = content, outlinks = outlinks, linkPreview = linkPreview) + yield TelegramPost(url = url, date = date, content = content, outlinks = outlinks, linkPreview = linkPreview, photos = photos, videoThumbs = videoThumbs, views=views) def get_items(self): r, soup = self._initial_page()