diff --git a/.gitignore b/.gitignore
index f564cdb9..6f0dbc60 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,7 @@ __pycache__/
 /dist/
 /snscrape.egg-info/
 /.eggs/
+/docs/_build/**
+/docs/_autosummary/**
+.vscode/
+venv/
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 00000000..d4bb2cbb
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/_templates/custom-class-template.rst b/docs/_templates/custom-class-template.rst
new file mode 100644
index 00000000..df448553
--- /dev/null
+++ b/docs/_templates/custom-class-template.rst
@@ -0,0 +1,32 @@
+{{ fullname | escape | underline}}
+
+.. currentmodule:: {{ module }}
+
+.. autoclass:: {{ objname }}
+   :members:
+   :show-inheritance:
+   :inherited-members:
+
+   {% block methods %}
+
+
+   {% if methods %}
+   .. rubric:: {{ _('Methods') }}
+
+   .. autosummary::
+   {% for item in methods %}
+      ~{{ name }}.{{ item }}
+   {%- endfor %}
+   {% endif %}
+   {% endblock %}
+
+   {% block attributes %}
+   {% if attributes %}
+   .. rubric:: {{ _('Attributes') }}
+
+   .. autosummary::
+   {% for item in attributes %}
+      ~{{ name }}.{{ item }}
+   {%- endfor %}
+   {% endif %}
+   {% endblock %}
\ No newline at end of file
diff --git a/docs/_templates/custom-module-template.rst b/docs/_templates/custom-module-template.rst
new file mode 100644
index 00000000..a52dafcc
--- /dev/null
+++ b/docs/_templates/custom-module-template.rst
@@ -0,0 +1,66 @@
+{{ fullname | escape | underline}}
+
+.. automodule:: {{ fullname }}
+
+   {% block attributes %}
+   {% if attributes %}
+   .. rubric:: Module Attributes
+
+   .. autosummary::
+      :toctree:
+   {% for item in attributes %}
+      {{ item }}
+   {%- endfor %}
+   {% endif %}
+   {% endblock %}
+
+   {% block functions %}
+   {% if functions %}
+   .. rubric:: {{ _('Functions') }}
+
+   .. autosummary::
+      :toctree:
+   {% for item in functions %}
+      {{ item }}
+   {%- endfor %}
+   {% endif %}
+   {% endblock %}
+
+   {% block classes %}
+   {% if classes %}
+   .. rubric:: {{ _('Classes') }}
+
+   .. autosummary::
+      :toctree:
+      :template: custom-class-template.rst
+   {% for item in classes %}
+      {{ item }}
+   {%- endfor %}
+   {% endif %}
+   {% endblock %}
+
+   {% block exceptions %}
+   {% if exceptions %}
+   .. rubric:: {{ _('Exceptions') }}
+
+   .. autosummary::
+      :toctree:
+   {% for item in exceptions %}
+      {{ item }}
+   {%- endfor %}
+   {% endif %}
+   {% endblock %}
+
+{% block modules %}
+{% if modules %}
+.. rubric:: Modules
+
+.. autosummary::
+   :toctree:
+   :template: custom-module-template.rst
+   :recursive:
+{% for item in modules %}
+   {{ item.split('.')[-1] }}
+{%- endfor %}
+{% endif %}
+{% endblock %}
\ No newline at end of file
diff --git a/docs/api-reference.rst b/docs/api-reference.rst
new file mode 100644
index 00000000..909b9978
--- /dev/null
+++ b/docs/api-reference.rst
@@ -0,0 +1,11 @@
+.. This file contains the API reference, generated via automatic discovery (autosummary).
+
+API Reference
+=============
+
+.. autosummary::
+   :toctree: _autosummary
+   :template: custom-module-template.rst
+   :recursive:
+
+   snscrape
\ No newline at end of file
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 00000000..cc7cde96
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,87 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+import os
+import sys
+sys.path.insert(0, os.path.abspath('..'))
+
+# Tools for importing snscrape at build time
+# Avoid name conflict with sphinx configuration variable "version"
+from importlib import import_module
+from importlib.metadata import metadata
+
+
+# -- Project information -----------------------------------------------------
+
+# Project name
+project = 'snscrape'
+
+# Metadata
+_metadata = metadata(project)
+
+# Version in format 0.4.0.20211208
+release = _metadata['version']
+author = _metadata['author']
+
+_major, _minor, _patch, _yyyymmdd = release.split('.')
+
+YEAR = _yyyymmdd[0:4]
+copyright = f'{YEAR}, {author}'
+
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'sphinx.ext.napoleon',
+    'sphinx.ext.autodoc',
+    'sphinx.ext.autosummary',
+    # 'sphinx_autodoc_typehints'
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+# -- Custom extension options ------------------------------------------------
+
+# Put type hints in the description instead of the signature
+# Note: the docstrings are overridden if autodoc_typehints is used
+autodoc_typehints = 'description'
+
+# Set 'both' to use both class and __init__ docstrings.
+autoclass_content = 'both'
+
+# Might want to look at it:
+# https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html#confval-autodoc_type_aliases
+# autodoc_type_aliases = {}
+
+# Turn on autosummary
+autosummary_generate = True
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'nature'
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
\ No newline at end of file
diff --git a/docs/google.mustache b/docs/google.mustache
new file mode 100644
index 00000000..56ef74a8
--- /dev/null
+++ b/docs/google.mustache
@@ -0,0 +1,30 @@
+{{! Modified Google Docstring Template }}
+{{summaryPlaceholder}}
+{{extendedSummaryPlaceholder}}
+{{#parametersExist}}
+Args:
+{{#args}}
+    {{var}}: {{descriptionPlaceholder}}
+{{/args}}
+{{#kwargs}}
+    {{var}}: {{descriptionPlaceholder}}. Defaults to {{&default}}.
+{{/kwargs}}
+{{/parametersExist}}
+{{#exceptionsExist}}
+Raises:
+{{#exceptions}}
+    {{type}}: {{descriptionPlaceholder}}
+{{/exceptions}}
+{{/exceptionsExist}}
+{{#returnsExist}}
+Returns:
+{{#returns}}
+    {{descriptionPlaceholder}}
+{{/returns}}
+{{/returnsExist}}
+{{#yieldsExist}}
+Yields:
+{{#yields}}
+    {{typePlaceholder}}: {{descriptionPlaceholder}}
+{{/yields}}
+{{/yieldsExist}}
\ No newline at end of file
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 00000000..08eb97fa
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,112 @@
+.. snscrape documentation master file, created by
+   sphinx-quickstart on Sat Dec 11 06:18:23 2021.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Welcome to snscrape's documentation!
+====================================
+
+``snscrape`` is a scraper for social networking services (SNS). It scrapes things like user profiles, hashtags, or searches and returns the discovered items, usually posts. ``snscrape`` supports several SNS:
+
+================== =======================================================
+Platform           Can scrape for items in:
+================== =======================================================
+Twitter            User profile, hashtag, search, thread, list, trending
+Instagram          User profile, hashtag, location
+Reddit             User profile, subreddit, search (via Pushshift)
+Facebook           User profile, group, community (for visitor posts)
+Telegram           Channel
+VKontakte          User profile
+Weibo (Sina Weibo) User profile
+Mastodon           User profile, thread
+================== =======================================================
+
+``snscrape`` works without logins or authentication. The drawback is that some platforms may impose limits, now or in the future, on unauthenticated requests coming from your IP address. Such IP-based limits are usually temporary.
+
+``snscrape`` can be used either from the CLI or imported as a library.
+
+CLI usage
+---------
+
+The generic syntax of snscrape's CLI is:
+
+.. code-block:: console
+
+   snscrape [GLOBAL-OPTIONS] SCRAPER-NAME [SCRAPER-OPTIONS] [SCRAPER-ARGUMENTS...]
+
+``snscrape --help`` and ``snscrape SCRAPER-NAME --help`` provide details on the options and arguments. ``snscrape --help`` also lists all available scrapers.
+
+The default output of the CLI is the URL of each result.
+
+Some noteworthy global options are:
+
+* ``--jsonl`` to get output as JSONL. This includes all information extracted by ``snscrape`` (e.g. message content, datetime, images; details vary by scraper).
+* ``--max-results NUMBER`` to only return the first ``NUMBER`` results.
+* ``--with-entity`` to get an item on the entity being scraped, e.g. the user or channel. This is not supported on all scrapers. (You can use this together with ``--max-results 0`` to only fetch the entity info.)
+
+**Examples**
+
+Collect all tweets by Jason Scott (@textfiles):
+
+.. code-block:: console
+
+   snscrape twitter-user textfiles
+
+It's usually useful to redirect the output to a file for further processing, e.g. in bash using the filename ``twitter-@textfiles``:
+
+.. code-block:: console
+
+   snscrape twitter-user textfiles >twitter-@textfiles
+
+
+To get the latest 100 tweets with the hashtag #archiveteam:
+
+.. code-block:: console
+
+   snscrape --max-results 100 twitter-hashtag archiveteam
+
+
+Library usage
+-------------
+
+The general steps are:
+
+#. **Instantiate a scraper object.**
+   ``snscrape`` provides various scraper classes, each implementing its own specific way of gathering items. For example, :class:`TwitterSearchScraper` gathers tweets via a search query, and :class:`TwitterUserScraper` gathers tweets from a specified user.
+#. **Call the scraper's** ``get_items()`` **method.**
+   ``get_items()`` is a generator and yields one item at a time.
+
+Each scraper class provides different options and arguments. Refer to the class signature for more information; in a Jupyter notebook, for example, this can be done via::
+
+   ?TwitterSearchScraper
+
+**Examples**
+
+Collect tweets by searching for "omicron variant", limit the results to the first 100 tweets, and save the results to a list:
+
+.. code-block:: python
+
+   from snscrape.modules.twitter import TwitterSearchScraper
+   scraper = TwitterSearchScraper('omicron variant')
+
+   result = []
+
+   for i, item in enumerate(scraper.get_items()):
+       result.append(item)
+       if i == 99:  # enumerate is zero-based, so this stops after 100 items
+           break
+
+API reference
+=============
+
+.. toctree::
+   :maxdepth: 5
+
+   api-reference
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
diff --git a/docs/make.bat b/docs/make.bat
new file mode 100644
index 00000000..922152e9
--- /dev/null
+++ b/docs/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=.
+set BUILDDIR=_build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/snscrape/base.py b/snscrape/base.py
index bda493fa..e32991d3 100644
--- a/snscrape/base.py
+++ b/snscrape/base.py
@@ -146,7 +146,17 @@ def __init__(self, *, retries = 3, proxies = None):
 
     @abc.abstractmethod
     def get_items(self):
-        '''Iterator yielding Items.'''
+        '''Get items according to the specifications given when instantiating this scraper.
+
+        Yields:
+            Individual items.
+
+        Note:
+            This method is a generator. The number of items is not known beforehand.
+            Keep in mind that a scrape can yield a very large number of items.
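+
+        Example:
+            A minimal consumption sketch; ``scraper`` stands in for any concrete scraper instance::
+
+                # Items are fetched lazily, page by page, as the loop advances
+                for item in scraper.get_items():
+                    print(item)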
+        '''
         pass
 
 
diff --git a/snscrape/modules/facebook.py b/snscrape/modules/facebook.py
index 6b6bbdee..01a640cd 100644
--- a/snscrape/modules/facebook.py
+++ b/snscrape/modules/facebook.py
@@ -17,6 +17,8 @@
 
 @dataclasses.dataclass
 class FacebookPost(snscrape.base.Item):
+    '''An object representing one Facebook post.'''
+
     cleanUrl: str
     dirtyUrl: str
     date: datetime.datetime
@@ -31,6 +33,8 @@ def __str__(self):
 
 @dataclasses.dataclass
 class User(snscrape.base.Entity):
+    '''An object representing one Facebook user.'''
+
     username: str
     pageId: int
     name: str
@@ -50,6 +54,8 @@ def __str__(self):
 
 
 class _FacebookCommonScraper(snscrape.base.Scraper):
+    '''Base class for all other Facebook scraper classes.'''
+
     def _clean_url(self, dirtyUrl):
         u = urllib.parse.urlparse(dirtyUrl)
         if u.path == '/permalink.php':
@@ -159,6 +165,11 @@ def _soup_to_items(self, soup, baseUrl, mode):
 
 class _FacebookUserAndCommunityScraper(_FacebookCommonScraper):
     def __init__(self, username, **kwargs):
+        '''
+        Args:
+            username: A Facebook username.
+        '''
+
         super().__init__(**kwargs)
         self._username = username
         self._headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:78.0) Gecko/20100101 Firefox/78.0', 'Accept-Language': 'en-US,en;q=0.5'}
@@ -214,6 +225,8 @@ def _cli_from_args(cls, args):
 
 
 class FacebookUserScraper(_FacebookUserAndCommunityScraper):
+    '''Scraper class designed to scrape a specific user's profile for posts.'''
+
     name = 'facebook-user'
 
     def __init__(self, *args, **kwargs):
@@ -292,6 +305,8 @@ def _get_entity(self):
 
 
 class FacebookCommunityScraper(_FacebookUserAndCommunityScraper):
+    '''Scraper class designed to collect community/visitor posts.'''
+
     name = 'facebook-community'
 
     def __init__(self, *args, **kwargs):
@@ -300,13 +315,29 @@ def __init__(self, *args, **kwargs):
 
 
 class FacebookGroupScraper(_FacebookCommonScraper):
+    '''Scraper class designed to collect posts in a Facebook group.'''
+
     name = 'facebook-group'
 
-    def __init__(self, group, **kwargs):
+    def __init__(self, group: str, **kwargs):
+        '''
+        Args:
+            group: A group name or ID.
+        '''
         super().__init__(**kwargs)
         self._group = group
 
     def get_items(self):
+        '''Get posts according to the specifications given when instantiating this scraper.
+
+        Raises:
+            snscrape.base.ScraperException: When the scrape fails (e.g. a page cannot be retrieved).
+        Yields:
+            Individual post.
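+
+        Example:
+            A minimal sketch; the group name is hypothetical::
+
+                # Print the cleaned URL of each post in the group
+                for post in FacebookGroupScraper('examplegroup').get_items():
+                    print(post.cleanUrl)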
+        '''
+
         headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36', 'Accept-Language': 'en-US,en;q=0.5'}
         pageletDataPattern = re.compile(r'"GroupEntstreamPagelet",\{.*?\}(?=,\{)')
diff --git a/snscrape/modules/instagram.py b/snscrape/modules/instagram.py
index 4a89f3d4..69aa39d8 100644
--- a/snscrape/modules/instagram.py
+++ b/snscrape/modules/instagram.py
@@ -16,6 +16,8 @@
 
 @dataclasses.dataclass
 class InstagramPost(snscrape.base.Item):
+    '''An object representing one Instagram post.'''
+
     url: str
     date: datetime.datetime
     content: typing.Optional[str]
@@ -33,6 +35,8 @@ def __str__(self):
 
 @dataclasses.dataclass
 class User(snscrape.base.Entity):
+    '''An object representing one Instagram user.'''
+
     username: str
     name: typing.Optional[str]
     followers: snscrape.base.IntWithGranularity
@@ -48,6 +52,8 @@ def __str__(self):
 
 
 class _InstagramCommonScraper(snscrape.base.Scraper):
+    '''Base class for all other Instagram scrapers.'''
+
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
         self._headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
@@ -104,6 +110,14 @@ def _check_json_callback(self, r):
         return True, None
 
     def get_items(self):
+        '''Get posts according to the specifications given when instantiating this scraper.
+
+        Raises:
+            snscrape.base.ScraperException: When the scrape fails (e.g. a page cannot be retrieved).
+        Yields:
+            Individual post.
+        '''
+
         r = self._initial_page()
         if r.status_code == 404:
             _logger.warning('Page does not exist')
@@ -143,9 +157,15 @@ def get_items(self):
 
 
 class InstagramUserScraper(_InstagramCommonScraper):
+    '''Scraper class designed to scrape posts from a specific user profile.'''
+
     name = 'instagram-user'
 
     def __init__(self, username, **kwargs):
+        '''
+        Args:
+            username: Username of the desired profile, without the @ sign.
+        '''
         super().__init__(**kwargs)
         self._initialUrl = f'https://www.instagram.com/{username}/'
         self._pageName = 'ProfilePage'
@@ -198,9 +218,15 @@ def _cli_from_args(cls, args):
 
 
 class InstagramHashtagScraper(_InstagramCommonScraper):
+    '''Scraper class designed to scrape Instagram posts by hashtag.'''
+
     name = 'instagram-hashtag'
 
     def __init__(self, hashtag, **kwargs):
+        '''
+        Args:
+            hashtag: Hashtag query, without the # sign.
+        '''
         super().__init__(**kwargs)
         self._initialUrl = f'https://www.instagram.com/explore/tags/{hashtag}/'
         self._pageName = 'TagPage'
@@ -220,9 +246,15 @@ def _cli_from_args(cls, args):
 
 
 class InstagramLocationScraper(_InstagramCommonScraper):
+    '''Scraper class designed to scrape posts by location ID.'''
+
     name = 'instagram-location'
 
     def __init__(self, locationId, **kwargs):
+        '''
+        Args:
+            locationId: Desired location ID.
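+
+        Example:
+            The ID below is hypothetical; real IDs appear in instagram.com/explore/locations/ URLs::
+
+                scraper = InstagramLocationScraper(213385402)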
+        '''
         super().__init__(**kwargs)
         self._initialUrl = f'https://www.instagram.com/explore/locations/{locationId}/'
         self._pageName = 'LocationsPage'
diff --git a/snscrape/modules/mastodon.py b/snscrape/modules/mastodon.py
index cfe69e43..e04f0da3 100644
--- a/snscrape/modules/mastodon.py
+++ b/snscrape/modules/mastodon.py
@@ -18,6 +18,7 @@
 
 @dataclasses.dataclass
 class Toot(snscrape.base.Item):
+    '''An object representing one toot.'''
     url: str
     id: str
     user: 'User'
@@ -36,6 +37,7 @@ def __str__(self):
 
 @dataclasses.dataclass
 class Boost(snscrape.base.Item):
+    '''An object representing a boost of a toot by a user.'''
     user: 'User'
     toot: Toot
 
@@ -46,12 +48,14 @@ def __str__(self):
 
 @dataclasses.dataclass
 class Attachment:
+    '''An object representing an attachment in a toot.'''
     url: str
     name: str
 
 
 @dataclasses.dataclass
 class Poll:
+    '''An object representing a poll.'''
     id: str
     expirationDate: datetime.datetime
     multiple: bool
@@ -62,12 +66,18 @@ class Poll:
 
 @dataclasses.dataclass
 class PollOption:
+    '''An object representing an option in a poll.'''
     title: str
     votesCount: int
 
 
 @dataclasses.dataclass
 class User(snscrape.base.Entity):
+    '''An object representing one user.
+
+    Most fields can be None if not known.
+    '''
+
     account: str  # @username@domain.invalid
     displayName: typing.Optional[str] = None
     displayNameWithCustomEmojis: typing.Optional[typing.List[typing.Union[str, 'CustomEmoji']]] = None
@@ -92,6 +102,8 @@ class CustomEmoji:
 
 
 class _MastodonCommonScraper(snscrape.base.Scraper):
+    '''Base class for all other Mastodon scraper classes.'''
+
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
         self._headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0', 'Accept-Language': 'en-US,en;q=0.5'}
@@ -243,9 +255,16 @@ def _url_to_account(url):
 
 
 class MastodonProfileScraper(_MastodonCommonScraper):
+    '''Scraper class designed to scrape toots of a specific user profile.'''
+
     name = 'mastodon-profile'
 
     def __init__(self, account, **kwargs):
+        '''
+        Args:
+            account: The desired Mastodon account, e.g. in the form ``@username@domain``.
+        '''
+
         super().__init__(**kwargs)
         if account.startswith('@') and account.count('@') == 2:
             account, domain = account[1:].split('@')
@@ -297,6 +316,7 @@ def _cli_from_args(cls, args):
 
 
 class MastodonTootScraperMode(enum.Enum):
+    '''Enumeration of Mastodon toot scraping modes.'''
    SINGLE = 'single'
    THREAD = 'thread'
 
@@ -308,9 +328,17 @@ def _cli_from_args(cls, args):
 
 
 class MastodonTootScraper(_MastodonCommonScraper):
+    '''Scraper class designed to scrape a specific toot and the thread surrounding it.'''
+
     name = 'mastodon-toot'
 
     def __init__(self, url, *, mode = MastodonTootScraperMode.SINGLE, **kwargs):
+        '''
+        Args:
+            url: URL of the desired toot.
+            mode: Whether to scrape only the single toot or the thread around it. Defaults to MastodonTootScraperMode.SINGLE.
+        '''
+
         super().__init__(**kwargs)
         self._url = url
         self._mode = mode
diff --git a/snscrape/modules/reddit.py b/snscrape/modules/reddit.py
index 32b9d6a0..27c58f4e 100644
--- a/snscrape/modules/reddit.py
+++ b/snscrape/modules/reddit.py
@@ -19,6 +19,8 @@
 
 @dataclasses.dataclass
 class Submission(snscrape.base.Item):
+    '''An object representing one Reddit submission.'''
+
     author: typing.Optional[str]  # E.g. submission hf7k6
     created: datetime.datetime
     id: str
@@ -34,6 +36,8 @@ def __str__(self):
 
 @dataclasses.dataclass
 class Comment(snscrape.base.Item):
+    '''An object representing one Reddit comment.'''
+
     author: typing.Optional[str]
     body: str
     created: datetime.datetime
@@ -72,6 +76,11 @@ def _cmp_id(id1, id2):
 
 
 class _RedditPushshiftScraper(snscrape.base.Scraper):
+    '''Base class for all other Reddit scraper classes.
+
+    Note: The Reddit scrapers retrieve their data via the Pushshift API.
+    '''
+
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
         self._headers = {'User-Agent': f'snscrape/{snscrape.version.__version__}'}
@@ -211,6 +220,13 @@ def _iter_api_submissions_and_comments(self, params: dict):
             break
 
     def get_items(self):
+        '''Get posts according to the specifications given when instantiating this scraper.
+
+        Yields:
+            Individual post (a submission or a comment).
+        '''
         yield from self._iter_api_submissions_and_comments({type(self)._apiField: self._name})
 
     @classmethod
@@ -229,18 +245,24 @@ def _cli_from_args(cls, args):
 
 
 class RedditUserScraper(_RedditPushshiftSearchScraper):
+    '''Scraper class designed to scrape posts made by a specific user.'''
+
     name = 'reddit-user'
     _validationFunc = lambda x: re.match('^[A-Za-z0-9_-]{3,20}$', x)
     _apiField = 'author'
 
 
class RedditSubredditScraper(_RedditPushshiftSearchScraper):
+    '''Scraper class designed to scrape a subreddit for posts.'''
+
     name = 'reddit-subreddit'
     _validationFunc = lambda x: re.match('^[A-Za-z0-9][A-Za-z0-9_]{2,20}$', x)
     _apiField = 'subreddit'
 
 
class RedditSearchScraper(_RedditPushshiftSearchScraper):
+    '''Scraper class designed to scrape Reddit via a search query.'''
+
     name = 'reddit-search'
     _validationFunc = lambda x: True
     _apiField = 'q'
diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py
index 11cc19bb..68dcc251 100644
--- a/snscrape/modules/telegram.py
+++ b/snscrape/modules/telegram.py
@@ -17,6 +17,8 @@
 
 @dataclasses.dataclass
 class LinkPreview:
+    '''An object representing a Telegram link preview.'''
+
     href: str
     siteName: typing.Optional[str] = None
     title: typing.Optional[str] = None
@@ -26,6 +28,8 @@ class LinkPreview:
 
 @dataclasses.dataclass
 class TelegramPost(snscrape.base.Item):
+    '''An object representing one Telegram post.'''
+
     url: str
     date: datetime.datetime
     content: str
@@ -40,6 +44,8 @@ def __str__(self):
 
 @dataclasses.dataclass
 class Channel(snscrape.base.Entity):
+    '''An object representing one Telegram channel.'''
+
     username: str
     title: str
     verified: bool
@@ -61,9 +67,16 @@ def __str__(self):
 
 
 class TelegramChannelScraper(snscrape.base.Scraper):
+    '''Scraper class designed to scrape posts in Telegram channels.'''
+
     name = 'telegram-channel'
 
     def __init__(self, name, **kwargs):
+        '''
+        Args:
+            name: Channel name.
+        '''
+
         super().__init__(**kwargs)
         self._name = name
         self._headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
@@ -128,6 +141,15 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False):
             yield TelegramPost(url = url, date = date, content = content, outlinks = outlinks, linkPreview = linkPreview)
 
     def get_items(self):
+        '''Get posts according to the specifications given when instantiating this scraper.
+
+        Raises:
+            snscrape.base.ScraperException: When the scrape fails (e.g. a page cannot be retrieved).
+        Yields:
+            Individual post.
+        '''
         r, soup = self._initial_page()
         if '/s/' not in r.url:
             _logger.warning('No public post list for this user')
diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py
index 485b0e95..bc0b59ee 100644
--- a/snscrape/modules/twitter.py
+++ b/snscrape/modules/twitter.py
@@ -41,6 +41,11 @@
 
 @dataclasses.dataclass
 class Tweet(snscrape.base.Item):
+    '''An object representing one tweet.
+
+    Most fields can be None if not known.
+    '''
+
     url: str
     date: datetime.datetime
     content: str
@@ -79,17 +84,20 @@ def __str__(self):
 
 
 class Medium:
+    '''Base class for all Twitter media objects.'''
     pass
 
 
 @dataclasses.dataclass
 class Photo(Medium):
+    '''An object representing a photo on Twitter.'''
     previewUrl: str
     fullUrl: str
 
 
 @dataclasses.dataclass
 class VideoVariant:
+    '''An object representing one variant of a Twitter video.'''
     contentType: str
     url: str
     bitrate: typing.Optional[int]
@@ -97,6 +105,7 @@ class VideoVariant:
 
 @dataclasses.dataclass
 class Video(Medium):
+    '''An object representing a video on Twitter.'''
     thumbnailUrl: str
     variants: typing.List[VideoVariant]
     duration: float
@@ -105,12 +114,14 @@ class Video(Medium):
 
 @dataclasses.dataclass
 class Gif(Medium):
+    '''An object representing a GIF on Twitter.'''
     thumbnailUrl: str
     variants: typing.List[VideoVariant]
 
 
 @dataclasses.dataclass
 class DescriptionURL:
+    '''An object representing a URL in a description.'''
     text: typing.Optional[str]
     url: str
     tcourl: str
@@ -119,12 +130,14 @@ class DescriptionURL:
 
 @dataclasses.dataclass
 class Coordinates:
+    '''An object representing a geographical coordinate on Twitter.'''
     longitude: float
     latitude: float
 
 
 @dataclasses.dataclass
 class Place:
+    '''An object representing a named place on Twitter.'''
     fullName: str
     name: str
     type: str
@@ -152,7 +165,10 @@ def __str__(self):
 
 @dataclasses.dataclass
 class User(snscrape.base.Entity):
-    # Most fields can be None if they're not known.
+    '''An object representing one user.
+
+    Most fields can be None if not known.
+    '''
 
     username: str
     id: int
@@ -186,6 +202,11 @@ def __str__(self):
 
 @dataclasses.dataclass
 class UserLabel:
+    '''An object representing a user label.
+
+    A label is a badge indicating that a Twitter account is affiliated with a government or certain other organizations.
+    '''
+
     description: str
     url: typing.Optional[str] = None
     badgeUrl: typing.Optional[str] = None
@@ -194,6 +215,8 @@ class UserLabel:
 
 @dataclasses.dataclass
 class Trend(snscrape.base.Item):
+    '''An object representing one trend, i.e. a topic which is trending.'''
+
     name: str
     domainContext: str
     metaDescription: typing.Optional[str] = None
@@ -296,6 +319,8 @@ class _TwitterAPIType(enum.Enum):
 
 
 class _TwitterAPIScraper(snscrape.base.Scraper):
+    '''Base class for all other Twitter scraper classes.'''
+
     def __init__(self, baseUrl, *, guestTokenManager = None, **kwargs):
         super().__init__(**kwargs)
         self._baseUrl = baseUrl
@@ -740,9 +765,21 @@ def _cli_construct(cls, argparseArgs, *args, **kwargs):
 
 
 class TwitterSearchScraper(_TwitterAPIScraper):
+    '''Scraper class designed to scrape Twitter via a specific search query.'''
+
     name = 'twitter-search'
 
     def __init__(self, query, *, cursor = None, top = False, **kwargs):
+        '''
+        Args:
+            query: Search query. Must not be empty.
+            cursor: A pagination cursor to resume a previous scrape from. Defaults to None.
+            top: Whether to scrape the "Top" tweets instead of the "Latest" ones. Defaults to False.
+
+        Raises:
+            ValueError: When the query is empty (including whitespace-only strings).
+        '''
+
         if not query.strip():
             raise ValueError('empty query')
         super().__init__(baseUrl = 'https://twitter.com/search?' + urllib.parse.urlencode({'f': 'live', 'lang': 'en', 'q': query, 'src': 'spelling_expansion_revert_click'}), **kwargs)
@@ -760,7 +797,21 @@ def _check_scroll_response(self, r):
             return False, 'non-200 status code'
         return True, None
 
-    def get_items(self):
+    def get_items(self) -> typing.Iterator[Tweet]:
+        '''Get tweets according to the specifications given when instantiating this scraper.
+
+        Raises:
+            ValueError: When the query is empty.
+        Yields:
+            Individual tweet.
+
+        Note:
+            This method is a generator. The number of tweets is not known beforehand.
+            Keep in mind that a search can yield a very large number of tweets.
+        '''
+
         if not self._query.strip():
             raise ValueError('empty query')
         paginationParams = {
@@ -816,9 +867,19 @@ def _cli_from_args(cls, args):
 
 
 class TwitterUserScraper(TwitterSearchScraper):
+    '''Scraper class designed to scrape tweets of a specific user profile.'''
+
     name = 'twitter-user'
 
     def __init__(self, user, **kwargs):
+        '''
+        Args:
+            user: Username of the desired profile (without the @ sign) or the user's numeric ID.
+
+        Raises:
+            ValueError: When ``user`` is not a valid Twitter username.
+        '''
+
         self._isUserId = isinstance(user, int)
         if not self._isUserId and not self.is_valid_username(user):
             raise ValueError('Invalid username')
@@ -878,6 +939,12 @@ def get_items(self):
 
     @staticmethod
     def is_valid_username(s):
+        '''Check whether ``s`` is a valid Twitter username.
+
+        Args:
+            s: A candidate Twitter username.
+        '''
+
         return 1 <= len(s) <= 15 and s.strip(string.ascii_letters + string.digits + '_') == ''
 
     @classmethod
@@ -896,6 +963,8 @@ def _cli_from_args(cls, args):
 
 
 class TwitterProfileScraper(TwitterUserScraper):
+    '''Scraper class designed to scrape tweets of a specific user profile via the profile timeline rather than search.'''
+
     name = 'twitter-profile'
 
     def get_items(self):
@@ -932,9 +1001,16 @@ def get_items(self):
 
 
 class TwitterHashtagScraper(TwitterSearchScraper):
+    '''Scraper class designed to scrape Twitter posts by hashtag.'''
+
     name = 'twitter-hashtag'
 
-    def __init__(self, hashtag, **kwargs):
+    def __init__(self, hashtag: str, **kwargs):
+        '''
+        Args:
+            hashtag: Hashtag query, without the # sign.
+        '''
+
         super().__init__(f'#{hashtag}', **kwargs)
         self._hashtag = hashtag
 
@@ -948,6 +1024,7 @@ def _cli_from_args(cls, args):
 
 
 class TwitterTweetScraperMode(enum.Enum):
+    '''Enumeration of modes for :class:`TwitterTweetScraper`.'''
    SINGLE = 'single'
    SCROLL = 'scroll'
    RECURSE = 'recurse'
@@ -962,14 +1039,28 @@ def _cli_from_args(cls, args):
 
 
 class TwitterTweetScraper(_TwitterAPIScraper):
+    '''Scraper class designed to scrape a specific tweet or the thread surrounding it.'''
+
     name = 'twitter-tweet'
 
     def __init__(self, tweetId, *, mode = TwitterTweetScraperMode.SINGLE, **kwargs):
+        '''
+        Args:
+            tweetId: ID of the tweet.
+            mode: Scraping mode (single tweet, scroll, or recurse). Defaults to TwitterTweetScraperMode.SINGLE.
+        '''
+
         self._tweetId = tweetId
         self._mode = mode
         super().__init__(f'https://twitter.com/i/web/status/{self._tweetId}', **kwargs)
 
     def get_items(self):
+        '''Get tweets according to the specifications given when instantiating this scraper.
+
+        Yields:
+            Individual tweet.
+        '''
+
         paginationVariables = {
             'focalTweetId': str(self._tweetId),
             'cursor': None,
@@ -1041,9 +1132,16 @@ def _cli_from_args(cls, args):
 
 
 class TwitterListPostsScraper(TwitterSearchScraper):
+    '''Scraper class designed to scrape tweets from a Twitter list.'''
+
     name = 'twitter-list-posts'
 
     def __init__(self, listName, **kwargs):
+        '''
+        Args:
+            listName: A Twitter list ID, or a string in the form "username/listname" (replace spaces with dashes).
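+
+        Example:
+            Both accepted forms, with hypothetical values::
+
+                TwitterListPostsScraper('123456789')
+                TwitterListPostsScraper('someuser/some-list')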
+        '''
+
         super().__init__(f'list:{listName}', **kwargs)
         self._listName = listName
 
@@ -1057,12 +1155,20 @@ def _cli_from_args(cls, args):
 
 
 class TwitterTrendsScraper(_TwitterAPIScraper):
+    '''Scraper class designed to scrape Twitter trending topics.'''
+
     name = 'twitter-trends'
 
     def __init__(self, **kwargs):
        super().__init__('https://twitter.com/i/trends', **kwargs)
 
     def get_items(self):
+        '''Get trending topics on Twitter.
+
+        Yields:
+            Individual trending topic.
+        '''
+
         params = {
             'include_profile_interstitial_type': '1',
             'include_blocking': '1',
diff --git a/snscrape/modules/vkontakte.py b/snscrape/modules/vkontakte.py
index 5bed3309..b6231aa3 100644
--- a/snscrape/modules/vkontakte.py
+++ b/snscrape/modules/vkontakte.py
@@ -40,6 +40,8 @@ def _localised_datetime(tz, *args, **kwargs):
 
 @dataclasses.dataclass
 class VKontaktePost(snscrape.base.Item):
+    '''An object representing one VKontakte post.'''
+
     url: str
     date: typing.Optional[typing.Union[datetime.datetime, datetime.date]]
     content: str
@@ -54,12 +56,14 @@ def __str__(self):
 
 @dataclasses.dataclass
 class Photo:
+    '''An object representing a photo on VKontakte.'''
     variants: typing.List['PhotoVariant']
     url: typing.Optional[str] = None
 
 
 @dataclasses.dataclass
 class PhotoVariant:
+    '''An object representing one variant of a VKontakte photo.'''
     url: str
     width: int
     height: int
@@ -67,6 +71,7 @@ class PhotoVariant:
 
 @dataclasses.dataclass
 class Video:
+    '''An object representing a video on VKontakte.'''
     id: str
     list: str
     duration: int
@@ -76,6 +81,8 @@ class Video:
 
 @dataclasses.dataclass
 class User(snscrape.base.Entity):
+    '''An object representing one VKontakte user.'''
+
     username: str
     name: str
     verified: bool
@@ -98,9 +105,16 @@ def __str__(self):
 
 
 class VKontakteUserScraper(snscrape.base.Scraper):
+    '''Scraper class designed to scrape a VKontakte user profile for items.'''
+
     name = 'vkontakte-user'
 
     def __init__(self, username, **kwargs):
+        '''
+        Args:
+            username: A VKontakte username.
+        '''
+
         super().__init__(**kwargs)
         self._username = username
         self._baseUrl = f'https://vk.com/{self._username}'
diff --git a/snscrape/modules/weibo.py b/snscrape/modules/weibo.py
index 38318bdd..a4151061 100644
--- a/snscrape/modules/weibo.py
+++ b/snscrape/modules/weibo.py
@@ -15,6 +15,8 @@
 
 @dataclasses.dataclass
 class Post(snscrape.base.Item):
+    '''An object representing one post.'''
+
     url: str
     id: str
     user: typing.Optional['User']
@@ -35,6 +37,8 @@ def __str__(self):
 
 @dataclasses.dataclass
 class User(snscrape.base.Entity):
+    '''An object representing one user.'''
+
     screenname: str
     uid: int
     verified: bool
@@ -50,9 +54,15 @@ def __str__(self):
 
 
 class WeiboUserScraper(snscrape.base.Scraper):
+    '''Scraper class designed to scrape a specific Weibo user's profile for posts.'''
+
     name = 'weibo-user'
 
     def __init__(self, user, **kwargs):
+        '''
+        Args:
+            user: A Weibo username or numeric user ID.
+        '''
         super().__init__(**kwargs)
         self._user = user
         self._isUserId = isinstance(user, int)