fix get_post_by_search #1042

Open · wants to merge 1 commit into base: master
40 changes: 2 additions & 38 deletions facebook_scraper/__init__.py
@@ -5,12 +5,11 @@
import pathlib
import sys
import warnings
import pickle
from typing import Any, Dict, Iterator, Optional, Set, Union

from requests.cookies import cookiejar_from_dict

from .constants import DEFAULT_REQUESTS_TIMEOUT, DEFAULT_COOKIES_FILE_PATH
from .constants import DEFAULT_REQUESTS_TIMEOUT
from .facebook_scraper import FacebookScraper
from .fb_types import Credentials, Post, RawPost, Profile
from .utils import html_element_to_string, parse_cookie_file
@@ -300,6 +299,7 @@ def get_posts_by_search(
credentials: Optional[Credentials] = None,
**kwargs,
) -> Iterator[Post]:

"""Get posts by searching all of Facebook
Args:
word (str): The word for searching posts.
@@ -517,42 +517,6 @@ def enable_logging(level=logging.DEBUG):
logger.setLevel(level)


def use_persistent_session(email: str, password: str, cookies_file_path=DEFAULT_COOKIES_FILE_PATH):
"""Login persistently to Facebook and save cookies to a file (default: ".fb-cookies.pckl"). This is highly recommended if you want to scrape several times a day because it will keep your session alive instead of logging in every time (which can be flagged as suspicious by Facebook).

Args:
email (str): email address to login.
password (str): password to login.
cookies_file_path (str, optional): path to the file in which to save cookies. Defaults to ".fb-cookies.pckl".

Raises:
exceptions.InvalidCredentials: if the credentials are invalid.

Returns:
Boolean: True if the login was successful, False otherwise.
"""
try:
with open(cookies_file_path, "rb") as f:
cookies = pickle.load(f)
logger.debug("Loaded cookies from %s", cookies_file_path)
except FileNotFoundError:
logger.error("No cookies file found at %s", cookies_file_path)
cookies = None
try:
if not cookies:
raise exceptions.InvalidCookies()
set_cookies(cookies)
logger.debug("Successfully logged in with cookies")
except exceptions.InvalidCookies:
logger.exception("Invalid cookies, trying to login with credentials")
_scraper.login(email, password)
cookies = _scraper.session.cookies
with open(cookies_file_path, "wb") as f:
pickle.dump(cookies, f)
set_cookies(cookies)
logger.debug("Successfully logged in with credentials")


# Disable logging by default
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
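
Since this diff drops use_persistent_session and DEFAULT_COOKIES_FILE_PATH, callers who relied on cached logins would need to persist cookies themselves. Below is a minimal sketch of one way to do that, assuming set_cookies, exceptions.InvalidCookies and the module-level _scraper stay available as they are used in the removed helper; the function name and file path are hypothetical.

import pickle
import facebook_scraper as fs

COOKIES_PATH = ".fb-cookies.pckl"  # same default the removed constant provided

def login_with_cached_cookies(email, password, cookies_path=COOKIES_PATH):
    # Try to reuse a previously saved session before logging in again.
    try:
        with open(cookies_path, "rb") as f:
            fs.set_cookies(pickle.load(f))
            return
    except (FileNotFoundError, fs.exceptions.InvalidCookies):
        pass
    # Fall back to a fresh login and cache the resulting cookies.
    fs._scraper.login(email, password)
    with open(cookies_path, "wb") as f:
        pickle.dump(fs._scraper.session.cookies, f)
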
2 changes: 0 additions & 2 deletions facebook_scraper/constants.py
@@ -5,5 +5,3 @@

DEFAULT_REQUESTS_TIMEOUT = 30
DEFAULT_PAGE_LIMIT = 10

DEFAULT_COOKIES_FILE_PATH = '.fb-cookies.pckl'
130 changes: 45 additions & 85 deletions facebook_scraper/extractors.py
@@ -8,7 +8,6 @@
from typing import Any, Dict, Optional
from urllib.parse import parse_qs, urlparse
from tqdm.auto import tqdm
from collections import defaultdict

from . import utils, exceptions
from .constants import FB_BASE_URL, FB_MOBILE_BASE_URL, FB_W3_BASE_URL
@@ -87,7 +86,6 @@ class PostExtractor:
bad_json_key_regex = re.compile(r'(?P<prefix>[{,])(?P<key>\w+):')

more_url_regex = re.compile(r'(?<=…\s)<a')
has_translation_regex = re.compile(r'<span.*>Rate Translation</span>')
post_story_regex = re.compile(r'href="(\/story[^"]+)" aria')

def __init__(self, element, options, request_fn, full_post_html=None):
@@ -140,7 +138,6 @@ def make_new_post(self) -> Post:
'shared_time': None,
'shared_user_id': None,
'shared_username': None,
'shared_user_url': None,
'shared_post_url': None,
'available': None,
'comments_full': None,
@@ -273,94 +270,66 @@ def extract_text(self) -> PartialPost:

element = self.element

story_containers = element.find(".story_body_container")

has_more = self.more_url_regex.search(element.html)
if has_more and self.full_post_html:
element = self.full_post_html.find('.story_body_container', first=True)
if not element and self.full_post_html.find("div.msg", first=True):
text = self.full_post_html.find("div.msg", first=True).text
return {"text": text, "post_text": text}


texts = defaultdict(str)

for container_index, container in enumerate(story_containers):

has_translation = self.has_translation_regex.search(container.html)
if has_translation:
original = container.find('div[style="display:none"]', first=True)
translation = utils.make_html_element(
html=container.html.replace(original.html, "")
)
content_versions = [("hidden_original", original), ("translation", translation)]
else:
content_versions = [("original", container)]

nodes = element.find('p, header, span[role=presentation]')
if nodes and len(nodes) > 1:
post_text = []
shared_text = []
ended = False
index_non_header = next(
(i for i, node in enumerate(nodes) if node.tag != 'header'), 1
)
for node in nodes[index_non_header:]:
if node.tag == 'header':
ended = True

# Remove '... More'
# This button is meant to display the hidden text that is already loaded
# Not to be confused with the 'More' that opens the article in a new page
if node.tag == 'p':
node = utils.make_html_element(
html=node.html.replace('>… <', '><', 1).replace('>More<', '', 1)
)

if not ended:
post_text.append(node.text)
else:
shared_text.append(node.text)

# Separation between paragraphs
paragraph_separator = '\n\n'

for version, content in content_versions:
post_text = []
shared_text = []
nodes = content.find('p, header, span[role=presentation]')

if version == "hidden_original":
if container_index == 0:
post_text.append(content.text)
else:
shared_text.append(content.text)

elif nodes:
ended = False
index_non_header = next(
(i for i, node in enumerate(nodes) if node.tag != 'header'), 1
text = paragraph_separator.join(itertools.chain(post_text, shared_text))
post_text = paragraph_separator.join(post_text)
shared_text = paragraph_separator.join(shared_text)

original_text = None
hidden_div = element.find('div[style="display:none"]', first=True)
if hidden_div:
original_text = []
for node in hidden_div.find("p,span[role=presentation]"):
node = utils.make_html_element(
html=node.html.replace('>… <', '><', 1).replace('>More<', '', 1)
)
for node in nodes[index_non_header:]:
if node.tag == 'header' or container_index > 0:
ended = True

# Remove '... More'
# This button is meant to display the hidden text that is already loaded
# Not to be confused with the 'More' that opens the article in a new page
if node.tag == 'p':
node = utils.make_html_element(
html=node.html.replace('>… <', '><', 1).replace('>More<', '', 1)
)

if not ended:
post_text.append(node.text)
else:
shared_text.append(node.text)

text = paragraph_separator.join(itertools.chain(post_text, shared_text))
post_text = paragraph_separator.join(post_text)
shared_text = paragraph_separator.join(shared_text)

if version in ["original", "hidden_original"]:
texts["text"] += text
texts["post_text"] += post_text
texts["shared_text"] += shared_text
if version == "translation":
texts["translated_text"] += text
texts["translated_post_text"] += post_text
texts["translated_shared_text"] += shared_text

if texts:
if texts["translated_text"]:
texts["original_text"] = texts["text"]
return dict(texts)
original_text.append(node.text)
original_text = paragraph_separator.join(original_text)

return {
'text': text,
'post_text': post_text,
'shared_text': shared_text,
'original_text': original_text,
}
elif element.find(".story_body_container>div", first=True):
text = element.find(".story_body_container>div", first=True).text
return {'text': text, 'post_text': text}
elif len(nodes) == 1:
text = nodes[0].text
return {'text': text, 'post_text': text}




return None
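
For context on the "… More" cleanup kept in the rewritten extractor above: the marker strings are stripped from the node HTML so the toggle text does not leak into the extracted post text. A tiny standalone illustration, with invented sample markup:

raw = '<p><span>Post text that was truncated</span><span>… <a href="#">More</a></span></p>'
cleaned = raw.replace('>… <', '><', 1).replace('>More<', '', 1)
assert 'More' not in cleaned  # the toggle no longer contributes to node.text
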

# TODO: Add the correct timezone
@@ -1031,13 +1000,11 @@ def extract_share_information(self):
)
# We can re-use the existing parsers, as a one level deep recursion
shared_post = PostExtractor(raw_post, self.options, self.request)
shared_user_info = shared_post.extract_username()
return {
'shared_post_id': self.data_ft["original_content_id"],
'shared_time': shared_post.extract_time().get("time"),
'shared_user_id': self.data_ft["original_content_owner_id"],
'shared_username': shared_user_info.get("username"),
'shared_user_url': shared_user_info.get("user_url"),
'shared_username': shared_post.extract_username().get("username"),
'shared_post_url': shared_post.extract_post_url().get("post_url"),
}

@@ -1119,13 +1086,6 @@ def parse_comment(self, comment):
reactions = self.extract_reactions(comment_id, force_parse_HTML=True)
if comment_reactors_opt != "generator":
reactions["reactors"] = utils.safe_consume(reactions.get("reactors", []))
else:
reactions_count = comment.find('span._14va', first=True)
if reactions_count and len(reactions_count.text) > 0:
reactions_count = reactions_count.text
else:
reactions_count = None
reactions.update({"reaction_count": reactions_count})

return {
"comment_id": comment_id,
15 changes: 7 additions & 8 deletions facebook_scraper/facebook_scraper.py
@@ -52,7 +52,7 @@ class FacebookScraper:
"Accept": "*/*",
"Connection": "keep-alive",
"Accept-Encoding": "gzip,deflate",
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8",
}
have_checked_locale = False

@@ -757,10 +757,10 @@ def get_group_info(self, group, **kwargs) -> Profile:
except:
result["about"] = None

try:
url = members.find("a", first=True).attrs.get("href")
logger.debug(f"Requesting page from: {url}")
url = members.find("a", first=True).attrs.get("href")
logger.debug(f"Requesting page from: {url}")

try:
resp = self.get(url).html
url = resp.find("a[href*='listType=list_admin_moderator']", first=True)
if kwargs.get("admins", True):
@@ -959,10 +959,9 @@ def submit_form(self, response, extra_data={}):
def login(self, email: str, password: str):
response = self.get(self.base_url)

datr_cookie = re.search('(?<=_js_datr",")[^"]+', response.html.html)
if datr_cookie:
cookie_value = datr_cookie.group()
self.session.cookies.set('datr', cookie_value)
cookies_values = re.findall(r'js_datr","([^"]+)', response.html.html)
if len(cookies_values) == 1:
self.session.cookies.set("datr", cookies_values[0])

response = self.submit_form(
response, {"email": email, "pass": password, "_fb_noscript": None}
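
The login change above swaps the lookbehind search for re.findall and only sets the datr cookie when exactly one value is found. A quick sketch of the new pattern run against an invented fragment shaped like the login-page script:

import re

html = 'candidate ["_js_datr","AbCdEfGh123","",""] from the login page'
values = re.findall(r'js_datr","([^"]+)', html)
if len(values) == 1:
    print(values[0])  # AbCdEfGh123
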
7 changes: 6 additions & 1 deletion facebook_scraper/page_iterators.py
@@ -283,10 +283,15 @@ def get_next_page(self) -> Optional[URL]:
if match:
value = match.groups()[0]
return value.encode('utf-8').decode('unicode_escape').replace('\\/', '/')
def get_page(self) -> Page:
try:
return super()._get_page('div._5rgr._5gh8._3-hy.async_like', 'article')
except:
return super()._get_page('article[data-ft*="top_level_post_id"]', 'article')


class HashtagPageParser(PageParser):
cursor_regex = re.compile(r'(\/hashtag\/[a-z]+\/\?cursor=[^"]+).*$')
cursor_regex = re.compile(r'(\/hashtag\/[a-z]+\/\?locale=[a-z_A-Z]+&amp;cursor=[^"]+).*$')

def get_page(self) -> Page:
return super()._get_page('article', 'article')
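
The updated HashtagPageParser.cursor_regex expects a locale parameter before the cursor. A small check of the new pattern against an invented href, for illustration only:

import re

cursor_regex = re.compile(r'(\/hashtag\/[a-z]+\/\?locale=[a-z_A-Z]+&amp;cursor=[^"]+).*$')
snippet = '<a href="/hashtag/python/?locale=en_US&amp;cursor=AbCd1234">See more</a>'
match = cursor_regex.search(snippet)
if match:
    print(match.groups()[0])  # /hashtag/python/?locale=en_US&amp;cursor=AbCd1234
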