-
Notifications
You must be signed in to change notification settings - Fork 0
/
getDetail.py
46 lines (43 loc) · 1.65 KB
/
getDetail.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import requests
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector
import re
def _og_content(soup, prop):
    """Return the ``content`` attribute of ``<meta property=prop>``, or ``""``."""
    tag = soup.find("meta", property=prop)
    if tag is None:
        return ""
    # ``content`` may be absent even when the tag exists; normalise to "".
    return tag.get("content") or ""


def getDetail(newslink):
    """Fetch *newslink* and return its Open Graph metadata.

    The page is downloaded with a browser-like User-Agent and a 20 second
    timeout, parsed with BeautifulSoup, and the ``og:description``,
    ``og:image``, ``og:title`` and ``og:url`` meta tags are read.

    Args:
        newslink: URL of the page to inspect.

    Returns:
        A dict with the keys ``"description"``, ``"imgPath"``, ``"title"``
        and ``"link"``.  A key keeps the value ``""`` when its meta tag is
        missing; every key is ``""`` when the page could not be fetched or
        parsed.  The function is best-effort: it reports problems with
        ``print`` instead of raising.
    """
    content = {"description": "", "imgPath": "", "title": "", "link": ""}
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
        )
    }

    try:
        req = requests.get(newslink, headers=headers, timeout=20)
        req.raise_for_status()
        # Trust the HTTP charset only when the server actually sent one;
        # otherwise requests would just be guessing a default.
        http_encoding = (
            req.encoding
            if "charset" in req.headers.get("content-type", "").lower()
            else None
        )
        html_encoding = EncodingDetector.find_declared_encoding(
            req.content, is_html=True
        )
        # May be None when nothing is declared; BeautifulSoup then detects
        # the encoding from the bytes itself.
        encoding = http_encoding or html_encoding
        # Pass the raw bytes: ``from_encoding`` is ignored for decoded text.
        soup = BeautifulSoup(req.content, "html.parser", from_encoding=encoding)
    except Exception as error_detail:
        # Deliberate catch-all: callers rely on an empty record, not an
        # exception, when a link is unreachable or unparsable.
        print("not found..", error_detail)
        return content

    og_fields = (
        ("description", "og:description"),
        ("imgPath", "og:image"),
        ("title", "og:title"),
        ("link", "og:url"),
    )
    # Read every tag independently so one missing tag does not discard
    # the values of the others.
    for key, prop in og_fields:
        value = _og_content(soup, prop)
        if not value:
            print("no description!", "missing " + prop)
            continue
        content[key] = value.strip() if key == "description" else value
    return content