-
Notifications
You must be signed in to change notification settings - Fork 1
/
run.py
121 lines (100 loc) · 3.79 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import sys
import requests
from bs4 import BeautifulSoup
import xlsxwriter
import argparse
class Book:
    """Plain attribute container for one scraped LibraryThing work.

    Instances are created empty; ``getBooks`` assigns ``title``, ``isbn``,
    ``workId`` and ``tags`` on them afterwards.
    """
class Tag:
    """Plain attribute container for one tag of a work.

    Instances are created empty; ``getTags`` assigns ``content`` (the tag
    text) and ``count`` (an ``int``) on them afterwards.
    """
# Default value of the --source option: file that getIDs reads line by line.
INPUT = "input.txt"
# Default value of the --destination option: workbook written by writeToExcel.
OUTPUT = "output.xlsx"
# Default value of the --numberOfTags option: how many of the most-used tags to keep per book.
NUMBEROFTAGS = 6
def main():
    """Parse the command line, scrape every listed work and write the workbook.

    Command-line options:
        --source        file with one identifier per line (default ``INPUT``).
                        Each line is passed to ``getBooks``, which substitutes
                        it into a LibraryThing ``/work/<id>`` URL.
        --destination   name of the ``.xlsx`` file to create (default ``OUTPUT``).
        --numberOfTags  how many of the most-used tags to keep per book
                        (default ``NUMBEROFTAGS``).

    Returns:
        None, so ``sys.exit(main())`` exits with status 0 on success.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--source", help="file location of sourcefile (list of ISBN separated with newlines)", default=INPUT
    )
    parser.add_argument(
        "--destination", help="Name of the output file", default=OUTPUT
    )
    # type=int is required: without it a value given on the command line stays
    # a str and the slice ``tags[:numberOfTags]`` in getTags raises TypeError.
    parser.add_argument(
        "--numberOfTags", help="number of n most used tags to save", default=NUMBEROFTAGS, type=int
    )
    args = parser.parse_args()

    workIds = getIDs(args.source)
    books = getBooks(workIds, args.numberOfTags)
    writeToExcel(books, args.destination)
def _metaContent(soup, prop, default=""):
    """Return the ``content`` attribute of ``<meta property=prop>``, or *default* if absent."""
    tag = soup.find("meta", property=prop)
    if tag is None:
        return default
    return tag.get("content", default)


def getBooks(isbns, numberOfTags):
    """Scrape title, ISBN, work ID and top tags for every requested work.

    Args:
        isbns: iterable of identifiers. Despite the name, each one is
            substituted into ``https://www.librarything.com/work/<id>``,
            i.e. it is used as a work ID.
        numberOfTags: forwarded unchanged to ``getTags``.

    Returns:
        list of ``Book`` objects, one per identifier and in the same order,
        each with ``title``, ``isbn``, ``workId`` and ``tags`` set. A missing
        ``og:title`` or ``books:isbn`` meta tag yields an empty string instead
        of aborting the whole run.
    """
    books = []
    for requestedId in isbns:
        print("looking for work: {}".format(requestedId))
        url = "https://www.librarything.com/work/{}".format(requestedId)
        # NOTE(review): verify=False switches off TLS certificate checking.
        # Kept as in the original because the reason for it is not visible
        # here -- confirm whether it is still needed.
        page = requests.get(url, verify=False)
        soup = BeautifulSoup(page.content, 'html.parser')

        book = Book()
        book.title = _metaContent(soup, "og:title")
        book.isbn = _metaContent(soup, "books:isbn")

        # The work ID is the fifth "/"-separated piece of the og:url value
        # (https: / "" / host / work / <id>). Fall back to the identifier we
        # requested when the tag is missing or the URL is shorter than that.
        urlParts = _metaContent(soup, "og:url").split("/")
        book.workId = urlParts[4] if len(urlParts) > 4 else str(requestedId)

        print("workID: " + book.workId)
        print("isbn: " + book.isbn)
        print("title: " + book.title)

        book.tags = getTags(book.workId, numberOfTags)
        books.append(book)
    return books
def getTags(workID, numberOfTags):
    """Fetch the most-used tags of one LibraryThing work.

    Args:
        workID: work identifier substituted into the tag-listing URL.
        numberOfTags: maximum number of tags to return. Coerced with
            ``int()`` so a string (e.g. straight from the command line)
            works; ``None`` returns every tag.

    Returns:
        list of ``Tag`` objects (``content`` = tag text, ``count`` = int),
        sorted by ``count`` in descending order and cut to ``numberOfTags``.
        Tag spans with empty text, or without the expected link / count
        markup, are skipped.
    """
    # NOTE(review): the trailing `&lang="+lang` looks like a leftover from
    # copied JavaScript string concatenation. It is kept byte-for-byte because
    # how the server treats it is not visible from this file -- confirm.
    url = "http://www.librarything.com/ajaxinc_showbooktags.php?work={}&all=1&print=1&doit=1&lang=\"+lang".format(
        workID)
    # NOTE(review): verify=False disables certificate checking; the URL is
    # plain http, so presumably it only matters after a redirect -- confirm.
    page = requests.get(url, verify=False)
    soup = BeautifulSoup(page.content, 'html.parser')

    limit = None if numberOfTags is None else int(numberOfTags)

    tagCollection = []
    for unparsedTag in soup.find_all("span", class_="tag"):
        link = unparsedTag.find("a")
        countSpan = unparsedTag.find("span", class_="count")
        # Skip spans that do not have the expected link + count structure
        # instead of failing with AttributeError on None.
        if link is None or countSpan is None:
            continue
        if link.text == '':
            continue

        # The count is the text between the parentheses, e.g. "(1,234)".
        countText = countSpan.text
        countString = countText[countText.find("(") + 1:countText.find(")")]
        try:
            count = int(countString.replace(',', ''))
        except ValueError:
            # Count is not a number; one odd tag should not abort the run.
            continue

        tag = Tag()
        tag.content = link.text
        tag.count = count
        tagCollection.append(tag)

    tagCollection.sort(key=lambda x: x.count, reverse=True)
    return tagCollection[:limit]
def writeToExcel(books, output):
    """Write one spreadsheet row per book to a new workbook at *output*.

    Row 0 holds the headings ``title``, ``ISBN``, ``workID`` and ``tags``.
    Each following row holds the matching attributes of one book, with the
    ``content`` of its tags joined by commas in the last column.

    Args:
        books: iterable of objects exposing ``title``, ``isbn``, ``workId``
            and ``tags`` (each tag exposing ``content``).
        output: file name handed to ``xlsxwriter.Workbook``.
    """
    print("excel parsing")
    workbook = xlsxwriter.Workbook(output)
    worksheet = workbook.add_worksheet()

    # Header row.
    for column, heading in enumerate(("title", "ISBN", "workID", "tags")):
        worksheet.write(0, column, heading)

    # Data rows start directly below the header.
    for row, book in enumerate(books, start=1):
        worksheet.write(row, 0, book.title)
        worksheet.write(row, 1, book.isbn)
        worksheet.write(row, 2, book.workId)
        worksheet.write(row, 3, ','.join(tag.content for tag in book.tags))

    print("Writing to excel...")
    workbook.close()
def getIDs(source):
    """Read identifiers from *source*, one per line.

    Surrounding whitespace is stripped from every line. Blank or
    whitespace-only lines are skipped so they do not turn into empty IDs
    (which would make the caller request a URL with no ID in it).

    Args:
        source: path of the text file to read.

    Returns:
        list of non-empty identifier strings in file order. The list is
        also printed to stdout.
    """
    with open(source, "rt") as f:
        stripped = (line.strip() for line in f)
        ids = [entry for entry in stripped if entry]
    print(ids)
    return ids
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the exit status
    # (None means success, i.e. status 0).
    raise SystemExit(main())