-
Notifications
You must be signed in to change notification settings - Fork 26
/
main.py
123 lines (106 loc) · 4.24 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
"""Load up book text, and generate a magic videobook, and serve it."""
import os
import re
from flask import Flask, render_template, request
import requests
import logging
from youtube_transcript_api import YouTubeTranscriptApi
APP = Flask(__name__)
# From https://stackoverflow.com/questions/4576077/how-can-i-split-a-text-into-sentences
alphabets = "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"
digits = "([0-9])"
def split_into_sentences(text):
text = " " + text + " "
text = text.replace("\n", " ")
text = re.sub(prefixes, "\\1<prd>", text)
text = re.sub(websites, "<prd>\\1", text)
text = re.sub(digits + "[.]" + digits, "\\1<prd>\\2", text)
if "..." in text:
text = text.replace("...", "<prd><prd><prd>")
if "Ph.D" in text:
text = text.replace("Ph.D.", "Ph<prd>D<prd>")
text = re.sub("\s" + alphabets + "[.] ", " \\1<prd> ", text)
text = re.sub(acronyms+" "+starters, "\\1<stop> \\2", text)
text = re.sub(alphabets + "[.]" + alphabets + "[.]" +
alphabets + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
text = re.sub(alphabets + "[.]" + alphabets +
"[.]", "\\1<prd>\\2<prd>", text)
text = re.sub(" "+suffixes+"[.] "+starters, " \\1<stop> \\2", text)
text = re.sub(" "+suffixes+"[.]", " \\1<prd>", text)
text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
if "”" in text:
text = text.replace(".”", "”.")
if "\"" in text:
text = text.replace(".\"", "\".")
if "!" in text:
text = text.replace("!\"", "\"!")
if "?" in text:
text = text.replace("?\"", "\"?")
text = text.replace(".", ".<stop>")
text = text.replace("?", "?<stop>")
text = text.replace("!", "!<stop>")
text = text.replace("<prd>", ".")
sentences = text.split("<stop>")
sentences = sentences[:-1]
sentences = [s.strip() for s in sentences]
return sentences
def get_book_sentences():
"""Get the book file, and tokenize it."""
text = open('caesar-and-christ.txt').read()
sentences = split_into_sentences(text)
return sentences[5004:10000] # TODO: generalize this
@APP.route('/')
def index():
"""Serve the index page."""
sentences = get_book_sentences()
logging.info("Loaded sentences: %s", sentences)
return render_template('index.html', sentences=sentences)
@APP.route('/image')
def image():
"""Serve the image page."""
prompt = request.args.get('prompt')
results = requests.get(
'https://lexica.art/api/v1/search', params={'q': prompt})
logging.info("Requesting image for prompt: %s", prompt)
if results.status_code != 200:
logging.info("Requested URL: %s", results.url)
logging.info("Content: %s", results.content)
results.raise_for_status()
results = results.json()
if results and results['images']:
response = {
'src': results['images'][0]['src'],
'alt': results['images'][0]['prompt'],
}
return response
return {}
_CACHED_TRANSCRIPTS = {}
@APP.route('/get_youtube_start_time')
def get_youtube_start_time():
"""Given a phrase and a YouTube video, find the start
time of that phrase in the video by looking at the transcript from YouTube.
"""
phrase = request.args.get('p') # a string that's likely a sentence
youtube_url = request.args.get('yt')
video_id = youtube_url.split('v=')[1]
if video_id not in _CACHED_TRANSCRIPTS:
_CACHED_TRANSCRIPTS[video_id] = YouTubeTranscriptApi.get_transcript(
video_id)
transcript = _CACHED_TRANSCRIPTS[video_id]
phrase = re.sub(r'[^\w\s]', '', phrase)
phrase_words = ' '.join([x.strip() for x in phrase.split(
' ')[:3]]).lower() # TODO: trigrams or somesuch
print("Searching for: ", phrase_words)
for sentence in transcript:
if phrase_words in sentence['text']:
print("Found %s in %s" % (phrase_words, sentence))
return sentence
return {}
if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)
APP.run(debug=True, port=5001)