-
Notifications
You must be signed in to change notification settings - Fork 7
/
nyaa.go
175 lines (157 loc) · 5 KB
/
nyaa.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
package main
import (
"fmt"
"io/ioutil"
"net/http"
"strconv"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/microcosm-cc/bluemonday"
"github.com/russross/blackfriday"
)
/*
### Nyaa.si adult categories ###
Art - Anime
Art - Doujinshi
Art - Games
Art - Manga
Art - Pictures
Real Life - Photobooks and Pictures
Real Life - Videos
### Nyaa.si categories ###
Anime - Anime Music Video
Anime - English-translated
Anime - Non-English-translated
Anime - Raw
Audio - Lossless
Audio - Lossy
Literature - English-translated
Literature - Non-English-translated
Literature - Raw
Live Action - English-translated
Live Action - Idol/Promotional Video
Live Action - Non-English-translated
Live Action - Raw
Pictures - Graphics
Pictures - Photos
Software - Applications
Software - Games
*/
//nyaaParent crawls nyaa.si listing pages and forwards each page's raw HTML on chHTML
//startOffset is the page to start scraping on
//pageMax is the maximum number of pages we want to scrape
//baseURL is the site root; "/?p=<page>" is appended to it for every request
//chHTML receives one HTMLBlob per fetched page (URL = baseURL, Raw = response body)
//The crawl stops at the first request, read or parse failure, or when a 404 page is seen
func nyaaParent(startOffset, pageMax int, baseURL string, chHTML chan<- HTMLBlob) {
	var blob HTMLBlob
	blob.URL = baseURL

	for i := 0; i < pageMax; i++ {
		nyaaURL := baseURL + "/?p=" + strconv.Itoa(startOffset+i)

		response, err := http.Get(nyaaURL)
		if err != nil {
			//response is nil when Get fails, so there is no body to close here
			fmt.Println("ERROR: Failed to crawl\""+nyaaURL+"\"", err)
			return
		}
		b, err := ioutil.ReadAll(response.Body)
		response.Body.Close()
		if err != nil {
			//A truncated page would be parsed as if it were complete, so stop instead
			fmt.Println("ERROR: Failed to read body of \""+nyaaURL+"\":", err)
			return
		}

		notFound, err := nyaaPageIs404(b)
		if err != nil {
			fmt.Println("Errored checking for Nyaa.si 404", err)
			return
		}
		if notFound {
			fmt.Println("Found 404, exiting crawler")
			return
		}

		blob.Raw = b
		chHTML <- blob
	}
}

//nyaaPageIs404 reports whether the HTML in b carries the "404 Not Found" heading
func nyaaPageIs404(b []byte) (bool, error) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(b)))
	if err != nil {
		return false, err
	}
	heading := doc.Find("div.container:nth-child(2) > h1:nth-child(1)").Text()
	return heading == "404 Not Found", nil
}
//nyaaChild leverages their API for torrent info
//It consumes URLs from chNyaaURL until that channel is closed, looks each one up
//through nyaaAPI, converts the result with nyaaBuildStruct and sends it on chTorrent
//URLs whose result lacks a title, hash or magnet link are pushed back onto chNyaaURL;
//that send blocks until the channel has room for it
func nyaaChild(chTorrent chan<- Torrent, chNyaaURL chan string) {
	for url := range chNyaaURL {
		n, err := nyaaAPI(url)
		if err != nil {
			//Not fatal: the (probably empty) result fails the key-info check below
			//and the URL is queued again
			fmt.Println(err, "on page", url)
		}
		info := nyaaBuildStruct(n, url)
		if len(info.Description) < 1 {
			info.Description = "No description found"
		}
		if len(info.Uploader) < 2 {
			info.Uploader = "Anonymous"
		}
		//If any key info was missed, send it back and rescrape it
		if len(info.Title) < 2 || len(info.Hash) == 0 || len(info.Magnet) == 0 {
			fmt.Println("Nyaa scrape failed, missing title, hash, or magnet link." +
				"Pushing to end of queue")
			chNyaaURL <- url
			continue
		}
		//Try a non-blocking send and back off while nobody can take the torrent.
		//Comparing len() with cap() would spin forever on an unbuffered channel
		//(both are always 0) and is racy when several workers share chTorrent.
		for sent := false; !sent; {
			select {
			case chTorrent <- info:
				sent = true
			default:
				fmt.Println("Torrent channel full, sleeping 3 seconds")
				time.Sleep(3 * time.Second)
			}
		}
	}
}
//nyaaBuildStruct converts one nyaa API response into a Torrent
//n is the decoded API response, url is the page it was fetched for (stored as Source)
//The returned Torrent has its hash trimmed and upper-cased, its category looked up in
//nyaaCategoryMap and its description rendered from markdown to sanitized HTML
func nyaaBuildStruct(n nyaaJSON, url string) (info Torrent) {
	info.Source = url
	info.Title = n.Name
	info.Uploader = n.Uploader
	info.UploaderID = 0
	//info.Language = //Doesn't exist on Nyaa.si
	info.Magnet = n.Magnet
	info.Hash = strings.ToUpper(strings.TrimSpace(n.HashHex))
	info.FileSize = n.FileSize
	info.Date = n.CreatedOn
	info.Seeders = n.Stats.Seeders
	info.Leechers = n.Stats.Leechers
	info.Completed = n.Stats.Downloads

	//The adult site is served from the "sukebei" subdomain, so torrents
	//scraped from there are flagged as adult
	info.Adult = strings.Contains(url, "sukebei")

	//Join the api (sub)category with - to map it easier
	category := n.MainCategory + " - " + n.SubCategory
	//An unknown category leaves info.Category at its zero value; slicing the
	//missing (nil) map entry with [:2] would panic
	if ids, ok := nyaaCategoryMap[category]; ok {
		copy(info.Category[:2], ids)
	}

	//Convert the api markdown description to sanitized HTML
	unsafe := blackfriday.MarkdownCommon([]byte(n.Description))
	info.Description = string(bluemonday.UGCPolicy().SanitizeBytes(unsafe))
	return info
}
//nyaaCategoryMap translates a "<main category> - <sub category>" string into a
//{main, sub} pair of numeric ids
//Adult and regular categories reuse the same numbers (e.g. {1, 1}), so a pair is
//presumably only meaningful together with the torrent's Adult flag - TODO confirm
var nyaaCategoryMap = map[string][]int{
	//Adult: Art
	"Art - Anime":     {1, 1},
	"Art - Doujinshi": {1, 2},
	"Art - Games":     {1, 3},
	"Art - Manga":     {1, 4},
	"Art - Pictures":  {1, 5},

	//Adult: Real Life
	"Real Life - Photobooks and Pictures": {2, 1},
	"Real Life - Videos":                  {2, 2},

	//Software
	"Software - Applications": {1, 1},
	"Software - Games":        {1, 2},

	//Audio
	"Audio - Lossless": {2, 3},
	"Audio - Lossy":    {2, 4},

	//Anime
	"Anime - English-translated":     {3, 5},
	"Anime - Raw":                    {3, 6},
	"Anime - Anime Music Video":      {3, 12},
	"Anime - Non-English-translated": {3, 13},

	//Literature
	"Literature - English-translated":     {4, 7},
	"Literature - Raw":                    {4, 8},
	"Literature - Non-English-translated": {4, 14},

	//Live Action
	"Live Action - English-translated":     {5, 9},
	"Live Action - Idol/Promotional Video": {5, 10},
	"Live Action - Raw":                    {5, 11},
	"Live Action - Non-English-translated": {5, 18},

	//Pictures
	"Pictures - Graphics": {6, 15},
	"Pictures - Photos":   {6, 16},
}