Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft: Replace WARC with Obelisk #481

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ require (
github.com/disintegration/imaging v1.6.2
github.com/fatih/color v1.13.0
github.com/go-shiori/go-readability v0.0.0-20220215145315-dd6828d2f09b
github.com/go-shiori/obelisk v0.0.0-20220524135250-3d6752a59bd7
github.com/go-shiori/warc v0.0.0-20200621032813-359908319d1d
github.com/go-sql-driver/mysql v1.6.0
github.com/gofrs/uuid v4.2.0+incompatible
Expand All @@ -27,25 +28,30 @@ require (

require (
github.com/andybalholm/cascadia v1.3.1 // indirect
github.com/cenkalti/backoff/v4 v4.1.2 // indirect
github.com/go-shiori/dom v0.0.0-20210627111528-4e4722cd0d65 // indirect
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
github.com/google/uuid v1.3.0 // indirect
github.com/hashicorp/errwrap v1.1.0 // indirect
github.com/hashicorp/go-multierror v1.1.1 // indirect
github.com/inconshreveable/mousetrap v1.0.0 // indirect
github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 // indirect
github.com/kennygrant/sanitize v1.2.4 // indirect
github.com/mattn/go-colorable v0.1.12 // indirect
github.com/mattn/go-isatty v0.0.14 // indirect
github.com/mitchellh/go-homedir v1.1.0 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/remyoudompheng/bigfft v0.0.0-20200410134404-eec4a21b6bb0 // indirect
github.com/shurcooL/httpfs v0.0.0-20190707220628-8d4bc4ba7749 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/tdewolff/parse v2.3.4+incompatible // indirect
github.com/tdewolff/parse/v2 v2.5.27 // indirect
go.etcd.io/bbolt v1.3.6 // indirect
go.uber.org/atomic v1.9.0 // indirect
golang.org/x/image v0.0.0-20220413100746-70e8d0d3baa9 // indirect
golang.org/x/mod v0.6.0-dev.0.20220106191415-9b9b3d81d5e3 // indirect
golang.org/x/net v0.0.0-20220425223048-2871e0cb64e4 // indirect
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c // indirect
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8 // indirect
golang.org/x/text v0.3.7 // indirect
golang.org/x/tools v0.1.10 // indirect
Expand Down
13 changes: 12 additions & 1 deletion go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ github.com/bugsnag/bugsnag-go v0.0.0-20141110184014-b1d153021fcd/go.mod h1:2oa8n
github.com/bugsnag/osext v0.0.0-20130617224835-0dd3f918b21b/go.mod h1:obH5gd0BsqsP2LwDJ9aOkm/6J86V6lyAXCoQWGw3K50=
github.com/bugsnag/panicwrap v0.0.0-20151223152923-e2c28503fcd0/go.mod h1:D/8v3kj0zr8ZAKg1AQ6crr+5VwKN5eIywRkfhyM/+dE=
github.com/cenkalti/backoff/v4 v4.1.1/go.mod h1:scbssz8iZGpm3xbr14ovlUdkxfGXNInqkPWOWmG2CLw=
github.com/cenkalti/backoff/v4 v4.1.2 h1:6Yo7N8UP2K6LWZnW94DLVSSrbobcWdVzAYOisuDPIFo=
github.com/cenkalti/backoff/v4 v4.1.2/go.mod h1:scbssz8iZGpm3xbr14ovlUdkxfGXNInqkPWOWmG2CLw=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/census-instrumentation/opencensus-proto v0.3.0/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
Expand Down Expand Up @@ -333,6 +334,7 @@ github.com/coreos/pkg v0.0.0-20160727233714-3ac0863d7acf/go.mod h1:E3G3o1h8I7cfc
github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA=
github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
github.com/cpuguy83/go-md2man/v2 v2.0.0/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
github.com/cpuguy83/go-md2man/v2 v2.0.1/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/creack/pty v1.1.7/go.mod h1:lj5s0c3V2DBrqTV7llrYr5NG6My20zk30Fl46Y7DoTY=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
Expand Down Expand Up @@ -460,6 +462,8 @@ github.com/go-shiori/dom v0.0.0-20210627111528-4e4722cd0d65 h1:zx4B0AiwqKDQq+Agq
github.com/go-shiori/dom v0.0.0-20210627111528-4e4722cd0d65/go.mod h1:NPO1+buE6TYOWhUI98/hXLHHJhunIpXRuvDN4xjkCoE=
github.com/go-shiori/go-readability v0.0.0-20220215145315-dd6828d2f09b h1:yrGomo5CP7IvXwSwKbDeaJkhwa4BxfgOO/s1V7iOQm4=
github.com/go-shiori/go-readability v0.0.0-20220215145315-dd6828d2f09b/go.mod h1:LTRGsNyO3/Y6u3ERbz17OiXy2qO1Y+/8QjXpg2ViyEY=
github.com/go-shiori/obelisk v0.0.0-20220524135250-3d6752a59bd7 h1:+TWg0Pe3/7YUbL0MuF4O/PdN+68M4HsUt1GyER/pvbU=
github.com/go-shiori/obelisk v0.0.0-20220524135250-3d6752a59bd7/go.mod h1:qKa73D7hc0YucHndvsCOgZ5Ap54XgSmZxaIytNAFUAQ=
github.com/go-shiori/warc v0.0.0-20200621032813-359908319d1d h1:+SEf4hYDaAt2eyq8Xu3YyWCpnMsK8sZfbYsDRFCUgBM=
github.com/go-shiori/warc v0.0.0-20200621032813-359908319d1d/go.mod h1:uaK5DAxFig7atOzy+aqLzhs6qJacMDfs8NxHV5+shzc=
github.com/go-sql-driver/mysql v1.4.0/go.mod h1:zAC/RDZ24gD3HViQzih4MyKcchzm+sOG5ZlKdlhCg5w=
Expand Down Expand Up @@ -755,6 +759,8 @@ github.com/karrick/godirwalk v1.8.0/go.mod h1:H5KPZjojv4lE+QYImBI8xVtrBRgYrIVsaR
github.com/karrick/godirwalk v1.10.3/go.mod h1:RoGL9dQei4vP9ilrpETWE8CLOZ1kiN0LhBygSwrAsHA=
github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 h1:Z9n2FFNUXsshfwJMBgNA0RU6/i7WVaAegv3PtuIHPMs=
github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51/go.mod h1:CzGEWj7cYgsdH8dAjBGEr58BoE7ScuLd+fwFZ44+/x8=
github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q=
github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQLJ+jE2L00=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
Expand Down Expand Up @@ -1047,6 +1053,7 @@ github.com/spf13/cobra v0.0.2-0.20171109065643-2da4a54c5cee/go.mod h1:1l0Ry5zgKv
github.com/spf13/cobra v0.0.3/go.mod h1:1l0Ry5zgKvJasoi3XT1TypsSe7PqH0Sj9dhYf7v3XqQ=
github.com/spf13/cobra v1.0.0/go.mod h1:/6GTrnGXV9HjY+aR4k0oJ5tcvakLuG6EuKReYlHNrgE=
github.com/spf13/cobra v1.1.3/go.mod h1:pGADOWyqRD/YMrPZigI/zbliZ2wVD/23d+is3pSWzOo=
github.com/spf13/cobra v1.4.0/go.mod h1:Wo4iy3BUC+X2Fybo0PDqwJIv3dNRiZLHQymsfxlB84g=
github.com/spf13/cobra v1.5.0 h1:X+jTBEBqF0bHN+9cSMgmfuvv2VHJ9ezmFNf9Y/XstYU=
github.com/spf13/cobra v1.5.0/go.mod h1:dWXEIy2H428czQCjInthrTRUg7yKbok+2Qi/yBIJoUM=
github.com/spf13/jwalterweatherman v1.0.0/go.mod h1:cQK4TGJAtQXfYWX+Ddv3mKDzgVb68N+wFjFa4jdeBTo=
Expand Down Expand Up @@ -1079,8 +1086,11 @@ github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635/go.mod h1:hkRG
github.com/tchap/go-patricia v2.2.6+incompatible/go.mod h1:bmLyhP68RS6kStMGxByiQ23RP/odRBOTVjwp2cDyi6I=
github.com/tdewolff/parse v2.3.4+incompatible h1:x05/cnGwIMf4ceLuDMBOdQ1qGniMoxpP46ghf0Qzh38=
github.com/tdewolff/parse v2.3.4+incompatible/go.mod h1:8oBwCsVmUkgHO8M5iCzSIDtpzXOT0WXX9cWhz+bIzJQ=
github.com/tdewolff/test v1.0.0 h1:jOwzqCXr5ePXEPGJaq2ivoR6HOCi+D5TPfpoyg8yvmU=
github.com/tdewolff/parse/v2 v2.5.27 h1:PL3LzzXaOpmdrknnOlIeO2muIBHAwiKp6TxN1RbU5gI=
github.com/tdewolff/parse/v2 v2.5.27/go.mod h1:WzaJpRSbwq++EIQHYIRTpbYKNA3gn9it1Ik++q4zyho=
github.com/tdewolff/test v1.0.0/go.mod h1:DiQUlutnqlEvdvhSn2LPGy4TFwRauAaYDsL+683RNX4=
github.com/tdewolff/test v1.0.6 h1:76mzYJQ83Op284kMT+63iCNCI7NEERsIN8dLM+RiKr4=
github.com/tdewolff/test v1.0.6/go.mod h1:6DAvZliBAAnD7rhVgwaM7DE5/d9NMOAJ09SqYqeK4QE=
github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk=
github.com/tmc/grpc-websocket-proxy v0.0.0-20170815181823-89b8d40f7ca8/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U=
github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U=
Expand Down Expand Up @@ -1357,6 +1367,7 @@ golang.org/x/sync v0.0.0-20200317015054-43a5402ce75a/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c h1:5KslGYwFpkhGh+Q16bwMP3cOontH8FOep7tGV86Y7SQ=
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20180224232135-f6cff0780e54/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
Expand Down
2 changes: 1 addition & 1 deletion internal/core/core.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
package core

const userAgent = "Shiori/2.0.0 (+https://github.com/go-shiori/shiori)"
const userAgent = "Shiori (+https://github.com/go-shiori/shiori)"
35 changes: 26 additions & 9 deletions internal/core/processing.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ package core

import (
"bytes"
"compress/gzip"
"context"
"fmt"
"image"
"image/color"
Expand All @@ -18,8 +20,8 @@ import (

"github.com/disintegration/imaging"
"github.com/go-shiori/go-readability"
"github.com/go-shiori/obelisk"
"github.com/go-shiori/shiori/internal/model"
"github.com/go-shiori/warc"

// Add support for png
_ "image/png"
Expand Down Expand Up @@ -130,17 +132,32 @@ func ProcessBookmark(req ProcessRequest) (book model.Bookmark, isFatalErr bool,
archivePath := fp.Join(req.DataDir, "archive", fmt.Sprintf("%d", book.ID))
os.Remove(archivePath)

archivalRequest := warc.ArchivalRequest{
URL: book.URL,
Reader: archivalInput,
ContentType: contentType,
UserAgent: userAgent,
LogEnabled: req.LogArchival,
req := obelisk.Request{
URL: book.URL,
}

err = warc.NewArchive(archivalRequest, archivePath)
arc := obelisk.Archiver{
UserAgent: userAgent,
}
arc.Validate()

result, _, err := arc.Archive(context.Background(), req)
if err != nil {
return book, false, fmt.Errorf("failed to create archive: %v", err)
return book, false, fmt.Errorf("failed to archive: %s", err)
}

// Destination
f, err := os.Create(archivePath)
if err != nil {
return book, false, fmt.Errorf("failed to create archive file: %s", err)
}
defer f.Close()

// Compress
gz := gzip.NewWriter(f)
defer gz.Close()
if _, err := gz.Write(result); err != nil {
return book, false, fmt.Errorf("failed to write archive file: %s", err)
}

book.HasArchive = true
Expand Down
159 changes: 77 additions & 82 deletions internal/webserver/handler-ui.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ import (
"strings"

"github.com/PuerkitoBio/goquery"
"github.com/go-shiori/warc"
"github.com/julienschmidt/httprouter"

"github.com/go-shiori/shiori/internal/model"
Expand Down Expand Up @@ -115,72 +114,72 @@ func (h *handler) serveBookmarkContent(w http.ResponseWriter, r *http.Request, p
}

// Check if it has archive.
archivePath := fp.Join(h.DataDir, "archive", strID)
if fileExists(archivePath) {
bookmark.HasArchive = true

// Open archive, look in cache first
var archive *warc.Archive
cacheData, found := h.ArchiveCache.Get(strID)

if found {
archive = cacheData.(*warc.Archive)
} else {
archivePath := fp.Join(h.DataDir, "archive", strID)
archive, err = warc.Open(archivePath)
checkError(err)

h.ArchiveCache.Set(strID, archive, 0)
}

// Find all image and convert its source to use the archive URL.
createArchivalURL := func(archivalName string) string {
archivalURL := *r.URL
archivalURL.Path = path.Join(h.RootPath, "bookmark", strID, "archive", archivalName)
return archivalURL.String()
}

buffer := strings.NewReader(bookmark.HTML)
doc, err := goquery.NewDocumentFromReader(buffer)
checkError(err)

doc.Find("img, picture, figure, source").Each(func(_ int, node *goquery.Selection) {
// Get the needed attributes
src, _ := node.Attr("src")
strSrcSets, _ := node.Attr("srcset")

// Convert `src` attributes
if src != "" {
archivalName := getArchivalName(src)
if archivalName != "" && archive.HasResource(archivalName) {
node.SetAttr("src", createArchivalURL(archivalName))
}
}

// Split srcset by comma, then process it like any URLs
srcSets := strings.Split(strSrcSets, ",")
for i, srcSet := range srcSets {
srcSet = strings.TrimSpace(srcSet)
parts := strings.SplitN(srcSet, " ", 2)
if parts[0] == "" {
continue
}

archivalName := getArchivalName(parts[0])
if archivalName != "" && archive.HasResource(archivalName) {
archivalURL := createArchivalURL(archivalName)
srcSets[i] = strings.Replace(srcSets[i], parts[0], archivalURL, 1)
}
}

if len(srcSets) > 0 {
node.SetAttr("srcset", strings.Join(srcSets, ","))
}
})

bookmark.HTML, err = goquery.OuterHtml(doc.Selection)
checkError(err)
}
// archivePath := fp.Join(h.DataDir, "archive", strID)
// if fileExists(archivePath) {
// bookmark.HasArchive = true

// // Open archive, look in cache first
// var archive *warc.Archive
// cacheData, found := h.ArchiveCache.Get(strID)

// if found {
// archive = cacheData.(*warc.Archive)
// } else {
// archivePath := fp.Join(h.DataDir, "archive", strID)
// archive, err = warc.Open(archivePath)
// checkError(err)

// h.ArchiveCache.Set(strID, archive, 0)
// }

// // Find all image and convert its source to use the archive URL.
// createArchivalURL := func(archivalName string) string {
// archivalURL := *r.URL
// archivalURL.Path = path.Join(h.RootPath, "bookmark", strID, "archive", archivalName)
// return archivalURL.String()
// }

// buffer := strings.NewReader(bookmark.HTML)
// doc, err := goquery.NewDocumentFromReader(buffer)
// checkError(err)

// doc.Find("img, picture, figure, source").Each(func(_ int, node *goquery.Selection) {
// // Get the needed attributes
// src, _ := node.Attr("src")
// strSrcSets, _ := node.Attr("srcset")

// // Convert `src` attributes
// if src != "" {
// archivalName := getArchivalName(src)
// if archivalName != "" && archive.HasResource(archivalName) {
// node.SetAttr("src", createArchivalURL(archivalName))
// }
// }

// // Split srcset by comma, then process it like any URLs
// srcSets := strings.Split(strSrcSets, ",")
// for i, srcSet := range srcSets {
// srcSet = strings.TrimSpace(srcSet)
// parts := strings.SplitN(srcSet, " ", 2)
// if parts[0] == "" {
// continue
// }

// archivalName := getArchivalName(parts[0])
// if archivalName != "" && archive.HasResource(archivalName) {
// archivalURL := createArchivalURL(archivalName)
// srcSets[i] = strings.Replace(srcSets[i], parts[0], archivalURL, 1)
// }
// }

// if len(srcSets) > 0 {
// node.SetAttr("srcset", strings.Join(srcSets, ","))
// }
// })

// bookmark.HTML, err = goquery.OuterHtml(doc.Selection)
// checkError(err)
// }

// Execute template
if developmentMode {
Expand Down Expand Up @@ -237,8 +236,6 @@ func (h *handler) serveThumbnailImage(w http.ResponseWriter, r *http.Request, ps
func (h *handler) serveBookmarkArchive(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
// Get parameter from URL
strID := ps.ByName("id")
resourcePath := ps.ByName("filepath")
resourcePath = strings.TrimPrefix(resourcePath, "/")

// Get bookmark from database
id, err := strconv.Atoi(strID)
Expand All @@ -260,29 +257,27 @@ func (h *handler) serveBookmarkArchive(w http.ResponseWriter, r *http.Request, p
}
}

// Open archive, look in cache first
var archive *warc.Archive
cacheData, found := h.ArchiveCache.Get(strID)
resourcePath := fp.Join(h.DataDir, "archive", strID)

if found {
archive = cacheData.(*warc.Archive)
} else {
archivePath := fp.Join(h.DataDir, "archive", strID)
archive, err = warc.Open(archivePath)
checkError(err)
archive, err := os.Open(resourcePath)
checkError(err)

h.ArchiveCache.Set(strID, archive, 0)
}
// reader, err := gzip.NewReader(archive)
// checkError(err)

content, contentType, err := archive.Read(resourcePath)
content, err := io.ReadAll(archive)
checkError(err)

// TODO: cache layer

contentType := "text/html"

// Set response header
w.Header().Set("Content-Encoding", "gzip")
w.Header().Set("Content-Type", contentType)

// If this is HTML and root, inject shiori header
if strings.Contains(strings.ToLower(contentType), "text/html") && resourcePath == "" {
if strings.Contains(strings.ToLower(contentType), "text/html") {
// Extract gzip
buffer := bytes.NewBuffer(content)
gzipReader, err := gzip.NewReader(buffer)
Expand Down