diff --git a/go.mod b/go.mod index baf635988..3192dc0eb 100644 --- a/go.mod +++ b/go.mod @@ -8,6 +8,7 @@ require ( github.com/disintegration/imaging v1.6.2 github.com/fatih/color v1.13.0 github.com/go-shiori/go-readability v0.0.0-20220215145315-dd6828d2f09b + github.com/go-shiori/obelisk v0.0.0-20220524135250-3d6752a59bd7 github.com/go-shiori/warc v0.0.0-20200621032813-359908319d1d github.com/go-sql-driver/mysql v1.6.0 github.com/gofrs/uuid v4.2.0+incompatible @@ -27,6 +28,7 @@ require ( require ( github.com/andybalholm/cascadia v1.3.1 // indirect + github.com/cenkalti/backoff/v4 v4.1.2 // indirect github.com/go-shiori/dom v0.0.0-20210627111528-4e4722cd0d65 // indirect github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect github.com/google/uuid v1.3.0 // indirect @@ -34,18 +36,22 @@ require ( github.com/hashicorp/go-multierror v1.1.1 // indirect github.com/inconshreveable/mousetrap v1.0.0 // indirect github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 // indirect + github.com/kennygrant/sanitize v1.2.4 // indirect github.com/mattn/go-colorable v0.1.12 // indirect github.com/mattn/go-isatty v0.0.14 // indirect github.com/mitchellh/go-homedir v1.1.0 // indirect + github.com/pkg/errors v0.9.1 // indirect github.com/remyoudompheng/bigfft v0.0.0-20200410134404-eec4a21b6bb0 // indirect github.com/shurcooL/httpfs v0.0.0-20190707220628-8d4bc4ba7749 // indirect github.com/spf13/pflag v1.0.5 // indirect github.com/tdewolff/parse v2.3.4+incompatible // indirect + github.com/tdewolff/parse/v2 v2.5.27 // indirect go.etcd.io/bbolt v1.3.6 // indirect go.uber.org/atomic v1.9.0 // indirect golang.org/x/image v0.0.0-20220413100746-70e8d0d3baa9 // indirect golang.org/x/mod v0.6.0-dev.0.20220106191415-9b9b3d81d5e3 // indirect golang.org/x/net v0.0.0-20220425223048-2871e0cb64e4 // indirect + golang.org/x/sync v0.0.0-20210220032951-036812b2e83c // indirect golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8 // indirect golang.org/x/text v0.3.7 // indirect golang.org/x/tools v0.1.10 // indirect diff --git a/go.sum b/go.sum index dc799f8bc..59f192ed1 100644 --- a/go.sum +++ b/go.sum @@ -178,6 +178,7 @@ github.com/bugsnag/bugsnag-go v0.0.0-20141110184014-b1d153021fcd/go.mod h1:2oa8n github.com/bugsnag/osext v0.0.0-20130617224835-0dd3f918b21b/go.mod h1:obH5gd0BsqsP2LwDJ9aOkm/6J86V6lyAXCoQWGw3K50= github.com/bugsnag/panicwrap v0.0.0-20151223152923-e2c28503fcd0/go.mod h1:D/8v3kj0zr8ZAKg1AQ6crr+5VwKN5eIywRkfhyM/+dE= github.com/cenkalti/backoff/v4 v4.1.1/go.mod h1:scbssz8iZGpm3xbr14ovlUdkxfGXNInqkPWOWmG2CLw= +github.com/cenkalti/backoff/v4 v4.1.2 h1:6Yo7N8UP2K6LWZnW94DLVSSrbobcWdVzAYOisuDPIFo= github.com/cenkalti/backoff/v4 v4.1.2/go.mod h1:scbssz8iZGpm3xbr14ovlUdkxfGXNInqkPWOWmG2CLw= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/census-instrumentation/opencensus-proto v0.3.0/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= @@ -333,6 +334,7 @@ github.com/coreos/pkg v0.0.0-20160727233714-3ac0863d7acf/go.mod h1:E3G3o1h8I7cfc github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA= github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= github.com/cpuguy83/go-md2man/v2 v2.0.0/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= +github.com/cpuguy83/go-md2man/v2 v2.0.1/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/creack/pty v1.1.7/go.mod h1:lj5s0c3V2DBrqTV7llrYr5NG6My20zk30Fl46Y7DoTY= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= @@ -460,6 +462,8 @@ github.com/go-shiori/dom v0.0.0-20210627111528-4e4722cd0d65 h1:zx4B0AiwqKDQq+Agq github.com/go-shiori/dom v0.0.0-20210627111528-4e4722cd0d65/go.mod h1:NPO1+buE6TYOWhUI98/hXLHHJhunIpXRuvDN4xjkCoE= github.com/go-shiori/go-readability v0.0.0-20220215145315-dd6828d2f09b h1:yrGomo5CP7IvXwSwKbDeaJkhwa4BxfgOO/s1V7iOQm4= github.com/go-shiori/go-readability v0.0.0-20220215145315-dd6828d2f09b/go.mod h1:LTRGsNyO3/Y6u3ERbz17OiXy2qO1Y+/8QjXpg2ViyEY= +github.com/go-shiori/obelisk v0.0.0-20220524135250-3d6752a59bd7 h1:+TWg0Pe3/7YUbL0MuF4O/PdN+68M4HsUt1GyER/pvbU= +github.com/go-shiori/obelisk v0.0.0-20220524135250-3d6752a59bd7/go.mod h1:qKa73D7hc0YucHndvsCOgZ5Ap54XgSmZxaIytNAFUAQ= github.com/go-shiori/warc v0.0.0-20200621032813-359908319d1d h1:+SEf4hYDaAt2eyq8Xu3YyWCpnMsK8sZfbYsDRFCUgBM= github.com/go-shiori/warc v0.0.0-20200621032813-359908319d1d/go.mod h1:uaK5DAxFig7atOzy+aqLzhs6qJacMDfs8NxHV5+shzc= github.com/go-sql-driver/mysql v1.4.0/go.mod h1:zAC/RDZ24gD3HViQzih4MyKcchzm+sOG5ZlKdlhCg5w= @@ -755,6 +759,8 @@ github.com/karrick/godirwalk v1.8.0/go.mod h1:H5KPZjojv4lE+QYImBI8xVtrBRgYrIVsaR github.com/karrick/godirwalk v1.10.3/go.mod h1:RoGL9dQei4vP9ilrpETWE8CLOZ1kiN0LhBygSwrAsHA= github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 h1:Z9n2FFNUXsshfwJMBgNA0RU6/i7WVaAegv3PtuIHPMs= github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51/go.mod h1:CzGEWj7cYgsdH8dAjBGEr58BoE7ScuLd+fwFZ44+/x8= +github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o= +github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak= github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q= github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQLJ+jE2L00= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= @@ -1047,6 +1053,7 @@ github.com/spf13/cobra v0.0.2-0.20171109065643-2da4a54c5cee/go.mod h1:1l0Ry5zgKv github.com/spf13/cobra v0.0.3/go.mod h1:1l0Ry5zgKvJasoi3XT1TypsSe7PqH0Sj9dhYf7v3XqQ= github.com/spf13/cobra v1.0.0/go.mod h1:/6GTrnGXV9HjY+aR4k0oJ5tcvakLuG6EuKReYlHNrgE= github.com/spf13/cobra v1.1.3/go.mod h1:pGADOWyqRD/YMrPZigI/zbliZ2wVD/23d+is3pSWzOo= +github.com/spf13/cobra v1.4.0/go.mod h1:Wo4iy3BUC+X2Fybo0PDqwJIv3dNRiZLHQymsfxlB84g= github.com/spf13/cobra v1.5.0 h1:X+jTBEBqF0bHN+9cSMgmfuvv2VHJ9ezmFNf9Y/XstYU= github.com/spf13/cobra v1.5.0/go.mod h1:dWXEIy2H428czQCjInthrTRUg7yKbok+2Qi/yBIJoUM= github.com/spf13/jwalterweatherman v1.0.0/go.mod h1:cQK4TGJAtQXfYWX+Ddv3mKDzgVb68N+wFjFa4jdeBTo= @@ -1079,8 +1086,11 @@ github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635/go.mod h1:hkRG github.com/tchap/go-patricia v2.2.6+incompatible/go.mod h1:bmLyhP68RS6kStMGxByiQ23RP/odRBOTVjwp2cDyi6I= github.com/tdewolff/parse v2.3.4+incompatible h1:x05/cnGwIMf4ceLuDMBOdQ1qGniMoxpP46ghf0Qzh38= github.com/tdewolff/parse v2.3.4+incompatible/go.mod h1:8oBwCsVmUkgHO8M5iCzSIDtpzXOT0WXX9cWhz+bIzJQ= -github.com/tdewolff/test v1.0.0 h1:jOwzqCXr5ePXEPGJaq2ivoR6HOCi+D5TPfpoyg8yvmU= +github.com/tdewolff/parse/v2 v2.5.27 h1:PL3LzzXaOpmdrknnOlIeO2muIBHAwiKp6TxN1RbU5gI= +github.com/tdewolff/parse/v2 v2.5.27/go.mod h1:WzaJpRSbwq++EIQHYIRTpbYKNA3gn9it1Ik++q4zyho= github.com/tdewolff/test v1.0.0/go.mod h1:DiQUlutnqlEvdvhSn2LPGy4TFwRauAaYDsL+683RNX4= +github.com/tdewolff/test v1.0.6 h1:76mzYJQ83Op284kMT+63iCNCI7NEERsIN8dLM+RiKr4= +github.com/tdewolff/test v1.0.6/go.mod h1:6DAvZliBAAnD7rhVgwaM7DE5/d9NMOAJ09SqYqeK4QE= github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk= github.com/tmc/grpc-websocket-proxy v0.0.0-20170815181823-89b8d40f7ca8/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= @@ -1357,6 +1367,7 @@ golang.org/x/sync v0.0.0-20200317015054-43a5402ce75a/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20210220032951-036812b2e83c h1:5KslGYwFpkhGh+Q16bwMP3cOontH8FOep7tGV86Y7SQ= golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20180224232135-f6cff0780e54/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= diff --git a/internal/core/core.go b/internal/core/core.go index a75f3ec49..9c545d09e 100644 --- a/internal/core/core.go +++ b/internal/core/core.go @@ -1,3 +1,3 @@ package core -const userAgent = "Shiori/2.0.0 (+https://github.com/go-shiori/shiori)" +const userAgent = "Shiori (+https://github.com/go-shiori/shiori)" diff --git a/internal/core/processing.go b/internal/core/processing.go index c7bdaf3b1..e56113d64 100644 --- a/internal/core/processing.go +++ b/internal/core/processing.go @@ -2,6 +2,8 @@ package core import ( "bytes" + "compress/gzip" + "context" "fmt" "image" "image/color" @@ -18,8 +20,8 @@ import ( "github.com/disintegration/imaging" "github.com/go-shiori/go-readability" + "github.com/go-shiori/obelisk" "github.com/go-shiori/shiori/internal/model" - "github.com/go-shiori/warc" // Add support for png _ "image/png" @@ -130,17 +132,32 @@ func ProcessBookmark(req ProcessRequest) (book model.Bookmark, isFatalErr bool, archivePath := fp.Join(req.DataDir, "archive", fmt.Sprintf("%d", book.ID)) os.Remove(archivePath) - archivalRequest := warc.ArchivalRequest{ - URL: book.URL, - Reader: archivalInput, - ContentType: contentType, - UserAgent: userAgent, - LogEnabled: req.LogArchival, + req := obelisk.Request{ + URL: book.URL, } - err = warc.NewArchive(archivalRequest, archivePath) + arc := obelisk.Archiver{ + UserAgent: userAgent, + } + arc.Validate() + + result, _, err := arc.Archive(context.Background(), req) if err != nil { - return book, false, fmt.Errorf("failed to create archive: %v", err) + return book, false, fmt.Errorf("failed to archive: %s", err) + } + + // Destination + f, err := os.Create(archivePath) + if err != nil { + return book, false, fmt.Errorf("failed to create archive file: %s", err) + } + defer f.Close() + + // Compress + gz := gzip.NewWriter(f) + defer gz.Close() + if _, err := gz.Write(result); err != nil { + return book, false, fmt.Errorf("failed to write archive file: %s", err) } book.HasArchive = true diff --git a/internal/webserver/handler-ui.go b/internal/webserver/handler-ui.go index 4d4eb25d4..42bf4f339 100644 --- a/internal/webserver/handler-ui.go +++ b/internal/webserver/handler-ui.go @@ -14,7 +14,6 @@ import ( "strings" "github.com/PuerkitoBio/goquery" - "github.com/go-shiori/warc" "github.com/julienschmidt/httprouter" "github.com/go-shiori/shiori/internal/model" @@ -115,72 +114,72 @@ func (h *handler) serveBookmarkContent(w http.ResponseWriter, r *http.Request, p } // Check if it has archive. - archivePath := fp.Join(h.DataDir, "archive", strID) - if fileExists(archivePath) { - bookmark.HasArchive = true - - // Open archive, look in cache first - var archive *warc.Archive - cacheData, found := h.ArchiveCache.Get(strID) - - if found { - archive = cacheData.(*warc.Archive) - } else { - archivePath := fp.Join(h.DataDir, "archive", strID) - archive, err = warc.Open(archivePath) - checkError(err) - - h.ArchiveCache.Set(strID, archive, 0) - } - - // Find all image and convert its source to use the archive URL. - createArchivalURL := func(archivalName string) string { - archivalURL := *r.URL - archivalURL.Path = path.Join(h.RootPath, "bookmark", strID, "archive", archivalName) - return archivalURL.String() - } - - buffer := strings.NewReader(bookmark.HTML) - doc, err := goquery.NewDocumentFromReader(buffer) - checkError(err) - - doc.Find("img, picture, figure, source").Each(func(_ int, node *goquery.Selection) { - // Get the needed attributes - src, _ := node.Attr("src") - strSrcSets, _ := node.Attr("srcset") - - // Convert `src` attributes - if src != "" { - archivalName := getArchivalName(src) - if archivalName != "" && archive.HasResource(archivalName) { - node.SetAttr("src", createArchivalURL(archivalName)) - } - } - - // Split srcset by comma, then process it like any URLs - srcSets := strings.Split(strSrcSets, ",") - for i, srcSet := range srcSets { - srcSet = strings.TrimSpace(srcSet) - parts := strings.SplitN(srcSet, " ", 2) - if parts[0] == "" { - continue - } - - archivalName := getArchivalName(parts[0]) - if archivalName != "" && archive.HasResource(archivalName) { - archivalURL := createArchivalURL(archivalName) - srcSets[i] = strings.Replace(srcSets[i], parts[0], archivalURL, 1) - } - } - - if len(srcSets) > 0 { - node.SetAttr("srcset", strings.Join(srcSets, ",")) - } - }) - - bookmark.HTML, err = goquery.OuterHtml(doc.Selection) - checkError(err) - } + // archivePath := fp.Join(h.DataDir, "archive", strID) + // if fileExists(archivePath) { + // bookmark.HasArchive = true + + // // Open archive, look in cache first + // var archive *warc.Archive + // cacheData, found := h.ArchiveCache.Get(strID) + + // if found { + // archive = cacheData.(*warc.Archive) + // } else { + // archivePath := fp.Join(h.DataDir, "archive", strID) + // archive, err = warc.Open(archivePath) + // checkError(err) + + // h.ArchiveCache.Set(strID, archive, 0) + // } + + // // Find all image and convert its source to use the archive URL. + // createArchivalURL := func(archivalName string) string { + // archivalURL := *r.URL + // archivalURL.Path = path.Join(h.RootPath, "bookmark", strID, "archive", archivalName) + // return archivalURL.String() + // } + + // buffer := strings.NewReader(bookmark.HTML) + // doc, err := goquery.NewDocumentFromReader(buffer) + // checkError(err) + + // doc.Find("img, picture, figure, source").Each(func(_ int, node *goquery.Selection) { + // // Get the needed attributes + // src, _ := node.Attr("src") + // strSrcSets, _ := node.Attr("srcset") + + // // Convert `src` attributes + // if src != "" { + // archivalName := getArchivalName(src) + // if archivalName != "" && archive.HasResource(archivalName) { + // node.SetAttr("src", createArchivalURL(archivalName)) + // } + // } + + // // Split srcset by comma, then process it like any URLs + // srcSets := strings.Split(strSrcSets, ",") + // for i, srcSet := range srcSets { + // srcSet = strings.TrimSpace(srcSet) + // parts := strings.SplitN(srcSet, " ", 2) + // if parts[0] == "" { + // continue + // } + + // archivalName := getArchivalName(parts[0]) + // if archivalName != "" && archive.HasResource(archivalName) { + // archivalURL := createArchivalURL(archivalName) + // srcSets[i] = strings.Replace(srcSets[i], parts[0], archivalURL, 1) + // } + // } + + // if len(srcSets) > 0 { + // node.SetAttr("srcset", strings.Join(srcSets, ",")) + // } + // }) + + // bookmark.HTML, err = goquery.OuterHtml(doc.Selection) + // checkError(err) + // } // Execute template if developmentMode { @@ -237,8 +236,6 @@ func (h *handler) serveThumbnailImage(w http.ResponseWriter, r *http.Request, ps func (h *handler) serveBookmarkArchive(w http.ResponseWriter, r *http.Request, ps httprouter.Params) { // Get parameter from URL strID := ps.ByName("id") - resourcePath := ps.ByName("filepath") - resourcePath = strings.TrimPrefix(resourcePath, "/") // Get bookmark from database id, err := strconv.Atoi(strID) @@ -260,29 +257,27 @@ func (h *handler) serveBookmarkArchive(w http.ResponseWriter, r *http.Request, p } } - // Open archive, look in cache first - var archive *warc.Archive - cacheData, found := h.ArchiveCache.Get(strID) + resourcePath := fp.Join(h.DataDir, "archive", strID) - if found { - archive = cacheData.(*warc.Archive) - } else { - archivePath := fp.Join(h.DataDir, "archive", strID) - archive, err = warc.Open(archivePath) - checkError(err) + archive, err := os.Open(resourcePath) + checkError(err) - h.ArchiveCache.Set(strID, archive, 0) - } + // reader, err := gzip.NewReader(archive) + // checkError(err) - content, contentType, err := archive.Read(resourcePath) + content, err := io.ReadAll(archive) checkError(err) + // TODO: cache layer + + contentType := "text/html" + // Set response header w.Header().Set("Content-Encoding", "gzip") w.Header().Set("Content-Type", contentType) // If this is HTML and root, inject shiori header - if strings.Contains(strings.ToLower(contentType), "text/html") && resourcePath == "" { + if strings.Contains(strings.ToLower(contentType), "text/html") { // Extract gzip buffer := bytes.NewBuffer(content) gzipReader, err := gzip.NewReader(buffer)