Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Scraping fix #9

Merged
merged 7 commits into from
Apr 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/go.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:
- name: Set up Go
uses: actions/setup-go@v3
with:
go-version: 1.19
go-version: 1.22

- name: Build
run: go build -v ./...
Expand Down
6 changes: 3 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
FROM golang:1.20-alpine AS builder
FROM golang:1.22-alpine AS builder
LABEL maintainer="github:@rusq"

WORKDIR /build
COPY . .

RUN go build -ldflags="-s -w" ./cmd/aklapi

FROM alpine:3.17
FROM alpine:3.19.1
LABEL maintainer="github:@rusq"

RUN apk add --no-cache ca-certificates && apk --no-cache add tzdata
RUN apk add --no-cache ca-certificates


WORKDIR /app
Expand Down
17 changes: 13 additions & 4 deletions cmd/aklapi/handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@ import (
"errors"
"log"
"net/http"
"time"

"github.com/rusq/aklapi"
)

const dateFmt = "2006-01-02"
const dttmLayout = "2006-01-02"

type rrResponse struct {
Rubbish string `json:"rubbish,omitempty"`
Expand Down Expand Up @@ -59,14 +60,22 @@ func rrHandler(w http.ResponseWriter, r *http.Request) {
return
}
resp := rrResponse{
Recycle: res.NextRecycle().Format(dateFmt),
Rubbish: res.NextRubbish().Format(dateFmt),
FoodScraps: res.NextFoodScraps().Format(dateFmt),
Recycle: timefmt(res.NextRecycle()),
Rubbish: timefmt(res.NextRubbish()),
FoodScraps: timefmt(res.NextFoodScraps()),
Address: res.Address.Address,
}
respond(w, resp, http.StatusOK)
}

// timefmt renders t using dttmLayout. A zero-valued time is rendered as an
// empty string rather than as a formatted date.
func timefmt(t time.Time) string {
	var out string
	if !t.IsZero() {
		out = t.Format(dttmLayout)
	}
	return out
}

func rrExtHandler(w http.ResponseWriter, r *http.Request) {
res, err := rubbish(r)
if err != nil {
Expand Down
2 changes: 2 additions & 0 deletions cmd/aklapi/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"log"
"net/http"
"os"
_ "time/tzdata"

"github.com/rusq/aklapi"
"github.com/rusq/osenv/v2"
Expand Down Expand Up @@ -41,6 +42,7 @@ var (
var tmpl = template.Must(template.New("index.html").Parse(rootHTML))

func main() {
log.SetFlags(log.LstdFlags | log.Lmicroseconds)
flag.Parse()
if *port == "" {
log.Printf("no port specified, defaulting to %s", defaultPort)
Expand Down
6 changes: 3 additions & 3 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
module github.com/rusq/aklapi

go 1.19
go 1.22

require (
github.com/PuerkitoBio/goquery v1.8.1
github.com/PuerkitoBio/goquery v1.9.1
github.com/rusq/osenv/v2 v2.0.1
github.com/stretchr/testify v1.8.1
)
Expand All @@ -12,6 +12,6 @@ require (
github.com/andybalholm/cascadia v1.3.2 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
golang.org/x/net v0.19.0 // indirect
golang.org/x/net v0.24.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
6 changes: 4 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM=
github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ=
github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
github.com/PuerkitoBio/goquery v1.9.1 h1:mTL6XjbJTZdpfL+Gwl5U2h1l9yEkJjhmlTeV9VPW7UI=
github.com/PuerkitoBio/goquery v1.9.1/go.mod h1:cW1n6TmIMDoORQU5IU/P1T3tGFunOeXEpGP2WHRwkbY=
github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
Expand Down Expand Up @@ -28,11 +29,12 @@ golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.7.0 h1:rJrUqqhjsgNp7KqAIc25s9pZnjU7TUcSY7HcVZjdn1g=
golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
golang.org/x/net v0.19.0 h1:zTwKpTd2XuCqf8huc7Fo2iSy+4RHPd10s4KzeTnVr1c=
golang.org/x/net v0.19.0/go.mod h1:CfAk/cbD4CthTvqiEl8NpboMuiuOYsAr/7NOjZJtv1U=
golang.org/x/net v0.24.0 h1:1PcaxkF854Fu3+lvBIx5SYn9wRlBzzcnHZSiaFFAb0w=
golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
Expand Down
1,400 changes: 13 additions & 1,387 deletions mocks_test.go

Large diffs are not rendered by default.

22 changes: 11 additions & 11 deletions rubbish.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,15 +125,15 @@ type refuseParser struct {

// parse parses the Auckland Council rubbish webpage.
func (p *refuseParser) parse(r io.Reader) ([]RubbishCollection, error) {
const datesSection = "#ctl00_SPWebPartManager1_g_dfe289d2_6a8a_414d_a384_fc25a0db9a6d_ctl00_pnlHouseholdBlock"
p.detail = make([]RubbishCollection, 2)
const datesSection = "#ctl00_SPWebPartManager1_g_dfe289d2_6a8a_414d_a384_fc25a0db9a6d_ctl00_pnlHouseholdBlock2"
p.detail = make([]RubbishCollection, 3)
doc, err := goquery.NewDocumentFromReader(r)
if err != nil {
return nil, err
}
_ = doc.Find(datesSection).
Children().
Slice(1, 3).
Slice(1, 4).
Each(p.parseLinks) // p.parseLinks populates p.detail
for i := range p.detail {
if err := (&p.detail[i]).parseDate(); err != nil {
Expand All @@ -150,17 +150,11 @@ func (p *refuseParser) parse(r io.Reader) ([]RubbishCollection, error) {
return p.detail, p.Err
}

// parseLinks parses the links within selection
// parseLinks parses the links within selection and populates p.detail.
func (p *refuseParser) parseLinks(el int, sel *goquery.Selection) {
sel.Children().Each(func(n int, sel *goquery.Selection) {
sel.Children().Children().Each(func(n int, sel *goquery.Selection) {
switch n {
case 0:
if dow.FindString(sel.Text()) == "" {
log.Println("unable to detect day of week")
return
}
p.detail[el].Day = sel.Text()
default:
if sel.Text() == "Rubbish" {
p.detail[el].Rubbish = true
} else if sel.Text() == "Food scraps" {
Expand All @@ -170,6 +164,12 @@ func (p *refuseParser) parseLinks(el int, sel *goquery.Selection) {
} else {
p.Err = fmt.Errorf("parse error: sel.Text = %q, el = %d, n = %d", sel.Text(), el, n)
}
default:
if dow.FindString(sel.Text()) == "" {
log.Println("unable to detect day of week")
return
}
p.detail[el].Day = sel.Text()
}
})
}
Expand Down
42 changes: 25 additions & 17 deletions rubbish_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,16 +27,16 @@ func Test_parse(t *testing.T) {
args{strings.NewReader(testHTML)},
&CollectionDayDetailResult{
Collections: []RubbishCollection{
{Day: "Tuesday 11 February",
Date: adjustYear(time.Date(0, 02, 11, 0, 0, 0, 0, defaultLoc)),
Rubbish: true,
Recycle: true,
FoodScraps: false},
{Day: "Tuesday 18 February",
Date: adjustYear(time.Date(0, 02, 18, 0, 0, 0, 0, defaultLoc)),
{Day: "Sunday 21 April",
Date: adjustYear(time.Date(0, 04, 21, 0, 0, 0, 0, defaultLoc)),
Rubbish: true,
Recycle: false,
FoodScraps: false},
{Day: "Sunday 21 April",
Date: adjustYear(time.Date(0, 04, 21, 0, 0, 0, 0, defaultLoc)),
Rubbish: false,
Recycle: true,
FoodScraps: false},
},
Address: nil,
},
Expand All @@ -45,10 +45,18 @@ func Test_parse(t *testing.T) {
args{strings.NewReader(testHTMLcommercial)},
&CollectionDayDetailResult{
Collections: []RubbishCollection{
{Day: "Monday 24 February",
Date: adjustYear(time.Date(0, 02, 24, 0, 0, 0, 0, defaultLoc)),
{
Day: "Sunday 21 April",
Date: adjustYear(time.Date(0, 04, 21, 0, 0, 0, 0, defaultLoc)),
Rubbish: true,
Recycle: true},
Recycle: false,
},
{
Day: "Sunday 21 April",
Date: adjustYear(time.Date(0, 04, 21, 0, 0, 0, 0, defaultLoc)),
Rubbish: false,
Recycle: true,
},
},
Address: nil,
},
Expand Down Expand Up @@ -201,17 +209,17 @@ func TestCollectionDayDetail(t *testing.T) {
&CollectionDayDetailResult{
Collections: []RubbishCollection{
{
Day: "Tuesday 11 February",
Date: adjustYear(time.Date(0, 2, 11, 0, 0, 0, 0, defaultLoc)),
Day: "Sunday 21 April",
Date: adjustYear(time.Date(0, 4, 21, 0, 0, 0, 0, defaultLoc)),
Rubbish: true,
Recycle: true,
Recycle: false,
FoodScraps: false,
},
{
Day: "Tuesday 18 February",
Date: adjustYear(time.Date(0, 2, 18, 0, 0, 0, 0, defaultLoc)),
Rubbish: true,
Recycle: false,
Day: "Sunday 21 April",
Date: adjustYear(time.Date(0, 4, 21, 0, 0, 0, 0, defaultLoc)),
Rubbish: false,
Recycle: true,
FoodScraps: false,
},
},
Expand Down
Loading
Loading