
Merge pull request #9 from rusq/scraping-fix
Scraping fix
rusq authored Apr 21, 2024
2 parents 418f385 + adffd0d commit 3d0f7fa
Showing 10 changed files with 901 additions and 1,428 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/go.yml
@@ -19,7 +19,7 @@ jobs:
- name: Set up Go
uses: actions/setup-go@v3
with:
go-version: 1.19
go-version: 1.22

- name: Build
run: go build -v ./...
6 changes: 3 additions & 3 deletions Dockerfile
@@ -1,15 +1,15 @@
FROM golang:1.20-alpine AS builder
FROM golang:1.22-alpine AS builder
LABEL maintainer="github:@rusq"

WORKDIR /build
COPY . .

RUN go build -ldflags="-s -w" ./cmd/aklapi

FROM alpine:3.17
FROM alpine:3.19.1
LABEL maintainer="github:@rusq"

RUN apk add --no-cache ca-certificates && apk --no-cache add tzdata
RUN apk add --no-cache ca-certificates


WORKDIR /app
17 changes: 13 additions & 4 deletions cmd/aklapi/handlers.go
@@ -5,11 +5,12 @@ import (
"errors"
"log"
"net/http"
"time"

"github.com/rusq/aklapi"
)

const dateFmt = "2006-01-02"
const dttmLayout = "2006-01-02"

type rrResponse struct {
Rubbish string `json:"rubbish,omitempty"`
@@ -59,14 +60,22 @@ func rrHandler(w http.ResponseWriter, r *http.Request) {
return
}
resp := rrResponse{
Recycle: res.NextRecycle().Format(dateFmt),
Rubbish: res.NextRubbish().Format(dateFmt),
FoodScraps: res.NextFoodScraps().Format(dateFmt),
Recycle: timefmt(res.NextRecycle()),
Rubbish: timefmt(res.NextRubbish()),
FoodScraps: timefmt(res.NextFoodScraps()),
Address: res.Address.Address,
}
respond(w, resp, http.StatusOK)
}

// timefmt formats the time skipping empty time.
func timefmt(t time.Time) string {
if t.IsZero() {
return ""
}
return t.Format(dttmLayout)
}

func rrExtHandler(w http.ResponseWriter, r *http.Request) {
res, err := rubbish(r)
if err != nil {
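For illustration only, not part of the commit: a minimal sketch of how the new timefmt helper behaves. A zero time.Time now renders as an empty string rather than the zero-value date, so, given the ,omitempty JSON tags on rrResponse, a missing collection day is presumably dropped from the response instead of appearing as "0001-01-01".

```go
package main

import (
	"fmt"
	"time"
)

// Same layout constant as in cmd/aklapi/handlers.go.
const dttmLayout = "2006-01-02"

// timefmt mirrors the helper added above: format the time, skipping the zero value.
func timefmt(t time.Time) string {
	if t.IsZero() {
		return ""
	}
	return t.Format(dttmLayout)
}

func main() {
	fmt.Printf("%q\n", timefmt(time.Date(2024, 4, 21, 0, 0, 0, 0, time.UTC))) // "2024-04-21"
	fmt.Printf("%q\n", timefmt(time.Time{}))                                  // "" (omitted by ,omitempty)
}
```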
2 changes: 2 additions & 0 deletions cmd/aklapi/main.go
@@ -7,6 +7,7 @@ import (
"log"
"net/http"
"os"
_ "time/tzdata"

"github.com/rusq/aklapi"
"github.com/rusq/osenv/v2"
@@ -41,6 +42,7 @@
var tmpl = template.Must(template.New("index.html").Parse(rootHTML))

func main() {
log.SetFlags(log.LstdFlags | log.Lmicroseconds)
flag.Parse()
if *port == "" {
log.Printf("no port specified, defaulting to %s", defaultPort)
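A note on the blank import: _ "time/tzdata" (available since Go 1.15) embeds the IANA timezone database into the binary, which is presumably why the Dockerfile above no longer installs the Alpine tzdata package. A minimal sketch, assuming the service needs a zone such as Pacific/Auckland (the zone name is illustrative, not taken from this commit):

```go
package main

import (
	"fmt"
	"log"
	"time"
	_ "time/tzdata" // embed the IANA tz database; no OS tzdata package required
)

func main() {
	// Works even in a bare Alpine or scratch image, because the zone data is compiled in.
	loc, err := time.LoadLocation("Pacific/Auckland") // illustrative zone name
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(time.Now().In(loc))
}
```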
6 changes: 3 additions & 3 deletions go.mod
@@ -1,9 +1,9 @@
module github.com/rusq/aklapi

go 1.19
go 1.22

require (
github.com/PuerkitoBio/goquery v1.8.1
github.com/PuerkitoBio/goquery v1.9.1
github.com/rusq/osenv/v2 v2.0.1
github.com/stretchr/testify v1.8.1
)
@@ -12,6 +12,6 @@ require (
github.com/andybalholm/cascadia v1.3.2 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
golang.org/x/net v0.19.0 // indirect
golang.org/x/net v0.24.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
6 changes: 4 additions & 2 deletions go.sum
@@ -1,6 +1,7 @@
github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM=
github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ=
github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
github.com/PuerkitoBio/goquery v1.9.1 h1:mTL6XjbJTZdpfL+Gwl5U2h1l9yEkJjhmlTeV9VPW7UI=
github.com/PuerkitoBio/goquery v1.9.1/go.mod h1:cW1n6TmIMDoORQU5IU/P1T3tGFunOeXEpGP2WHRwkbY=
github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
@@ -28,11 +29,12 @@ golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.7.0 h1:rJrUqqhjsgNp7KqAIc25s9pZnjU7TUcSY7HcVZjdn1g=
golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
golang.org/x/net v0.19.0 h1:zTwKpTd2XuCqf8huc7Fo2iSy+4RHPd10s4KzeTnVr1c=
golang.org/x/net v0.19.0/go.mod h1:CfAk/cbD4CthTvqiEl8NpboMuiuOYsAr/7NOjZJtv1U=
golang.org/x/net v0.24.0 h1:1PcaxkF854Fu3+lvBIx5SYn9wRlBzzcnHZSiaFFAb0w=
golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
1,400 changes: 13 additions & 1,387 deletions mocks_test.go

Large diffs are not rendered by default.

22 changes: 11 additions & 11 deletions rubbish.go
@@ -125,15 +125,15 @@ type refuseParser struct {

// Parse parses the auckland council rubbish webpage.
func (p *refuseParser) parse(r io.Reader) ([]RubbishCollection, error) {
const datesSection = "#ctl00_SPWebPartManager1_g_dfe289d2_6a8a_414d_a384_fc25a0db9a6d_ctl00_pnlHouseholdBlock"
p.detail = make([]RubbishCollection, 2)
const datesSection = "#ctl00_SPWebPartManager1_g_dfe289d2_6a8a_414d_a384_fc25a0db9a6d_ctl00_pnlHouseholdBlock2"
p.detail = make([]RubbishCollection, 3)
doc, err := goquery.NewDocumentFromReader(r)
if err != nil {
return nil, err
}
_ = doc.Find(datesSection).
Children().
Slice(1, 3).
Slice(1, 4).
Each(p.parseLinks) // p.parseLinks populates p.detail
for i := range p.detail {
if err := (&p.detail[i]).parseDate(); err != nil {
@@ -150,17 +150,11 @@ func (p *refuseParser) parse(r io.Reader) ([]RubbishCollection, error) {
return p.detail, p.Err
}

// parseLinks parses the links within selection
// parseLinks parses the links within selection and populates p.detail.
func (p *refuseParser) parseLinks(el int, sel *goquery.Selection) {
sel.Children().Each(func(n int, sel *goquery.Selection) {
sel.Children().Children().Each(func(n int, sel *goquery.Selection) {
switch n {
case 0:
if dow.FindString(sel.Text()) == "" {
log.Println("unable to detect day of week")
return
}
p.detail[el].Day = sel.Text()
default:
if sel.Text() == "Rubbish" {
p.detail[el].Rubbish = true
} else if sel.Text() == "Food scraps" {
@@ -170,6 +164,12 @@ func (p *refuseParser) parseLinks(el int, sel *goquery.Selection) {
} else {
p.Err = fmt.Errorf("parse error: sel.Text = %q, el = %d, n = %d", sel.Text(), el, n)
}
default:
if dow.FindString(sel.Text()) == "" {
log.Println("unable to detect day of week")
return
}
p.detail[el].Day = sel.Text()
}
})
}
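The reworked parser allocates three RubbishCollection entries and walks Children().Slice(1, 4), i.e. the panel's second through fourth children, one per collection row. A small, self-contained illustration of that goquery pattern; the HTML below is made up and far simpler than the council page:

```go
package main

import (
	"fmt"
	"log"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

const page = `<div id="panel">
  <h2>Collection days</h2>
  <div>Rubbish</div>
  <div>Food scraps</div>
  <div>Recycle</div>
</div>`

func main() {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(page))
	if err != nil {
		log.Fatal(err)
	}
	// Slice(1, 4) skips the panel's first child and visits children 1 through 3,
	// the same shape as the three collection rows parsed in rubbish.go.
	doc.Find("#panel").Children().Slice(1, 4).Each(func(i int, sel *goquery.Selection) {
		fmt.Printf("row %d: %s\n", i, strings.TrimSpace(sel.Text()))
	})
}
```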
42 changes: 25 additions & 17 deletions rubbish_test.go
@@ -27,16 +27,16 @@ func Test_parse(t *testing.T) {
args{strings.NewReader(testHTML)},
&CollectionDayDetailResult{
Collections: []RubbishCollection{
{Day: "Tuesday 11 February",
Date: adjustYear(time.Date(0, 02, 11, 0, 0, 0, 0, defaultLoc)),
Rubbish: true,
Recycle: true,
FoodScraps: false},
{Day: "Tuesday 18 February",
Date: adjustYear(time.Date(0, 02, 18, 0, 0, 0, 0, defaultLoc)),
{Day: "Sunday 21 April",
Date: adjustYear(time.Date(0, 04, 21, 0, 0, 0, 0, defaultLoc)),
Rubbish: true,
Recycle: false,
FoodScraps: false},
{Day: "Sunday 21 April",
Date: adjustYear(time.Date(0, 04, 21, 0, 0, 0, 0, defaultLoc)),
Rubbish: false,
Recycle: true,
FoodScraps: false},
},
Address: nil,
},
@@ -45,10 +45,18 @@
args{strings.NewReader(testHTMLcommercial)},
&CollectionDayDetailResult{
Collections: []RubbishCollection{
{Day: "Monday 24 February",
Date: adjustYear(time.Date(0, 02, 24, 0, 0, 0, 0, defaultLoc)),
{
Day: "Sunday 21 April",
Date: adjustYear(time.Date(0, 04, 21, 0, 0, 0, 0, defaultLoc)),
Rubbish: true,
Recycle: true},
Recycle: false,
},
{
Day: "Sunday 21 April",
Date: adjustYear(time.Date(0, 04, 21, 0, 0, 0, 0, defaultLoc)),
Rubbish: false,
Recycle: true,
},
},
Address: nil,
},
@@ -201,17 +209,17 @@ func TestCollectionDayDetail(t *testing.T) {
&CollectionDayDetailResult{
Collections: []RubbishCollection{
{
Day: "Tuesday 11 February",
Date: adjustYear(time.Date(0, 2, 11, 0, 0, 0, 0, defaultLoc)),
Day: "Sunday 21 April",
Date: adjustYear(time.Date(0, 4, 21, 0, 0, 0, 0, defaultLoc)),
Rubbish: true,
Recycle: true,
Recycle: false,
FoodScraps: false,
},
{
Day: "Tuesday 18 February",
Date: adjustYear(time.Date(0, 2, 18, 0, 0, 0, 0, defaultLoc)),
Rubbish: true,
Recycle: false,
Day: "Sunday 21 April",
Date: adjustYear(time.Date(0, 4, 21, 0, 0, 0, 0, defaultLoc)),
Rubbish: false,
Recycle: true,
FoodScraps: false,
},
},