Port builds.sh to Go (huge speed increase) #5
Changes from all commits: 52c9fdd, e21336a, d307f68, 281698f
@@ -1 +1,2 @@
builds
tar-scrubber
@@ -0,0 +1,41 @@
#!/usr/bin/env bash
set -Eeuo pipefail

dir="$(dirname "$BASH_SOURCE")"
dir="$(readlink -ve "$dir")"

user="$(id -u):$(id -g)"
args=(
	--interactive --rm --init
	--user "$user"
	--mount "type=bind,src=$dir,dst=/app"
	--workdir /app
	--tmpfs /tmp,exec
	--env HOME=/tmp

	# "go mod" cache is stored in /go/pkg/mod/cache
	--env GOPATH=/go
	--mount type=volume,src=doi-meta-gopath,dst=/go
	--env GOCACHE=/go/.cache

	--env "CGO_ENABLED=${CGO_ENABLED-0}"
	--env "GOTOOLCHAIN=${GOTOOLCHAIN-local}"
	--env GOFLAGS
	--env GOOS --env GOARCH
	--env GO386
	--env GOAMD64
	--env GOARM
)
if [ -t 0 ] && [ -t 1 ]; then
	args+=( --tty )
fi
go="$(awk '$1 == "go" { print $2; exit }' "$dir/go.mod")"
if [[ "$go" == *.*.* ]]; then
	go="${go%.*}" # strip to just X.Y
fi
args+=(
	"golang:$go"
	"$@"
)
set -x
exec docker run "${args[@]}"
@@ -0,0 +1,293 @@
package main

import (
	"context"
	"crypto/sha256"
	"encoding/json"
	"fmt"
	"io"
	"os"
	"os/exec"
	"strings"
	"sync"

	"github.com/docker-library/meta-scripts/om"

	c8derrdefs "github.com/containerd/containerd/errdefs"
	"github.com/docker-library/bashbrew/registry"
	ocispec "github.com/opencontainers/image-spec/specs-go/v1"
	"github.com/sirupsen/logrus" // this is used by containerd libraries, so we need to set the default log level for it
)

var concurrency = 50

type MetaSource struct {
	SourceID string   `json:"sourceId"`
	AllTags  []string `json:"allTags"`
	Arches   map[string]struct {
		Parents om.OrderedMap[struct {
			SourceID *string `json:"sourceId"`
			Pin      *string `json:"pin"`
		}]
	}
}

type RemoteResolved struct {
	Ref  string             `json:"ref"`
	Desc ocispec.Descriptor `json:"desc"`
}

type RemoteResolvedFull struct {
	Manifest RemoteResolved  `json:"manifest"`
	Index    *RemoteResolved `json:"index,omitempty"`
}

type BuildIDParts struct {
	SourceID string                `json:"sourceId"`
	Arch     string                `json:"arch"`
	Parents  om.OrderedMap[string] `json:"parents"`
}

type MetaBuild struct {
	BuildID string `json:"buildId"`
	Build   struct {
		Img      string              `json:"img"`
		Resolved *RemoteResolvedFull `json:"resolved"`
		BuildIDParts
		ResolvedParents om.OrderedMap[RemoteResolvedFull] `json:"resolvedParents"`
	} `json:"build"`
	Source json.RawMessage `json:"source"`
}

var (
	// keys are image/tag names, values are functions that return either cacheResolveType or error
	cacheResolve = sync.Map{}
)

type cacheResolveType struct {
	r      *registry.ResolvedObject
	arches map[string][]registry.ResolvedObject
}

func resolveRemoteArch(ctx context.Context, img string, arch string) (*RemoteResolvedFull, error) {
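	// memoization note: LoadOrStore keyed by "img" plus sync.OnceValues means the registry
	// resolution below runs at most once per image/tag name, and concurrent callers for the
	// same name all share that single cached result (or cached error)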
	cacheFunc, _ := cacheResolve.LoadOrStore(img, sync.OnceValues(func() (*cacheResolveType, error) {
		var (
			ret = cacheResolveType{}
			err error
		)

		ret.r, err = registry.Resolve(ctx, img)
		if c8derrdefs.IsNotFound(err) {
			return nil, nil
		} else if err != nil {
			return nil, err
		}

		// TODO more efficient lookup of single architecture? (probably doesn't matter much, and then we have to have two independent caches)
		ret.arches, err = ret.r.Architectures(ctx)
		if err != nil {
			return nil, err
		}

		return &ret, nil
	}))
	cache, err := cacheFunc.(func() (*cacheResolveType, error))()
	if err != nil {
		return nil, err
	}
	if cache == nil {
		return nil, nil
	}
	r := cache.r
	rArches := cache.arches

	if _, ok := rArches[arch]; !ok {
		// TODO this should probably be just like a 404, right? (it's effectively a 404, even if it's not literally a 404)
		return nil, fmt.Errorf("%s missing %s arch", img, arch)
	}
	// TODO warn/error on multiple entries for arch? (would mean something like index/manifest list with multiple os.version values for Windows - we avoid this in DOI today, but we don't have any automated *checks* for it, so the current state is a little precarious)

	ref := func(obj *registry.ResolvedObject) string {
		base, _, _ := strings.Cut(obj.ImageRef, "@")
		base = strings.TrimPrefix(base, "docker.io/")
		base = strings.TrimPrefix(base, "library/")
		return base + "@" + string(obj.Desc.Digest)
	}
	resolved := &RemoteResolvedFull{
		Manifest: RemoteResolved{
			Ref:  ref(&rArches[arch][0]),
			Desc: rArches[arch][0].Desc,
		},
	}
	if r.IsImageIndex() {
		resolved.Index = &RemoteResolved{
			Ref:  ref(r),
			Desc: r.Desc,
		}
	}
	return resolved, nil
}

func main() {
	sourcesJsonFile := os.Args[1] // "sources.json"

	stagingTemplate := os.Getenv("BASHBREW_STAGING_TEMPLATE") // "oisupport/staging-ARCH:BUILD"
	if !strings.Contains(stagingTemplate, "BUILD") {
		panic("invalid BASHBREW_STAGING_TEMPLATE (missing BUILD)")
	}

	// containerd uses logrus, but it defaults to "info" (which is a bit leaky where we use containerd)
	logrus.SetLevel(logrus.WarnLevel)

	type out struct {
		buildId string
		json    []byte
	}
	outs := make(chan chan out, concurrency) // we want the end result to be "in order", so we have a channel of channels of outputs so each output can be generated async (and write to the "inner" channel) and the outer channel stays in the input order

	go func() {
		// Go does not have ordered maps *and* it is complicated to read an object, make a tiny modification, and write it back out (without modelling the entire schema), so we'll let a single invocation of jq solve both problems (munging the documents in the way we expect *and* giving us an in-order stream)
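		// e.g. (illustrative shape only): a source entry whose .arches has two keys, "amd64" and
		// "arm64v8", comes back out of this filter as two separate JSON documents, each identical
		// to the original except that .arches contains just the one architecture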
		jq := exec.Command("jq", "-c", ".[] | (.arches | to_entries[]) as $arch | .arches = { ($arch.key): $arch.value }", sourcesJsonFile)

Review comment: What's the issue with having the JSON sorted to begin with and then always outputting it sorted?

Reply: The order in […]. That also only solves the ordering problem, not the "read, modify in a small way, write back out" problem (we'd have to define a full schema for that sources object, which we'll need eventually if we port […]).
		jq.Stderr = os.Stderr

		stdout, err := jq.StdoutPipe()
		if err != nil {
			panic(err)
		}
		if err := jq.Start(); err != nil {
			panic(err)
		}

		sourceArchResolved := map[string](func() *RemoteResolvedFull){}
		sourceArchResolvedMutex := sync.RWMutex{}

		decoder := json.NewDecoder(stdout)
		for decoder.More() {
			var build MetaBuild

			if err := decoder.Decode(&build.Source); err == io.EOF {
				break
			} else if err != nil {
				panic(err)
			}

			var source MetaSource
			if err := json.Unmarshal(build.Source, &source); err != nil {
				panic(err)
			}

			build.Build.SourceID = source.SourceID

			if len(source.Arches) != 1 {
				panic("unexpected arches length: " + string(build.Source))
			}
			for build.Build.Arch = range source.Arches {
				// I really hate Go.
				// (just doing a lookup of the only key in my map into a variable)
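				// (hypothetical alternative, not used here: with golang.org/x/exp/maps this could be
				//   build.Build.Arch = maps.Keys(source.Arches)[0]
				// as discussed in the review thread below)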
Review comment: Perhaps soon, https://pkg.go.dev/golang.org/x/exp/maps#Keys

Reply: Indeed! I was angry that https://pkg.go.dev/maps didn't include that (yet), but perhaps there is still hope.

Reply: Ahh, those functions being included in stdlib is blocked on the "iterators" proposals (which are blocked on […]).
			}

			outChan := make(chan out, 1)
			outs <- outChan

			sourceArchResolvedFunc := sync.OnceValue(func() *RemoteResolvedFull {
				for _, from := range source.Arches[build.Build.Arch].Parents.Keys() {
					parent := source.Arches[build.Build.Arch].Parents.Get(from)
					if from == "scratch" {
						continue
					}
					var resolved *RemoteResolvedFull
					if parent.SourceID != nil {
						sourceArchResolvedMutex.RLock()
						resolvedFunc, ok := sourceArchResolved[*parent.SourceID+"-"+build.Build.Arch]
						if !ok {
							panic("parent of " + source.SourceID + " on " + build.Build.Arch + " should be " + *parent.SourceID + " but that sourceId is unknown to us!")
						}
						sourceArchResolvedMutex.RUnlock()
						resolved = resolvedFunc()
					} else {
						lookup := from
						if parent.Pin != nil {
							lookup += "@" + *parent.Pin
						}

						resolved, err = resolveRemoteArch(context.TODO(), lookup, build.Build.Arch)
						if err != nil {
							panic(err)
						}
					}
					if resolved == nil {
						fmt.Fprintf(os.Stderr, "%s (%s) -> not yet! [%s]\n", source.SourceID, source.AllTags[0], build.Build.Arch)
						close(outChan)
						return nil
					}
					build.Build.ResolvedParents.Set(from, *resolved)
					build.Build.Parents.Set(from, string(resolved.Manifest.Desc.Digest))
				}

				// buildId calculation
				buildIDJSON, err := json.Marshal(&build.Build.BuildIDParts)
				if err != nil {
					panic(err)
				}
				buildIDJSON = append(buildIDJSON, byte('\n')) // previous calculation of buildId included a newline in the JSON, so this preserves compatibility
				// TODO if we ever have a bigger "buildId break" event (like adding major base images that force the whole tree to rebuild), we should probably ditch this newline

				build.BuildID = fmt.Sprintf("%x", sha256.Sum256(buildIDJSON))
				fmt.Fprintf(os.Stderr, "%s (%s) -> %s [%s]\n", source.SourceID, source.AllTags[0], build.BuildID, build.Build.Arch)

				build.Build.Img = strings.ReplaceAll(strings.ReplaceAll(stagingTemplate, "BUILD", build.BuildID), "ARCH", build.Build.Arch) // "oisupport/staging-amd64:xxxx"

				build.Build.Resolved, err = resolveRemoteArch(context.TODO(), build.Build.Img, build.Build.Arch)
				if err != nil {
					panic(err)
				}

				json, err := json.Marshal(&build)
				if err != nil {
					panic(err)
				}
				outChan <- out{
					buildId: build.BuildID,
					json:    json,
				}

				return build.Build.Resolved
			})
			sourceArchResolvedMutex.Lock()
			sourceArchResolved[source.SourceID+"-"+build.Build.Arch] = sourceArchResolvedFunc
			sourceArchResolvedMutex.Unlock()
			go sourceArchResolvedFunc()
		}

		if err := stdout.Close(); err != nil {
			panic(err)
		}
		if err := jq.Wait(); err != nil {
			panic(err)
		}

		close(outs)
	}()

	fmt.Print("{")
	first := true
	for outChan := range outs {
		out, ok := <-outChan
		if !ok {
			continue
		}
		if !first {
			fmt.Print(",")
		} else {
			first = false
		}
		fmt.Println()
		buildId, err := json.Marshal(out.buildId)
		if err != nil {
			panic(err)
		}
		fmt.Printf("\t%s: %s", string(buildId), string(out.json))
	}
	fmt.Println()
	fmt.Println("}")
}
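The "in order" output trick above is worth calling out: main() hands each input its own one-slot inner channel (enqueued on the outer channel in input order), workers fill their slot whenever they finish, and the single consumer drains the inner channels in the order they were enqueued; a build that can't be resolved yet just closes its channel without sending and the consumer skips it. A minimal, self-contained sketch of that pattern (illustrative only -- doWork is a stand-in, not anything from this PR):

package main

import (
	"fmt"
	"strings"
	"time"
)

// doWork stands in for the expensive per-item work (registry resolution in the real tool);
// longer inputs take longer, so results deliberately finish out of order.
func doWork(s string) string {
	time.Sleep(time.Duration(len(s)) * 10 * time.Millisecond)
	return strings.ToUpper(s)
}

func main() {
	inputs := []string{"banana", "kiwi", "apple"}

	outs := make(chan chan string, len(inputs)) // outer channel: one inner channel per input, in input order
	go func() {
		for _, in := range inputs {
			inner := make(chan string, 1) // buffered so the worker never blocks on a slow consumer
			outs <- inner                 // reserve this input's position in the output immediately
			go func(in string) {
				inner <- doWork(in) // fill the reserved slot whenever the work finishes
			}(in)
		}
		close(outs)
	}()

	for inner := range outs {
		fmt.Println(<-inner) // prints BANANA, KIWI, APPLE: input order, regardless of completion order
	}
}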
Review comment (on "var concurrency = 50"): Why var and not const?

Reply: Just because I moved it up so it'd be more obvious and didn't give it much thought -- I ultimately wanted to make it a CLI flag, but changing it doesn't really change very much (it makes the whole thing slightly faster or slower, but still generally less than a minute, so it's still subject to / limited by the 429s, which are based on requests per minute). 😅
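If that CLI flag ever materializes, a minimal sketch (hypothetical -- this PR keeps the package-level var) could use the standard flag package with the same default:

package main

import (
	"flag"
	"fmt"
)

// hypothetical: expose the current default of 50 as "-concurrency" instead of a hard-coded var
var concurrency = flag.Int("concurrency", 50, "maximum number of in-flight registry lookups")

func main() {
	flag.Parse()
	fmt.Println("would run with concurrency =", *concurrency)
}

As the reply notes, tuning the value mostly trades a little wall-clock time against registry 429s, so the default matters more than the mechanism.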