Skip to content

Commit

Permalink
Add a bunch of tracing annotations to the firecracker code path (#8242)
Browse files Browse the repository at this point in the history
These helped me understand the code flow and find a bunch of places for
performance improvement. I was careful not to add annotations for
anything that will happen hundreds of times per action.
  • Loading branch information
vanja-p authored Jan 24, 2025
1 parent 976dea6 commit 74c58d1
Show file tree
Hide file tree
Showing 12 changed files with 110 additions and 27 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,8 @@ type FirecrackerContainer struct {
var _ container.VM = (*FirecrackerContainer)(nil)

func NewContainer(ctx context.Context, env environment.Env, task *repb.ExecutionTask, opts ContainerOpts) (*FirecrackerContainer, error) {
ctx, span := tracing.StartSpan(ctx)
defer span.End()
if *snaputil.EnableLocalSnapshotSharing && !(*enableVBD && *enableUFFD) {
return nil, status.FailedPreconditionError("executor configuration error: local snapshot sharing requires VBD and UFFD to be enabled")
}
Expand Down Expand Up @@ -838,6 +840,8 @@ func (c *FirecrackerContainer) pauseVM(ctx context.Context) error {
if c.machine == nil {
return status.InternalError("failed to pause VM: machine is not started")
}
ctx, span := tracing.StartSpan(ctx)
defer span.End()

if err := c.machine.PauseVM(ctx); err != nil {
log.CtxErrorf(ctx, "Error pausing VM: %s", err)
Expand Down Expand Up @@ -1202,6 +1206,8 @@ func (c *FirecrackerContainer) createWorkspaceImage(ctx context.Context, workspa
}

func (c *FirecrackerContainer) convertToCOW(ctx context.Context, filePath, chunkDir string) (*copy_on_write.COWStore, error) {
ctx, span := tracing.StartSpan(ctx)
defer span.End()
start := time.Now()
if err := os.Mkdir(chunkDir, 0755); err != nil {
return nil, status.WrapError(err, "make chunk dir")
Expand Down Expand Up @@ -2274,15 +2280,12 @@ func (c *FirecrackerContainer) remove(ctx context.Context) error {
}

if c.uffdHandler != nil {
if err := c.uffdHandler.Stop(); err != nil {
log.CtxErrorf(ctx, "Error stopping uffd handler: %s", err)
if err := c.stopUffdHandler(ctx); err != nil {
lastErr = err
}
c.uffdHandler = nil
}
if c.memoryStore != nil {
c.memoryStore.Close()
c.memoryStore = nil
c.closeMemoryStore(ctx)
}

exists, err := disk.FileExists(ctx, filepath.Join(c.actionWorkingDir, invalidateSnapshotMarkerFile))
Expand All @@ -2309,11 +2312,31 @@ func (c *FirecrackerContainer) remove(ctx context.Context) error {
return lastErr
}

func (c *FirecrackerContainer) closeMemoryStore(ctx context.Context) {
_, span := tracing.StartSpan(ctx)
defer span.End()
c.memoryStore.Close()
c.memoryStore = nil
}

func (c *FirecrackerContainer) stopUffdHandler(ctx context.Context) error {
_, span := tracing.StartSpan(ctx)
defer span.End()
if err := c.uffdHandler.Stop(); err != nil {
log.CtxErrorf(ctx, "Error stopping uffd handler: %s", err)
return err
}
c.uffdHandler = nil
return nil
}

// Unmounts any mounted VBD filesystems.
// If this func returns a nil error, then the VBD filesystems were successfully
// unmounted and the backing COWStores can no longer be accessed using
// VBD file handles.
func (c *FirecrackerContainer) unmountAllVBDs(ctx context.Context, logErrors bool) error {
ctx, span := tracing.StartSpan(ctx)
defer span.End()
var lastErr error
if c.scratchVBD != nil {
if err := c.scratchVBD.Unmount(ctx); err != nil {
Expand Down Expand Up @@ -2413,7 +2436,7 @@ func (c *FirecrackerContainer) pause(ctx context.Context) error {
}

// If an older snapshot is present -- nuke it since we're writing a new one.
if err = c.cleanupOldSnapshots(snapDetails); err != nil {
if err = c.cleanupOldSnapshots(ctx, snapDetails); err != nil {
return err
}

Expand Down Expand Up @@ -2486,20 +2509,24 @@ func (c *FirecrackerContainer) snapshotDetails(ctx context.Context) (*snapshotDe
}

func (c *FirecrackerContainer) createSnapshot(ctx context.Context, snapshotDetails *snapshotDetails) error {
ctx, span := tracing.StartSpan(ctx)
defer span.End()
machineStart := time.Now()
snapshotTypeOpt := func(params *operations.CreateSnapshotParams) {
params.Body.SnapshotType = snapshotDetails.snapshotType
}
if err := c.machine.CreateSnapshot(ctx, snapshotDetails.memSnapshotName, snapshotDetails.vmStateSnapshotName, snapshotTypeOpt); err != nil {
log.CtxErrorf(ctx, "Error creating snapshot: %s", err)
log.CtxErrorf(ctx, "Error creating %s snapshot after %v: %s", snapshotDetails.snapshotType, time.Since(machineStart), err)
return err
}

log.CtxDebugf(ctx, "VMM CreateSnapshot %s took %s", snapshotDetails.snapshotType, time.Since(machineStart))
return nil
}

func (c *FirecrackerContainer) cleanupOldSnapshots(snapshotDetails *snapshotDetails) error {
func (c *FirecrackerContainer) cleanupOldSnapshots(ctx context.Context, snapshotDetails *snapshotDetails) error {
_, span := tracing.StartSpan(ctx)
defer span.End()
memSnapshotPath := filepath.Join(c.getChroot(), snapshotDetails.memSnapshotName)
vmStateSnapshotPath := filepath.Join(c.getChroot(), snapshotDetails.vmStateSnapshotName)

Expand Down
1 change: 1 addition & 0 deletions enterprise/server/remote_execution/copy_on_write/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ go_library(
"//server/util/log",
"//server/util/lru",
"//server/util/status",
"//server/util/tracing",
"@com_github_prometheus_client_golang//prometheus",
"@org_golang_x_exp//maps",
"@org_golang_x_sync//errgroup",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
"github.com/buildbuddy-io/buildbuddy/server/util/log"
"github.com/buildbuddy-io/buildbuddy/server/util/lru"
"github.com/buildbuddy-io/buildbuddy/server/util/status"
"github.com/buildbuddy-io/buildbuddy/server/util/tracing"
"github.com/prometheus/client_golang/prometheus"
"golang.org/x/exp/maps"
"golang.org/x/sync/errgroup"
Expand Down Expand Up @@ -747,6 +748,8 @@ func (c *COWStore) EmitUsageMetrics(stage string) {
// do with any files written to dataDir. Typically the caller should provide an
// empty dataDir and remove the dir and contents if there is an error.
func ConvertFileToCOW(ctx context.Context, env environment.Env, filePath string, chunkSizeBytes int64, dataDir string, remoteInstanceName string, remoteEnabled bool) (store *COWStore, err error) {
ctx, span := tracing.StartSpan(ctx)
defer span.End()
var chunks []*Mmap
defer func() {
// If there's an error, clean up any Store instances we created.
Expand Down
1 change: 1 addition & 0 deletions enterprise/server/remote_execution/runner/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ go_library(
"//server/util/proto",
"//server/util/random",
"//server/util/status",
"//server/util/tracing",
"@com_github_prometheus_client_golang//prometheus",
"@org_golang_google_protobuf//types/known/durationpb",
"@org_golang_x_sync//errgroup",
Expand Down
4 changes: 4 additions & 0 deletions enterprise/server/remote_execution/runner/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ import (
"github.com/buildbuddy-io/buildbuddy/server/util/proto"
"github.com/buildbuddy-io/buildbuddy/server/util/random"
"github.com/buildbuddy-io/buildbuddy/server/util/status"
"github.com/buildbuddy-io/buildbuddy/server/util/tracing"
"github.com/prometheus/client_golang/prometheus"
"golang.org/x/sync/errgroup"
"google.golang.org/protobuf/types/known/durationpb"
Expand Down Expand Up @@ -1326,6 +1327,8 @@ func (p *pool) finalize(ctx context.Context, r *taskRunner) {
// TryRecycle either adds r back to the pool if appropriate, or removes it,
// freeing up any resources it holds.
func (p *pool) TryRecycle(ctx context.Context, r interfaces.Runner, finishedCleanly bool) {
ctx, span := tracing.StartSpan(ctx)
defer span.End()
ctx, cancel := background.ExtendContextForFinalization(ctx, runnerRecycleTimeout)
defer cancel()

Expand Down Expand Up @@ -1362,6 +1365,7 @@ func (p *pool) TryRecycle(ctx context.Context, r interfaces.Runner, finishedClea
(*snaputil.EnableRemoteSnapshotSharing || *snaputil.EnableLocalSnapshotSharing)
if snapshotEnabledRunner {
if err := cr.Container.Pause(ctx); err != nil {
// TODO(vanja) maybe recycled should be set to true here?
log.CtxErrorf(ctx, "Failed to save snapshot for runner %s: %s", cr, err)
return
}
Expand Down
16 changes: 16 additions & 0 deletions enterprise/server/remote_execution/snaploader/snaploader.go
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,8 @@ func (l *FileCacheLoader) GetSnapshot(ctx context.Context, keys *fcpb.SnapshotKe
}

func (l *FileCacheLoader) getSnapshot(ctx context.Context, key *fcpb.SnapshotKey, remoteEnabled bool) (*fcpb.SnapshotManifest, error) {
ctx, span := tracing.StartSpan(ctx)
defer span.End()
if *snaputil.EnableRemoteSnapshotSharing && remoteEnabled {
manifest, err := l.fetchRemoteManifest(ctx, key)
if err != nil {
Expand Down Expand Up @@ -445,6 +447,8 @@ func (l *FileCacheLoader) GetLocalManifestACResult(ctx context.Context, manifest
}

func (l *FileCacheLoader) getLocalManifest(ctx context.Context, key *fcpb.SnapshotKey) (*fcpb.SnapshotManifest, error) {
ctx, span := tracing.StartSpan(ctx)
defer span.End()
gid, err := groupID(ctx, l.env)
if err != nil {
return nil, err
Expand Down Expand Up @@ -475,6 +479,8 @@ func (l *FileCacheLoader) getLocalManifest(ctx context.Context, key *fcpb.Snapsh
}

func (l *FileCacheLoader) actionResultToManifest(ctx context.Context, remoteInstanceName string, snapshotActionResult *repb.ActionResult, tmpDir string, remoteEnabled bool) (*fcpb.SnapshotManifest, error) {
ctx, span := tracing.StartSpan(ctx)
defer span.End()
snapMetadata := snapshotActionResult.GetExecutionMetadata().GetAuxiliaryMetadata()
if len(snapMetadata) < 1 {
return nil, status.InternalErrorf("expected vm config in snapshot auxiliary metadata")
Expand Down Expand Up @@ -597,6 +603,8 @@ func (l *FileCacheLoader) UnpackSnapshot(ctx context.Context, snapshot *Snapshot
}

func (l *FileCacheLoader) CacheSnapshot(ctx context.Context, key *fcpb.SnapshotKey, opts *CacheSnapshotOptions) error {
ctx, span := tracing.StartSpan(ctx)
defer span.End()
vmConfig, err := anypb.New(opts.VMConfiguration)
if err != nil {
return err
Expand Down Expand Up @@ -685,6 +693,8 @@ func (l *FileCacheLoader) CacheSnapshot(ctx context.Context, key *fcpb.SnapshotK
}

func (l *FileCacheLoader) cacheActionResult(ctx context.Context, key *fcpb.SnapshotKey, ar *repb.ActionResult, opts *CacheSnapshotOptions) error {
ctx, span := tracing.StartSpan(ctx)
defer span.End()
b, err := proto.Marshal(ar)
if err != nil {
return err
Expand Down Expand Up @@ -765,6 +775,8 @@ func (l *FileCacheLoader) cacheActionResult(ctx context.Context, key *fcpb.Snaps
}

func (l *FileCacheLoader) checkAllArtifactsExist(ctx context.Context, manifest *fcpb.SnapshotManifest) error {
ctx, span := tracing.StartSpan(ctx)
defer span.End()
for _, f := range manifest.GetFiles() {
if !l.env.GetFileCache().ContainsFile(ctx, f) {
return status.NotFoundErrorf("file %q not found (digest %q)", f.GetName(), digest.String(f.GetDigest()))
Expand Down Expand Up @@ -818,6 +830,8 @@ func (l *FileCacheLoader) unpackCOW(ctx context.Context, file *fcpb.ChunkedFile,
// cacheCOW represents a COWStore as an action result tree and saves the store
// to the cache. Returns the digest of the tree
func (l *FileCacheLoader) cacheCOW(ctx context.Context, name string, remoteInstanceName string, cow *copy_on_write.COWStore, cacheOpts *CacheSnapshotOptions) (*repb.Digest, error) {
ctx, span := tracing.StartSpan(ctx)
defer span.End()
var dirtyBytes, dirtyChunkCount int64
start := time.Now()
defer func() {
Expand Down Expand Up @@ -957,6 +971,8 @@ func NewSnapshotService(env environment.Env) *SnapshotService {

// InvalidateSnapshot returns the new valid version ID for snapshots to be based off.
func (l *SnapshotService) InvalidateSnapshot(ctx context.Context, key *fcpb.SnapshotKey) (string, error) {
ctx, span := tracing.StartSpan(ctx)
defer span.End()
// Update the snapshot version to a random value. This will invalidate all past
// snapshots that have a different version.
newVersion, err := random.RandomString(10)
Expand Down
1 change: 1 addition & 0 deletions enterprise/server/util/oci/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ go_library(
"//server/util/flag",
"//server/util/log",
"//server/util/status",
"//server/util/tracing",
"@com_github_docker_distribution//reference",
"@com_github_google_go_containerregistry//pkg/authn",
"@com_github_google_go_containerregistry//pkg/name",
Expand Down
3 changes: 3 additions & 0 deletions enterprise/server/util/oci/oci.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"github.com/buildbuddy-io/buildbuddy/server/util/flag"
"github.com/buildbuddy-io/buildbuddy/server/util/log"
"github.com/buildbuddy-io/buildbuddy/server/util/status"
"github.com/buildbuddy-io/buildbuddy/server/util/tracing"
"github.com/docker/distribution/reference"
"github.com/google/go-containerregistry/pkg/authn"
v1 "github.com/google/go-containerregistry/pkg/v1"
Expand Down Expand Up @@ -151,6 +152,8 @@ func (c Credentials) Equals(o Credentials) bool {
}

func Resolve(ctx context.Context, imageName string, platform *rgpb.Platform, credentials Credentials) (v1.Image, error) {
ctx, span := tracing.StartSpan(ctx)
defer span.End()
imageRef, err := ctrname.ParseReference(imageName)
if err != nil {
return nil, status.InvalidArgumentErrorf("invalid image %q", imageName)
Expand Down
1 change: 1 addition & 0 deletions enterprise/server/util/ociconv/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ go_library(
"//server/util/hash",
"//server/util/log",
"//server/util/status",
"//server/util/tracing",
"//third_party/singleflight",
"@com_github_google_go_containerregistry//pkg/v1/mutate",
],
Expand Down
45 changes: 29 additions & 16 deletions enterprise/server/util/ociconv/ociconv.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"github.com/buildbuddy-io/buildbuddy/server/util/hash"
"github.com/buildbuddy-io/buildbuddy/server/util/log"
"github.com/buildbuddy-io/buildbuddy/server/util/status"
"github.com/buildbuddy-io/buildbuddy/server/util/tracing"
"github.com/buildbuddy-io/buildbuddy/third_party/singleflight"
"github.com/google/go-containerregistry/pkg/v1/mutate"
)
Expand Down Expand Up @@ -105,30 +106,18 @@ func CachedDiskImagePath(ctx context.Context, cacheRoot, containerImage string)
// registry to ensure that the image can be accessed. The path to the disk image
// is returned.
func CreateDiskImage(ctx context.Context, cacheRoot, containerImage string, creds oci.Credentials) (string, error) {
ctx, span := tracing.StartSpan(ctx)
defer span.End()
existingPath, err := CachedDiskImagePath(ctx, cacheRoot, containerImage)
if err != nil {
return "", err
}
if existingPath != "" {
// Image is cached. Authenticate with the remote registry to be sure
// the credentials are valid.

inspectArgs := []string{"inspect", "--raw", fmt.Sprintf("docker://%s", containerImage)}
if !creds.IsEmpty() {
inspectArgs = append(inspectArgs, "--creds", creds.String())
}
cmd := exec.CommandContext(ctx, "skopeo", inspectArgs...)
b, err := cmd.CombinedOutput()
if err != nil {
// We don't know whether an authentication error occurred unless we do
// brittle parsing of the command output. So for now just return
// UnavailableError which is the "least common denominator" of errors.
return "", status.UnavailableErrorf(
"Failed to authenticate with container registry for image %q: %s: %s",
containerImage, err, string(b),
)
if err := authenticateWithRegistry(ctx, containerImage, creds); err != nil {
return "", err
}

return existingPath, nil
}

Expand All @@ -152,7 +141,31 @@ func CreateDiskImage(ctx context.Context, cacheRoot, containerImage string, cred
return imageDir, err
}

func authenticateWithRegistry(ctx context.Context, containerImage string, creds oci.Credentials) error {
ctx, span := tracing.StartSpan(ctx)
defer span.End()
inspectArgs := []string{"inspect", "--raw", fmt.Sprintf("docker://%s", containerImage)}
if !creds.IsEmpty() {
inspectArgs = append(inspectArgs, "--creds", creds.String())
}
cmd := exec.CommandContext(ctx, "skopeo", inspectArgs...)
b, err := cmd.CombinedOutput()
if err != nil {
// We don't know whether an authentication error occurred unless we do
// brittle parsing of the command output. So for now just return
// UnavailableError which is the "least common denominator" of errors.
return status.UnavailableErrorf(
"Failed to authenticate with container registry for image %q: %s: %s",
containerImage, err, string(b),
)
}

return nil
}

func createExt4Image(ctx context.Context, cacheRoot, containerImage string, creds oci.Credentials) (string, error) {
ctx, span := tracing.StartSpan(ctx)
defer span.End()
diskImagesPath := getDiskImagesPath(cacheRoot, containerImage)
// container not found -- write one!
tmpImagePath, err := convertContainerToExt4FS(ctx, cacheRoot, containerImage, creds)
Expand Down
3 changes: 3 additions & 0 deletions enterprise/server/util/vsock/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ go_library(
"//server/util/status",
"//server/util/tracing",
"@com_github_mdlayher_vsock//:vsock",
"@io_opentelemetry_go_contrib_instrumentation_google_golang_org_grpc_otelgrpc//:otelgrpc",
"@io_opentelemetry_go_otel_metric//noop",
"@io_opentelemetry_go_otel_trace//:trace",
"@org_golang_google_grpc//:grpc",
"@org_golang_google_grpc//backoff",
"@org_golang_x_sys//unix",
Expand Down
Loading

0 comments on commit 74c58d1

Please sign in to comment.