From ea3f57de3eeb12ac00d65a2995086629e10dedbb Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Tue, 6 Aug 2024 15:56:15 +0530 Subject: [PATCH 01/45] chore: add initial structure and signature methods --- pkg/stanza/fileconsumer/config.go | 10 ++++++ pkg/stanza/fileconsumer/file.go | 14 +++++--- .../fileconsumer/internal/archive/archive.go | 36 +++++++++++++++++++ .../internal/archive/archive_nop.go | 20 +++++++++++ .../fileconsumer/internal/fileset/fileset.go | 5 +++ 5 files changed, 81 insertions(+), 4 deletions(-) create mode 100644 pkg/stanza/fileconsumer/internal/archive/archive.go create mode 100644 pkg/stanza/fileconsumer/internal/archive/archive_nop.go diff --git a/pkg/stanza/fileconsumer/config.go b/pkg/stanza/fileconsumer/config.go index 03c481cacc1f..e891b9a372e4 100644 --- a/pkg/stanza/fileconsumer/config.go +++ b/pkg/stanza/fileconsumer/config.go @@ -20,6 +20,7 @@ import ( "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/decode" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/attrs" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/emit" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/archive" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/fingerprint" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/header" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/metadata" @@ -87,6 +88,7 @@ type Config struct { DeleteAfterRead bool `mapstructure:"delete_after_read,omitempty"` IncludeFileRecordNumber bool `mapstructure:"include_file_record_number,omitempty"` Compression string `mapstructure:"compression,omitempty"` + PollsToArchive int `mapstructure:"-"` } type HeaderConfig struct { @@ -179,6 +181,13 @@ func (c Config) Build(set component.TelemetrySettings, emit emit.Callback, opts t = tracker.NewFileTracker(set, c.MaxConcurrentFiles/2) } + var a archive.Archive + if c.PollsToArchive <= 0 { + a = archive.NewNopArchive() + } else { + a = archive.NewArchive(c.PollsToArchive) + } + telemetryBuilder, err := metadata.NewTelemetryBuilder(set) if err != nil { return nil, err @@ -192,6 +201,7 @@ func (c Config) Build(set component.TelemetrySettings, emit emit.Callback, opts maxBatches: c.MaxBatches, tracker: t, telemetryBuilder: telemetryBuilder, + archive: a, }, nil } diff --git a/pkg/stanza/fileconsumer/file.go b/pkg/stanza/fileconsumer/file.go index d46507ecf3eb..f4398b82c3cc 100644 --- a/pkg/stanza/fileconsumer/file.go +++ b/pkg/stanza/fileconsumer/file.go @@ -13,6 +13,7 @@ import ( "go.opentelemetry.io/collector/component" "go.uber.org/zap" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/archive" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/checkpoint" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/fingerprint" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/metadata" @@ -30,11 +31,13 @@ type Manager struct { readerFactory reader.Factory fileMatcher *matcher.Matcher tracker tracker.Tracker + archive archive.Archive - pollInterval time.Duration - persister operator.Persister - maxBatches int - maxBatchFiles int + pollInterval time.Duration + persister operator.Persister + maxBatches int + maxBatchFiles int + pollsToArchive int telemetryBuilder *metadata.TelemetryBuilder } @@ -58,6 +61,9 @@ func (m *Manager) Start(persister operator.Persister) error { m.readerFactory.FromBeginning = true m.tracker.LoadMetadata(offsets) } + m.archive.SetStorageClient(persister) + } else if m.pollsToArchive > 0 { + return fmt.Errorf("archiving is not supported in memory, please use a storage extension") } // Start polling goroutine diff --git a/pkg/stanza/fileconsumer/internal/archive/archive.go b/pkg/stanza/fileconsumer/internal/archive/archive.go new file mode 100644 index 000000000000..697ae7e2b30d --- /dev/null +++ b/pkg/stanza/fileconsumer/internal/archive/archive.go @@ -0,0 +1,36 @@ +package archive + +import ( + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/fileset" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/fingerprint" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/reader" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/operator" +) + +const knownFilesKeyPrefix = "knownFiles" + +type Archive interface { + SetStorageClient(persister operator.Persister) + Match(fp *fingerprint.Fingerprint) *reader.Metadata +} + +type archive struct { + persister operator.Persister + pollsToArchive int + fileset *fileset.Fileset[*reader.Metadata] +} + +func NewArchive(pollsToArchive int) *archive { + return &archive{pollsToArchive: pollsToArchive} +} + +func (a *archive) SetStorageClient(persister operator.Persister) { + a.persister = persister +} + +func (a *archive) Match(fp *fingerprint.Fingerprint) *reader.Metadata { + // TODO: + // Add logic to go through the storage and return a match. + // Also update the storage if match found. + return nil +} diff --git a/pkg/stanza/fileconsumer/internal/archive/archive_nop.go b/pkg/stanza/fileconsumer/internal/archive/archive_nop.go new file mode 100644 index 000000000000..82e1f43ef88c --- /dev/null +++ b/pkg/stanza/fileconsumer/internal/archive/archive_nop.go @@ -0,0 +1,20 @@ +package archive + +import ( + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/fingerprint" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/reader" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/operator" +) + +type nop_archive struct{} + +func NewNopArchive() *nop_archive { + return &nop_archive{} +} + +func (a *nop_archive) SetStorageClient(_ operator.Persister) { +} + +func (a *nop_archive) Match(_ *fingerprint.Fingerprint) *reader.Metadata { + return nil +} diff --git a/pkg/stanza/fileconsumer/internal/fileset/fileset.go b/pkg/stanza/fileconsumer/internal/fileset/fileset.go index 3d1cf50fb143..10c66711c9d3 100644 --- a/pkg/stanza/fileconsumer/internal/fileset/fileset.go +++ b/pkg/stanza/fileconsumer/internal/fileset/fileset.go @@ -55,6 +55,11 @@ func (set *Fileset[T]) Add(readers ...T) { set.readers = append(set.readers, readers...) } +func (set *Fileset[T]) Reset(readers ...T) { + // reset the underlying array. + set.readers = readers +} + func (set *Fileset[T]) Match(fp *fingerprint.Fingerprint, cmp func(a, b *fingerprint.Fingerprint) bool) T { var val T for idx, r := range set.readers { From 17ec1aa0fb261d96f09488a0e691fc31d72e6cd4 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Tue, 6 Aug 2024 16:17:07 +0530 Subject: [PATCH 02/45] fix: add license --- pkg/stanza/fileconsumer/internal/archive/archive.go | 3 +++ pkg/stanza/fileconsumer/internal/archive/archive_nop.go | 3 +++ 2 files changed, 6 insertions(+) diff --git a/pkg/stanza/fileconsumer/internal/archive/archive.go b/pkg/stanza/fileconsumer/internal/archive/archive.go index 697ae7e2b30d..afcc8be130ad 100644 --- a/pkg/stanza/fileconsumer/internal/archive/archive.go +++ b/pkg/stanza/fileconsumer/internal/archive/archive.go @@ -1,3 +1,6 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + package archive import ( diff --git a/pkg/stanza/fileconsumer/internal/archive/archive_nop.go b/pkg/stanza/fileconsumer/internal/archive/archive_nop.go index 82e1f43ef88c..952f260ae037 100644 --- a/pkg/stanza/fileconsumer/internal/archive/archive_nop.go +++ b/pkg/stanza/fileconsumer/internal/archive/archive_nop.go @@ -1,3 +1,6 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + package archive import ( From 26deb01bdadaf29131c4582d0cec4f97e80dab9a Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Tue, 6 Aug 2024 16:30:13 +0530 Subject: [PATCH 03/45] fix: lint --- pkg/stanza/fileconsumer/internal/archive/archive.go | 2 +- pkg/stanza/fileconsumer/internal/archive/archive_nop.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/stanza/fileconsumer/internal/archive/archive.go b/pkg/stanza/fileconsumer/internal/archive/archive.go index afcc8be130ad..59360bc06c69 100644 --- a/pkg/stanza/fileconsumer/internal/archive/archive.go +++ b/pkg/stanza/fileconsumer/internal/archive/archive.go @@ -23,7 +23,7 @@ type archive struct { fileset *fileset.Fileset[*reader.Metadata] } -func NewArchive(pollsToArchive int) *archive { +func NewArchive(pollsToArchive int) Archive { return &archive{pollsToArchive: pollsToArchive} } diff --git a/pkg/stanza/fileconsumer/internal/archive/archive_nop.go b/pkg/stanza/fileconsumer/internal/archive/archive_nop.go index 952f260ae037..4c51061fb5aa 100644 --- a/pkg/stanza/fileconsumer/internal/archive/archive_nop.go +++ b/pkg/stanza/fileconsumer/internal/archive/archive_nop.go @@ -11,7 +11,7 @@ import ( type nop_archive struct{} -func NewNopArchive() *nop_archive { +func NewNopArchive() Archive { return &nop_archive{} } From fe1912cbc27903261a3c59ed1102ca74a1890b24 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Tue, 6 Aug 2024 16:49:54 +0530 Subject: [PATCH 04/45] fix: lint --- pkg/stanza/fileconsumer/internal/archive/archive.go | 6 +++--- pkg/stanza/fileconsumer/internal/archive/archive_nop.go | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pkg/stanza/fileconsumer/internal/archive/archive.go b/pkg/stanza/fileconsumer/internal/archive/archive.go index 59360bc06c69..2ba572a28c86 100644 --- a/pkg/stanza/fileconsumer/internal/archive/archive.go +++ b/pkg/stanza/fileconsumer/internal/archive/archive.go @@ -10,7 +10,7 @@ import ( "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/operator" ) -const knownFilesKeyPrefix = "knownFiles" +const _ = "knownFiles" type Archive interface { SetStorageClient(persister operator.Persister) @@ -20,7 +20,7 @@ type Archive interface { type archive struct { persister operator.Persister pollsToArchive int - fileset *fileset.Fileset[*reader.Metadata] + _ *fileset.Fileset[*reader.Metadata] } func NewArchive(pollsToArchive int) Archive { @@ -31,7 +31,7 @@ func (a *archive) SetStorageClient(persister operator.Persister) { a.persister = persister } -func (a *archive) Match(fp *fingerprint.Fingerprint) *reader.Metadata { +func (a *archive) Match(_ *fingerprint.Fingerprint) *reader.Metadata { // TODO: // Add logic to go through the storage and return a match. // Also update the storage if match found. diff --git a/pkg/stanza/fileconsumer/internal/archive/archive_nop.go b/pkg/stanza/fileconsumer/internal/archive/archive_nop.go index 4c51061fb5aa..c2269e9f0d58 100644 --- a/pkg/stanza/fileconsumer/internal/archive/archive_nop.go +++ b/pkg/stanza/fileconsumer/internal/archive/archive_nop.go @@ -9,15 +9,15 @@ import ( "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/operator" ) -type nop_archive struct{} +type nopArchive struct{} func NewNopArchive() Archive { - return &nop_archive{} + return &nopArchive{} } -func (a *nop_archive) SetStorageClient(_ operator.Persister) { +func (a *nopArchive) SetStorageClient(_ operator.Persister) { } -func (a *nop_archive) Match(_ *fingerprint.Fingerprint) *reader.Metadata { +func (a *nopArchive) Match(_ *fingerprint.Fingerprint) *reader.Metadata { return nil } From 33566b0397a1c852a16b4ccfceef88a0996b1406 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Tue, 6 Aug 2024 16:53:26 +0530 Subject: [PATCH 05/45] fix: check --- pkg/stanza/fileconsumer/internal/archive/archive.go | 2 +- pkg/stanza/fileconsumer/internal/archive/archive_nop.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/stanza/fileconsumer/internal/archive/archive.go b/pkg/stanza/fileconsumer/internal/archive/archive.go index 2ba572a28c86..eeb701f516dd 100644 --- a/pkg/stanza/fileconsumer/internal/archive/archive.go +++ b/pkg/stanza/fileconsumer/internal/archive/archive.go @@ -1,7 +1,7 @@ // Copyright The OpenTelemetry Authors // SPDX-License-Identifier: Apache-2.0 -package archive +package archive // import "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/archive" import ( "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/fileset" diff --git a/pkg/stanza/fileconsumer/internal/archive/archive_nop.go b/pkg/stanza/fileconsumer/internal/archive/archive_nop.go index c2269e9f0d58..52295b16890a 100644 --- a/pkg/stanza/fileconsumer/internal/archive/archive_nop.go +++ b/pkg/stanza/fileconsumer/internal/archive/archive_nop.go @@ -1,7 +1,7 @@ // Copyright The OpenTelemetry Authors // SPDX-License-Identifier: Apache-2.0 -package archive +package archive // import "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/archive" import ( "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/fingerprint" From 5acb848d1cfd47f0d70b885d8fffaafd2e5b3476 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Mon, 19 Aug 2024 22:28:27 +0530 Subject: [PATCH 06/45] chore: add write method, update interface --- .../fileconsumer/internal/archive/archive.go | 15 ++++++++++++--- .../fileconsumer/internal/archive/archive_nop.go | 6 +++++- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/pkg/stanza/fileconsumer/internal/archive/archive.go b/pkg/stanza/fileconsumer/internal/archive/archive.go index eeb701f516dd..866698a32356 100644 --- a/pkg/stanza/fileconsumer/internal/archive/archive.go +++ b/pkg/stanza/fileconsumer/internal/archive/archive.go @@ -13,13 +13,15 @@ import ( const _ = "knownFiles" type Archive interface { - SetStorageClient(persister operator.Persister) - Match(fp *fingerprint.Fingerprint) *reader.Metadata + SetStorageClient(operator.Persister) + Match(*fingerprint.Fingerprint) (*reader.Metadata, error) + Write([]*reader.Metadata) error } type archive struct { persister operator.Persister pollsToArchive int + index int _ *fileset.Fileset[*reader.Metadata] } @@ -31,9 +33,16 @@ func (a *archive) SetStorageClient(persister operator.Persister) { a.persister = persister } -func (a *archive) Match(_ *fingerprint.Fingerprint) *reader.Metadata { +func (a *archive) Match(_ *fingerprint.Fingerprint) (*reader.Metadata, error) { // TODO: // Add logic to go through the storage and return a match. // Also update the storage if match found. + return nil, nil +} + +func (a *archive) Write(_ []*reader.Metadata) error { + // TODO: + // Add logic to update the index. + // Handle rollover logic return nil } diff --git a/pkg/stanza/fileconsumer/internal/archive/archive_nop.go b/pkg/stanza/fileconsumer/internal/archive/archive_nop.go index 52295b16890a..e5e29e5748f2 100644 --- a/pkg/stanza/fileconsumer/internal/archive/archive_nop.go +++ b/pkg/stanza/fileconsumer/internal/archive/archive_nop.go @@ -18,6 +18,10 @@ func NewNopArchive() Archive { func (a *nopArchive) SetStorageClient(_ operator.Persister) { } -func (a *nopArchive) Match(_ *fingerprint.Fingerprint) *reader.Metadata { +func (a *nopArchive) Match(_ *fingerprint.Fingerprint) (*reader.Metadata, error) { + return nil, nil +} + +func (a *nopArchive) Write(_ []*reader.Metadata) error { return nil } From 431ab4163432fe03658ddd0f34e89598db4abad7 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Thu, 22 Aug 2024 22:49:04 +0530 Subject: [PATCH 07/45] chore: commit, archive module --- pkg/stanza/fileconsumer/config.go | 9 +-- pkg/stanza/fileconsumer/file.go | 64 ++++++++++++++----- .../fileconsumer/internal/archive/archive.go | 43 ++++++++++--- .../{archive_nop.go => archive_default.go} | 18 +++--- .../fileconsumer/internal/fileset/fileset.go | 5 -- .../fileconsumer/internal/tracker/tracker.go | 13 +++- 6 files changed, 105 insertions(+), 47 deletions(-) rename pkg/stanza/fileconsumer/internal/archive/{archive_nop.go => archive_default.go} (50%) diff --git a/pkg/stanza/fileconsumer/config.go b/pkg/stanza/fileconsumer/config.go index c69cf6e19a15..ba0552747372 100644 --- a/pkg/stanza/fileconsumer/config.go +++ b/pkg/stanza/fileconsumer/config.go @@ -88,7 +88,7 @@ type Config struct { DeleteAfterRead bool `mapstructure:"delete_after_read,omitempty"` IncludeFileRecordNumber bool `mapstructure:"include_file_record_number,omitempty"` Compression string `mapstructure:"compression,omitempty"` - PollsToArchive int `mapstructure:"-"` + PollsToArchive int `mapstructure:"-"` // TODO: activate this config once archiving is set up AcquireFSLock bool `mapstructure:"acquire_fs_lock,omitempty"` } @@ -183,12 +183,7 @@ func (c Config) Build(set component.TelemetrySettings, emit emit.Callback, opts t = tracker.NewFileTracker(set, c.MaxConcurrentFiles/2) } - var a archive.Archive - if c.PollsToArchive <= 0 { - a = archive.NewNopArchive() - } else { - a = archive.NewArchive(c.PollsToArchive) - } + a := archive.NewDefaultArchive() // TODO: once archiving is set up, update this. telemetryBuilder, err := metadata.NewTelemetryBuilder(set) if err != nil { diff --git a/pkg/stanza/fileconsumer/file.go b/pkg/stanza/fileconsumer/file.go index f4398b82c3cc..598d9d51934c 100644 --- a/pkg/stanza/fileconsumer/file.go +++ b/pkg/stanza/fileconsumer/file.go @@ -40,6 +40,8 @@ type Manager struct { pollsToArchive int telemetryBuilder *metadata.TelemetryBuilder + + unmatchedFiles []*archive.FileRecord } func (m *Manager) Start(persister operator.Persister) error { @@ -147,7 +149,7 @@ func (m *Manager) poll(ctx context.Context) { } } // rotate at end of every poll() - m.tracker.EndPoll() + m.tracker.EndPoll(m.archive) } func (m *Manager) consume(ctx context.Context, paths []string) { @@ -201,6 +203,7 @@ func (m *Manager) makeFingerprint(path string) (*fingerprint.Fingerprint, *os.Fi // discarding any that have a duplicate fingerprint to other files that have already // been read this polling interval func (m *Manager) makeReaders(ctx context.Context, paths []string) { + m.unmatchedFiles = make([]*archive.FileRecord, 0) for _, path := range paths { fp, file := m.makeFingerprint(path) if fp == nil { @@ -219,17 +222,54 @@ func (m *Manager) makeReaders(ctx context.Context, paths []string) { continue } - r, err := m.newReader(ctx, file, fp) + r, matchFound, err := m.newReader(ctx, file, fp) if err != nil { m.set.Logger.Error("Failed to create reader", zap.Error(err)) continue } + if matchFound { + m.tracker.Add(r) + } else { + m.unmatchedFiles = append(m.unmatchedFiles, archive.NewFileRecord(file, fp)) + } + } - m.tracker.Add(r) + records, err := m.archive.Match(m.unmatchedFiles) + if err != nil { + m.set.Logger.Error("Errors encountered while reading the archive", zap.Error(err)) } +INNER: + for _, record := range records { + // Exclude duplicate paths with the same content. This can happen when files are + // being rotated with copy/truncate strategy. (After copy, prior to truncate.) + if r := m.tracker.GetCurrentFile(record.Fingerprint); r != nil { + m.set.Logger.Debug("Skipping duplicate file", zap.String("path", r.GetFileName())) + // re-add the reader as Match() removes duplicates + m.tracker.Add(r) + record.File.Close() + continue INNER + } + var reader *reader.Reader + var err error + if record.Metadata != nil { + // match is found if record.Metadata exists + reader, err = m.readerFactory.NewReaderFromMetadata(record.File, record.Metadata) + } else { + // empty record.Metada i.e. a new file + reader, err = m.readerFactory.NewReader(record.File, record.Fingerprint) + } + if err != nil { + m.set.Logger.Error("Failed to create reader", zap.Error(err)) + continue INNER + } + m.telemetryBuilder.FileconsumerOpenFiles.Add(ctx, 1) + m.tracker.Add(reader) + m.set.Logger.Info("Started watching file", zap.String("path", reader.GetFileName())) + } + } -func (m *Manager) newReader(ctx context.Context, file *os.File, fp *fingerprint.Fingerprint) (*reader.Reader, error) { +func (m *Manager) newReader(ctx context.Context, file *os.File, fp *fingerprint.Fingerprint) (*reader.Reader, bool, error) { // Check previous poll cycle for match if oldReader := m.tracker.GetOpenFile(fp); oldReader != nil { if oldReader.GetFileName() != file.Name() { @@ -245,25 +285,19 @@ func (m *Manager) newReader(ctx context.Context, file *os.File, fp *fingerprint. zap.String("rotated_path", file.Name())) } } - return m.readerFactory.NewReaderFromMetadata(file, oldReader.Close()) + r, err := m.readerFactory.NewReaderFromMetadata(file, oldReader.Close()) + return r, true, err } // Check for closed files for match if oldMetadata := m.tracker.GetClosedFile(fp); oldMetadata != nil { r, err := m.readerFactory.NewReaderFromMetadata(file, oldMetadata) if err != nil { - return nil, err + return nil, false, err } m.telemetryBuilder.FileconsumerOpenFiles.Add(ctx, 1) - return r, nil + return r, true, nil } - // If we don't match any previously known files, create a new reader from scratch - m.set.Logger.Info("Started watching file", zap.String("path", file.Name())) - r, err := m.readerFactory.NewReader(file, fp) - if err != nil { - return nil, err - } - m.telemetryBuilder.FileconsumerOpenFiles.Add(ctx, 1) - return r, nil + return nil, false, nil } diff --git a/pkg/stanza/fileconsumer/internal/archive/archive.go b/pkg/stanza/fileconsumer/internal/archive/archive.go index 866698a32356..27f08d73460d 100644 --- a/pkg/stanza/fileconsumer/internal/archive/archive.go +++ b/pkg/stanza/fileconsumer/internal/archive/archive.go @@ -4,7 +4,8 @@ package archive // import "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/archive" import ( - "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/fileset" + "os" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/fingerprint" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/reader" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/operator" @@ -14,29 +15,53 @@ const _ = "knownFiles" type Archive interface { SetStorageClient(operator.Persister) - Match(*fingerprint.Fingerprint) (*reader.Metadata, error) + + Match([]*FileRecord) ([]*FileRecord, error) Write([]*reader.Metadata) error } +type FileRecord struct { + File *os.File + Fingerprint *fingerprint.Fingerprint + + // Metadata is populated if a match is found in storage + // For new files, Metadata would remain empty + Metadata *reader.Metadata +} + +func NewFileRecord(file *os.File, fp *fingerprint.Fingerprint) *FileRecord { + return &FileRecord{ + File: file, + Fingerprint: fp, + } +} + type archive struct { - persister operator.Persister - pollsToArchive int - index int - _ *fileset.Fileset[*reader.Metadata] + persister operator.Persister } -func NewArchive(pollsToArchive int) Archive { - return &archive{pollsToArchive: pollsToArchive} +func NewArchive() Archive { + return &archive{} } func (a *archive) SetStorageClient(persister operator.Persister) { a.persister = persister } -func (a *archive) Match(_ *fingerprint.Fingerprint) (*reader.Metadata, error) { +// The Match function processes unmatched files by performing the following steps +// 1. Access the storage key and retrieve the existing metadata. +// 2. Iterate through the unmatched files: +// a. If a corresponding record is found in storage, update the record's Metadata with the retrieved metadata +// b. If no matching record is found, skip to the next file. +// c. Update the storage key with updated metadata +// 3. Unmatched files will be updated with the following details: +// a. Files with a matching record will have their Metadata updated to the known metadata. +// b. New files will have an empty Metadata record. +func (a *archive) Match(_ []*FileRecord) ([]*FileRecord, error) { // TODO: // Add logic to go through the storage and return a match. // Also update the storage if match found. + return nil, nil } diff --git a/pkg/stanza/fileconsumer/internal/archive/archive_nop.go b/pkg/stanza/fileconsumer/internal/archive/archive_default.go similarity index 50% rename from pkg/stanza/fileconsumer/internal/archive/archive_nop.go rename to pkg/stanza/fileconsumer/internal/archive/archive_default.go index e5e29e5748f2..0b50a8db7881 100644 --- a/pkg/stanza/fileconsumer/internal/archive/archive_nop.go +++ b/pkg/stanza/fileconsumer/internal/archive/archive_default.go @@ -4,24 +4,26 @@ package archive // import "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/archive" import ( - "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/fingerprint" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/reader" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/operator" ) -type nopArchive struct{} +type defaultArchive struct { +} -func NewNopArchive() Archive { - return &nopArchive{} +func NewDefaultArchive() Archive { + return &defaultArchive{} } -func (a *nopArchive) SetStorageClient(_ operator.Persister) { +func (a *defaultArchive) SetStorageClient(_ operator.Persister) { } -func (a *nopArchive) Match(_ *fingerprint.Fingerprint) (*reader.Metadata, error) { - return nil, nil +func (a *defaultArchive) Match(unmatchedFiles []*FileRecord) ([]*FileRecord, error) { + // Default archiving returns the files as it is + return unmatchedFiles, nil } -func (a *nopArchive) Write(_ []*reader.Metadata) error { +func (a *defaultArchive) Write(_ []*reader.Metadata) error { + // discard the old offsets by default return nil } diff --git a/pkg/stanza/fileconsumer/internal/fileset/fileset.go b/pkg/stanza/fileconsumer/internal/fileset/fileset.go index 10c66711c9d3..3d1cf50fb143 100644 --- a/pkg/stanza/fileconsumer/internal/fileset/fileset.go +++ b/pkg/stanza/fileconsumer/internal/fileset/fileset.go @@ -55,11 +55,6 @@ func (set *Fileset[T]) Add(readers ...T) { set.readers = append(set.readers, readers...) } -func (set *Fileset[T]) Reset(readers ...T) { - // reset the underlying array. - set.readers = readers -} - func (set *Fileset[T]) Match(fp *fingerprint.Fingerprint, cmp func(a, b *fingerprint.Fingerprint) bool) T { var val T for idx, r := range set.readers { diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker.go b/pkg/stanza/fileconsumer/internal/tracker/tracker.go index 5039003a36ed..6a970387987f 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker.go @@ -7,6 +7,7 @@ import ( "go.opentelemetry.io/collector/component" "go.uber.org/zap" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/archive" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/fileset" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/fingerprint" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/reader" @@ -23,7 +24,7 @@ type Tracker interface { CurrentPollFiles() []*reader.Reader PreviousPollFiles() []*reader.Reader ClosePreviousFiles() int - EndPoll() + EndPoll(archive archive.Archive) EndConsume() int TotalReaders() int } @@ -110,11 +111,17 @@ func (t *fileTracker) ClosePreviousFiles() (filesClosed int) { return } -func (t *fileTracker) EndPoll() { +func (t *fileTracker) EndPoll(archive archive.Archive) { // shift the filesets at end of every poll() call // t.knownFiles[0] -> t.knownFiles[1] -> t.knownFiles[2] + oldFileset := t.knownFiles[2] copy(t.knownFiles[1:], t.knownFiles) t.knownFiles[0] = fileset.New[*reader.Metadata](t.maxBatchFiles) + + err := archive.Write(oldFileset.Get()) + if err != nil { + t.set.Logger.Error("Error faced while archiving", zap.Error(err)) + } } func (t *fileTracker) TotalReaders() int { @@ -176,6 +183,6 @@ func (t *noStateTracker) PreviousPollFiles() []*reader.Reader { return nil } func (t *noStateTracker) ClosePreviousFiles() int { return 0 } -func (t *noStateTracker) EndPoll() {} +func (t *noStateTracker) EndPoll(archive.Archive) {} func (t *noStateTracker) TotalReaders() int { return 0 } From 077ab35e0ab32ffb8daef9a3d7ba8231bdfdd315 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Fri, 13 Sep 2024 23:03:38 +0530 Subject: [PATCH 08/45] chore: cleanup, simplify the PR --- pkg/stanza/fileconsumer/config.go | 4 - pkg/stanza/fileconsumer/file.go | 68 ++++------------- .../fileconsumer/internal/archive/archive.go | 73 ------------------- .../internal/archive/archive_default.go | 29 -------- .../fileconsumer/internal/tracker/tracker.go | 35 ++++++--- 5 files changed, 41 insertions(+), 168 deletions(-) delete mode 100644 pkg/stanza/fileconsumer/internal/archive/archive.go delete mode 100644 pkg/stanza/fileconsumer/internal/archive/archive_default.go diff --git a/pkg/stanza/fileconsumer/config.go b/pkg/stanza/fileconsumer/config.go index ba0552747372..6fc1a74ee28f 100644 --- a/pkg/stanza/fileconsumer/config.go +++ b/pkg/stanza/fileconsumer/config.go @@ -20,7 +20,6 @@ import ( "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/decode" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/attrs" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/emit" - "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/archive" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/fingerprint" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/header" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/metadata" @@ -183,8 +182,6 @@ func (c Config) Build(set component.TelemetrySettings, emit emit.Callback, opts t = tracker.NewFileTracker(set, c.MaxConcurrentFiles/2) } - a := archive.NewDefaultArchive() // TODO: once archiving is set up, update this. - telemetryBuilder, err := metadata.NewTelemetryBuilder(set) if err != nil { return nil, err @@ -198,7 +195,6 @@ func (c Config) Build(set component.TelemetrySettings, emit emit.Callback, opts maxBatches: c.MaxBatches, tracker: t, telemetryBuilder: telemetryBuilder, - archive: a, }, nil } diff --git a/pkg/stanza/fileconsumer/file.go b/pkg/stanza/fileconsumer/file.go index 598d9d51934c..f849be667b7a 100644 --- a/pkg/stanza/fileconsumer/file.go +++ b/pkg/stanza/fileconsumer/file.go @@ -13,7 +13,6 @@ import ( "go.opentelemetry.io/collector/component" "go.uber.org/zap" - "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/archive" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/checkpoint" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/fingerprint" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/metadata" @@ -31,7 +30,6 @@ type Manager struct { readerFactory reader.Factory fileMatcher *matcher.Matcher tracker tracker.Tracker - archive archive.Archive pollInterval time.Duration persister operator.Persister @@ -40,8 +38,6 @@ type Manager struct { pollsToArchive int telemetryBuilder *metadata.TelemetryBuilder - - unmatchedFiles []*archive.FileRecord } func (m *Manager) Start(persister operator.Persister) error { @@ -63,7 +59,7 @@ func (m *Manager) Start(persister operator.Persister) error { m.readerFactory.FromBeginning = true m.tracker.LoadMetadata(offsets) } - m.archive.SetStorageClient(persister) + m.tracker.SetPersister(persister) } else if m.pollsToArchive > 0 { return fmt.Errorf("archiving is not supported in memory, please use a storage extension") } @@ -149,7 +145,7 @@ func (m *Manager) poll(ctx context.Context) { } } // rotate at end of every poll() - m.tracker.EndPoll(m.archive) + m.tracker.EndPoll() } func (m *Manager) consume(ctx context.Context, paths []string) { @@ -203,7 +199,6 @@ func (m *Manager) makeFingerprint(path string) (*fingerprint.Fingerprint, *os.Fi // discarding any that have a duplicate fingerprint to other files that have already // been read this polling interval func (m *Manager) makeReaders(ctx context.Context, paths []string) { - m.unmatchedFiles = make([]*archive.FileRecord, 0) for _, path := range paths { fp, file := m.makeFingerprint(path) if fp == nil { @@ -222,54 +217,17 @@ func (m *Manager) makeReaders(ctx context.Context, paths []string) { continue } - r, matchFound, err := m.newReader(ctx, file, fp) + r, err := m.newReader(ctx, file, fp) if err != nil { m.set.Logger.Error("Failed to create reader", zap.Error(err)) continue } - if matchFound { - m.tracker.Add(r) - } else { - m.unmatchedFiles = append(m.unmatchedFiles, archive.NewFileRecord(file, fp)) - } - } - records, err := m.archive.Match(m.unmatchedFiles) - if err != nil { - m.set.Logger.Error("Errors encountered while reading the archive", zap.Error(err)) + m.tracker.Add(r) } -INNER: - for _, record := range records { - // Exclude duplicate paths with the same content. This can happen when files are - // being rotated with copy/truncate strategy. (After copy, prior to truncate.) - if r := m.tracker.GetCurrentFile(record.Fingerprint); r != nil { - m.set.Logger.Debug("Skipping duplicate file", zap.String("path", r.GetFileName())) - // re-add the reader as Match() removes duplicates - m.tracker.Add(r) - record.File.Close() - continue INNER - } - var reader *reader.Reader - var err error - if record.Metadata != nil { - // match is found if record.Metadata exists - reader, err = m.readerFactory.NewReaderFromMetadata(record.File, record.Metadata) - } else { - // empty record.Metada i.e. a new file - reader, err = m.readerFactory.NewReader(record.File, record.Fingerprint) - } - if err != nil { - m.set.Logger.Error("Failed to create reader", zap.Error(err)) - continue INNER - } - m.telemetryBuilder.FileconsumerOpenFiles.Add(ctx, 1) - m.tracker.Add(reader) - m.set.Logger.Info("Started watching file", zap.String("path", reader.GetFileName())) - } - } -func (m *Manager) newReader(ctx context.Context, file *os.File, fp *fingerprint.Fingerprint) (*reader.Reader, bool, error) { +func (m *Manager) newReader(ctx context.Context, file *os.File, fp *fingerprint.Fingerprint) (*reader.Reader, error) { // Check previous poll cycle for match if oldReader := m.tracker.GetOpenFile(fp); oldReader != nil { if oldReader.GetFileName() != file.Name() { @@ -285,19 +243,25 @@ func (m *Manager) newReader(ctx context.Context, file *os.File, fp *fingerprint. zap.String("rotated_path", file.Name())) } } - r, err := m.readerFactory.NewReaderFromMetadata(file, oldReader.Close()) - return r, true, err + return m.readerFactory.NewReaderFromMetadata(file, oldReader.Close()) } // Check for closed files for match if oldMetadata := m.tracker.GetClosedFile(fp); oldMetadata != nil { r, err := m.readerFactory.NewReaderFromMetadata(file, oldMetadata) if err != nil { - return nil, false, err + return nil, err } m.telemetryBuilder.FileconsumerOpenFiles.Add(ctx, 1) - return r, true, nil + return r, nil } - return nil, false, nil + // If we don't match any previously known files, create a new reader from scratch + m.set.Logger.Info("Started watching file", zap.String("path", file.Name())) + r, err := m.readerFactory.NewReader(file, fp) + if err != nil { + return nil, err + } + m.telemetryBuilder.FileconsumerOpenFiles.Add(ctx, 1) + return r, nil } diff --git a/pkg/stanza/fileconsumer/internal/archive/archive.go b/pkg/stanza/fileconsumer/internal/archive/archive.go deleted file mode 100644 index 27f08d73460d..000000000000 --- a/pkg/stanza/fileconsumer/internal/archive/archive.go +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright The OpenTelemetry Authors -// SPDX-License-Identifier: Apache-2.0 - -package archive // import "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/archive" - -import ( - "os" - - "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/fingerprint" - "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/reader" - "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/operator" -) - -const _ = "knownFiles" - -type Archive interface { - SetStorageClient(operator.Persister) - - Match([]*FileRecord) ([]*FileRecord, error) - Write([]*reader.Metadata) error -} - -type FileRecord struct { - File *os.File - Fingerprint *fingerprint.Fingerprint - - // Metadata is populated if a match is found in storage - // For new files, Metadata would remain empty - Metadata *reader.Metadata -} - -func NewFileRecord(file *os.File, fp *fingerprint.Fingerprint) *FileRecord { - return &FileRecord{ - File: file, - Fingerprint: fp, - } -} - -type archive struct { - persister operator.Persister -} - -func NewArchive() Archive { - return &archive{} -} - -func (a *archive) SetStorageClient(persister operator.Persister) { - a.persister = persister -} - -// The Match function processes unmatched files by performing the following steps -// 1. Access the storage key and retrieve the existing metadata. -// 2. Iterate through the unmatched files: -// a. If a corresponding record is found in storage, update the record's Metadata with the retrieved metadata -// b. If no matching record is found, skip to the next file. -// c. Update the storage key with updated metadata -// 3. Unmatched files will be updated with the following details: -// a. Files with a matching record will have their Metadata updated to the known metadata. -// b. New files will have an empty Metadata record. -func (a *archive) Match(_ []*FileRecord) ([]*FileRecord, error) { - // TODO: - // Add logic to go through the storage and return a match. - // Also update the storage if match found. - - return nil, nil -} - -func (a *archive) Write(_ []*reader.Metadata) error { - // TODO: - // Add logic to update the index. - // Handle rollover logic - return nil -} diff --git a/pkg/stanza/fileconsumer/internal/archive/archive_default.go b/pkg/stanza/fileconsumer/internal/archive/archive_default.go deleted file mode 100644 index 0b50a8db7881..000000000000 --- a/pkg/stanza/fileconsumer/internal/archive/archive_default.go +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright The OpenTelemetry Authors -// SPDX-License-Identifier: Apache-2.0 - -package archive // import "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/archive" - -import ( - "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/reader" - "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/operator" -) - -type defaultArchive struct { -} - -func NewDefaultArchive() Archive { - return &defaultArchive{} -} - -func (a *defaultArchive) SetStorageClient(_ operator.Persister) { -} - -func (a *defaultArchive) Match(unmatchedFiles []*FileRecord) ([]*FileRecord, error) { - // Default archiving returns the files as it is - return unmatchedFiles, nil -} - -func (a *defaultArchive) Write(_ []*reader.Metadata) error { - // discard the old offsets by default - return nil -} diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker.go b/pkg/stanza/fileconsumer/internal/tracker/tracker.go index 6a970387987f..011fe97d893f 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker.go @@ -7,10 +7,10 @@ import ( "go.opentelemetry.io/collector/component" "go.uber.org/zap" - "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/archive" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/fileset" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/fingerprint" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/reader" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/operator" ) // Interface for tracking files that are being consumed. @@ -24,9 +24,10 @@ type Tracker interface { CurrentPollFiles() []*reader.Reader PreviousPollFiles() []*reader.Reader ClosePreviousFiles() int - EndPoll(archive archive.Archive) + EndPoll() EndConsume() int TotalReaders() int + SetPersister(persister operator.Persister) } // fileTracker tracks known offsets for files that are being consumed by the manager. @@ -38,6 +39,10 @@ type fileTracker struct { currentPollFiles *fileset.Fileset[*reader.Reader] previousPollFiles *fileset.Fileset[*reader.Reader] knownFiles []*fileset.Fileset[*reader.Metadata] + + // persister is to be used to store offsets older than 3 poll cycles. + // These offsets will be stored on disk + persister operator.Persister } func NewFileTracker(set component.TelemetrySettings, maxBatchFiles int) Tracker { @@ -111,17 +116,18 @@ func (t *fileTracker) ClosePreviousFiles() (filesClosed int) { return } -func (t *fileTracker) EndPoll(archive archive.Archive) { +func (t *fileTracker) SetPersister(persister operator.Persister) { + t.persister = persister +} + +func (t *fileTracker) EndPoll() { // shift the filesets at end of every poll() call // t.knownFiles[0] -> t.knownFiles[1] -> t.knownFiles[2] - oldFileset := t.knownFiles[2] + + // Instead of throwing it away, archive it. + t.archive(t.knownFiles[2]) copy(t.knownFiles[1:], t.knownFiles) t.knownFiles[0] = fileset.New[*reader.Metadata](t.maxBatchFiles) - - err := archive.Write(oldFileset.Get()) - if err != nil { - t.set.Logger.Error("Error faced while archiving", zap.Error(err)) - } } func (t *fileTracker) TotalReaders() int { @@ -132,6 +138,13 @@ func (t *fileTracker) TotalReaders() int { return total } +func (t *fileTracker) archive(*fileset.Fileset[*reader.Metadata]) { + // TODO; core logic to be implemented in followup PR + // We make use of a ring buffer, where each set of files is stored under a specific index. + // Instead of discarding knownFiles[2], write it to the next index and eventually roll over. + // Separate storage keys knownFilesArchive0, knownFilesArchive1, ..., knownFilesArchiveN, roll over back to knownFilesArchive0 +} + // noStateTracker only tracks the current polled files. Once the poll is // complete and telemetry is consumed, the tracked files are closed. The next // poll will create fresh readers with no previously tracked offsets. @@ -183,6 +196,8 @@ func (t *noStateTracker) PreviousPollFiles() []*reader.Reader { return nil } func (t *noStateTracker) ClosePreviousFiles() int { return 0 } -func (t *noStateTracker) EndPoll(archive.Archive) {} +func (t *noStateTracker) EndPoll() {} func (t *noStateTracker) TotalReaders() int { return 0 } + +func (t *noStateTracker) SetPersister(operator.Persister) {} From e3cdd5df8017d74dfd0a08e40b4e926f673f34e9 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Mon, 16 Sep 2024 23:46:35 +0530 Subject: [PATCH 09/45] fix: initial commit, second PR --- .../internal/checkpoint/checkpoint.go | 8 ++ .../fileconsumer/internal/tracker/tracker.go | 60 +++++++++++- testbed/datareceivers/stanza.go | 91 +++++++++++++++++++ 3 files changed, 156 insertions(+), 3 deletions(-) create mode 100644 testbed/datareceivers/stanza.go diff --git a/pkg/stanza/fileconsumer/internal/checkpoint/checkpoint.go b/pkg/stanza/fileconsumer/internal/checkpoint/checkpoint.go index b75933abcc28..5d638c1720aa 100644 --- a/pkg/stanza/fileconsumer/internal/checkpoint/checkpoint.go +++ b/pkg/stanza/fileconsumer/internal/checkpoint/checkpoint.go @@ -18,6 +18,10 @@ const knownFilesKey = "knownFiles" // Save syncs the most recent set of files to the database func Save(ctx context.Context, persister operator.Persister, rmds []*reader.Metadata) error { + return SaveKey(ctx, persister, rmds, knownFilesKey) +} + +func SaveKey(ctx context.Context, persister operator.Persister, rmds []*reader.Metadata, key string) error { var buf bytes.Buffer enc := json.NewEncoder(&buf) @@ -43,6 +47,10 @@ func Save(ctx context.Context, persister operator.Persister, rmds []*reader.Meta // Load loads the most recent set of files to the database func Load(ctx context.Context, persister operator.Persister) ([]*reader.Metadata, error) { + return LoadKey(ctx, persister, knownFilesKey) +} + +func LoadKey(ctx context.Context, persister operator.Persister, key string) ([]*reader.Metadata, error) { encoded, err := persister.Get(ctx, knownFilesKey) if err != nil { return nil, err diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker.go b/pkg/stanza/fileconsumer/internal/tracker/tracker.go index 011fe97d893f..5ff1d91848a6 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker.go @@ -4,9 +4,13 @@ package tracker // import "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/tracker" import ( + "context" + "fmt" + "go.opentelemetry.io/collector/component" "go.uber.org/zap" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/checkpoint" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/fileset" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/fingerprint" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/reader" @@ -43,9 +47,35 @@ type fileTracker struct { // persister is to be used to store offsets older than 3 poll cycles. // These offsets will be stored on disk persister operator.Persister + + pollsToArchive int + archiveIndex int +} + +type option struct { + maxBatchFiles int + pollsToArchive int +} + +type optionFunc func(*option) + +func WithMaxBatchFiles(maxBatchFiles int) optionFunc { + return func(fto *option) { + fto.maxBatchFiles = maxBatchFiles + } } -func NewFileTracker(set component.TelemetrySettings, maxBatchFiles int) Tracker { +func WithPollsToArchive(pollsToArchive int) optionFunc { + return func(fto *option) { + fto.pollsToArchive = pollsToArchive + } +} + +func NewFileTracker(set component.TelemetrySettings, maxBatchFiles int, opts ...optionFunc) Tracker { + option := &option{} + for _, opt := range opts { + opt(option) + } knownFiles := make([]*fileset.Fileset[*reader.Metadata], 3) for i := 0; i < len(knownFiles); i++ { knownFiles[i] = fileset.New[*reader.Metadata](maxBatchFiles) @@ -57,6 +87,8 @@ func NewFileTracker(set component.TelemetrySettings, maxBatchFiles int) Tracker currentPollFiles: fileset.New[*reader.Reader](maxBatchFiles), previousPollFiles: fileset.New[*reader.Reader](maxBatchFiles), knownFiles: knownFiles, + pollsToArchive: option.pollsToArchive, + archiveIndex: 0, } } @@ -138,11 +170,33 @@ func (t *fileTracker) TotalReaders() int { return total } -func (t *fileTracker) archive(*fileset.Fileset[*reader.Metadata]) { - // TODO; core logic to be implemented in followup PR +func (t *fileTracker) archive(metadata *fileset.Fileset[*reader.Metadata]) { // We make use of a ring buffer, where each set of files is stored under a specific index. // Instead of discarding knownFiles[2], write it to the next index and eventually roll over. // Separate storage keys knownFilesArchive0, knownFilesArchive1, ..., knownFilesArchiveN, roll over back to knownFilesArchive0 + + // Archiving: ┌─────────────────────on-disk archive─────────────────────────┐ + // | ┌───┐ ┌───┐ ┌──────────────────┐ | + // index | ▶ │ 0 │ ▶ │ 1 │ ▶ ... ▶ │ polls_to_archive │ | + // | ▲ └───┘ └───┘ └──────────────────┘ | + // | ▲ ▲ ▼ | + // | ▲ │ Roll over overriting older offsets, if any ◀ | + // └──────│──────────────────────────────────────────────────────┘ + // │ + // │ + // │ + // start + // index + + if t.pollsToArchive == 0 { + return + } + key := fmt.Sprintf("knownFiles%d", t.archiveIndex) + if err := checkpoint.SaveKey(context.Background(), t.persister, metadata.Get(), key); err != nil { + t.set.Logger.Error("error faced while saving to the archive", zap.Error(err)) + } + t.archiveIndex += 1 // increment the index + t.archiveIndex %= t.pollsToArchive // ring buffer } // noStateTracker only tracks the current polled files. Once the poll is diff --git a/testbed/datareceivers/stanza.go b/testbed/datareceivers/stanza.go new file mode 100644 index 000000000000..31f403ad01a5 --- /dev/null +++ b/testbed/datareceivers/stanza.go @@ -0,0 +1,91 @@ +package datareceivers + +import ( + "context" + "fmt" + "os" + + "github.com/open-telemetry/opentelemetry-collector-contrib/receiver/filelogreceiver" + "github.com/open-telemetry/opentelemetry-collector-contrib/testbed/testbed" + "go.opentelemetry.io/collector/component/componenttest" + "go.opentelemetry.io/collector/consumer" + "go.opentelemetry.io/collector/receiver" + "go.opentelemetry.io/collector/receiver/receivertest" + "go.uber.org/zap" +) + +type StanzaDataReceiver struct { + testbed.DataReceiverBase + logReceiver receiver.Logs + path string + retry string + sendingQueue string +} + +var _ testbed.DataReceiver = (*StanzaDataReceiver)(nil) + +// NewFileLogWriter creates a new data sender that will write log entries to a +// file, to be tailed by FluentBit and sent to the collector. +func NewFileLogReceiver() *StanzaDataReceiver { + file, err := os.CreateTemp("", "perf-logs.log") + if err != nil { + panic("failed to create temp file") + } + + f := &StanzaDataReceiver{ + path: file.Name(), + } + + return f +} +func NewLogger() (*zap.Logger, error) { + cfg := zap.NewProductionConfig() + cfg.OutputPaths = []string{ + "/Users/vihasmakwana/Desktop/Vihas/OTeL/opentelemetry-collector-contrib/testbed/datareceivers/hello.log", + } + return cfg.Build() +} +func (s *StanzaDataReceiver) Start(tc consumer.Traces, mc consumer.Metrics, lc consumer.Logs) error { + factory := filelogreceiver.NewFactory() + cfg := factory.CreateDefaultConfig().(*filelogreceiver.FileLogConfig) + cfg.InputConfig.Include = []string{s.path} + cfg.InputConfig.StartAt = "beginning" + // cfg.RetryOnFailure = consumerretry.NewDefaultConfig() + // cfg.RetryOnFailure.Enabled = true + var err error + set := receivertest.NewNopSettings() + logger, _ := NewLogger() + set.TelemetrySettings.Logger = logger + if s.logReceiver, err = factory.CreateLogsReceiver(context.Background(), set, cfg, lc); err != nil { + return err + } + return s.logReceiver.Start(context.Background(), componenttest.NewNopHost()) +} + +func (*StanzaDataReceiver) Stop() error { + return nil +} + +func (s *StanzaDataReceiver) GenConfigYAMLStr() string { + config := fmt.Sprintf(` + file: + path: %s + %s + %s +`, s.path, s.retry, s.sendingQueue) + return config +} + +func (*StanzaDataReceiver) ProtocolName() string { + return "file" +} + +func (bor *StanzaDataReceiver) WithRetry(retry string) *StanzaDataReceiver { + bor.retry = retry + return bor +} + +func (bor *StanzaDataReceiver) WithQueue(sendingQueue string) *StanzaDataReceiver { + bor.sendingQueue = sendingQueue + return bor +} From d88d5ae14f1767de320cc6bf0d3d0b42a13fa44b Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Tue, 17 Sep 2024 12:31:17 +0530 Subject: [PATCH 10/45] chore: remove stanza.go --- testbed/datareceivers/stanza.go | 91 --------------------------------- 1 file changed, 91 deletions(-) delete mode 100644 testbed/datareceivers/stanza.go diff --git a/testbed/datareceivers/stanza.go b/testbed/datareceivers/stanza.go deleted file mode 100644 index 31f403ad01a5..000000000000 --- a/testbed/datareceivers/stanza.go +++ /dev/null @@ -1,91 +0,0 @@ -package datareceivers - -import ( - "context" - "fmt" - "os" - - "github.com/open-telemetry/opentelemetry-collector-contrib/receiver/filelogreceiver" - "github.com/open-telemetry/opentelemetry-collector-contrib/testbed/testbed" - "go.opentelemetry.io/collector/component/componenttest" - "go.opentelemetry.io/collector/consumer" - "go.opentelemetry.io/collector/receiver" - "go.opentelemetry.io/collector/receiver/receivertest" - "go.uber.org/zap" -) - -type StanzaDataReceiver struct { - testbed.DataReceiverBase - logReceiver receiver.Logs - path string - retry string - sendingQueue string -} - -var _ testbed.DataReceiver = (*StanzaDataReceiver)(nil) - -// NewFileLogWriter creates a new data sender that will write log entries to a -// file, to be tailed by FluentBit and sent to the collector. -func NewFileLogReceiver() *StanzaDataReceiver { - file, err := os.CreateTemp("", "perf-logs.log") - if err != nil { - panic("failed to create temp file") - } - - f := &StanzaDataReceiver{ - path: file.Name(), - } - - return f -} -func NewLogger() (*zap.Logger, error) { - cfg := zap.NewProductionConfig() - cfg.OutputPaths = []string{ - "/Users/vihasmakwana/Desktop/Vihas/OTeL/opentelemetry-collector-contrib/testbed/datareceivers/hello.log", - } - return cfg.Build() -} -func (s *StanzaDataReceiver) Start(tc consumer.Traces, mc consumer.Metrics, lc consumer.Logs) error { - factory := filelogreceiver.NewFactory() - cfg := factory.CreateDefaultConfig().(*filelogreceiver.FileLogConfig) - cfg.InputConfig.Include = []string{s.path} - cfg.InputConfig.StartAt = "beginning" - // cfg.RetryOnFailure = consumerretry.NewDefaultConfig() - // cfg.RetryOnFailure.Enabled = true - var err error - set := receivertest.NewNopSettings() - logger, _ := NewLogger() - set.TelemetrySettings.Logger = logger - if s.logReceiver, err = factory.CreateLogsReceiver(context.Background(), set, cfg, lc); err != nil { - return err - } - return s.logReceiver.Start(context.Background(), componenttest.NewNopHost()) -} - -func (*StanzaDataReceiver) Stop() error { - return nil -} - -func (s *StanzaDataReceiver) GenConfigYAMLStr() string { - config := fmt.Sprintf(` - file: - path: %s - %s - %s -`, s.path, s.retry, s.sendingQueue) - return config -} - -func (*StanzaDataReceiver) ProtocolName() string { - return "file" -} - -func (bor *StanzaDataReceiver) WithRetry(retry string) *StanzaDataReceiver { - bor.retry = retry - return bor -} - -func (bor *StanzaDataReceiver) WithQueue(sendingQueue string) *StanzaDataReceiver { - bor.sendingQueue = sendingQueue - return bor -} From d9e5992f71ff5f1e4fae0053e49fdfbd33656a37 Mon Sep 17 00:00:00 2001 From: VihasMakwana <121151420+VihasMakwana@users.noreply.github.com> Date: Tue, 17 Sep 2024 13:29:25 +0530 Subject: [PATCH 11/45] Format the comment --- .../fileconsumer/internal/tracker/tracker.go | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker.go b/pkg/stanza/fileconsumer/internal/tracker/tracker.go index 5ff1d91848a6..e3b5885f788a 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker.go @@ -176,17 +176,17 @@ func (t *fileTracker) archive(metadata *fileset.Fileset[*reader.Metadata]) { // Separate storage keys knownFilesArchive0, knownFilesArchive1, ..., knownFilesArchiveN, roll over back to knownFilesArchive0 // Archiving: ┌─────────────────────on-disk archive─────────────────────────┐ - // | ┌───┐ ┌───┐ ┌──────────────────┐ | - // index | ▶ │ 0 │ ▶ │ 1 │ ▶ ... ▶ │ polls_to_archive │ | - // | ▲ └───┘ └───┘ └──────────────────┘ | - // | ▲ ▲ ▼ | - // | ▲ │ Roll over overriting older offsets, if any ◀ | - // └──────│──────────────────────────────────────────────────────┘ - // │ - // │ - // │ - // start - // index + // | ┌───┐ ┌───┐ ┌──────────────────┐ | + // index | ▶ │ 0 │ ▶ │ 1 │ ▶ ... ▶ │ polls_to_archive │ | + // | ▲ └───┘ └───┘ └──────────────────┘ | + // | ▲ ▲ ▼ | + // | ▲ │ Roll over overriting older offsets, if any ◀ | + // └──────│──────────────────────────────────────────────────────┘ + // │ + // │ + // │ + // start + // index if t.pollsToArchive == 0 { return From c68817a6ae86b7bbc08369da0790abb021435e79 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Tue, 17 Sep 2024 13:35:48 +0530 Subject: [PATCH 12/45] chore: use options --- pkg/stanza/fileconsumer/config.go | 4 ++-- .../fileconsumer/internal/tracker/tracker.go | 20 +++++++++++-------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/pkg/stanza/fileconsumer/config.go b/pkg/stanza/fileconsumer/config.go index 6fc1a74ee28f..d60def60bbdc 100644 --- a/pkg/stanza/fileconsumer/config.go +++ b/pkg/stanza/fileconsumer/config.go @@ -177,9 +177,9 @@ func (c Config) Build(set component.TelemetrySettings, emit emit.Callback, opts var t tracker.Tracker if o.noTracking { - t = tracker.NewNoStateTracker(set, c.MaxConcurrentFiles/2) + t = tracker.NewNoStateTracker(set, tracker.WithMaxBatchFiles(c.MaxConcurrentFiles/2)) } else { - t = tracker.NewFileTracker(set, c.MaxConcurrentFiles/2) + t = tracker.NewFileTracker(set, tracker.WithMaxBatchFiles(c.MaxConcurrentFiles/2)) } telemetryBuilder, err := metadata.NewTelemetryBuilder(set) diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker.go b/pkg/stanza/fileconsumer/internal/tracker/tracker.go index e3b5885f788a..689c95d6751d 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker.go @@ -71,21 +71,21 @@ func WithPollsToArchive(pollsToArchive int) optionFunc { } } -func NewFileTracker(set component.TelemetrySettings, maxBatchFiles int, opts ...optionFunc) Tracker { +func NewFileTracker(set component.TelemetrySettings, opts ...optionFunc) Tracker { option := &option{} for _, opt := range opts { opt(option) } knownFiles := make([]*fileset.Fileset[*reader.Metadata], 3) for i := 0; i < len(knownFiles); i++ { - knownFiles[i] = fileset.New[*reader.Metadata](maxBatchFiles) + knownFiles[i] = fileset.New[*reader.Metadata](option.maxBatchFiles) } set.Logger = set.Logger.With(zap.String("tracker", "fileTracker")) return &fileTracker{ set: set, - maxBatchFiles: maxBatchFiles, - currentPollFiles: fileset.New[*reader.Reader](maxBatchFiles), - previousPollFiles: fileset.New[*reader.Reader](maxBatchFiles), + maxBatchFiles: option.maxBatchFiles, + currentPollFiles: fileset.New[*reader.Reader](option.maxBatchFiles), + previousPollFiles: fileset.New[*reader.Reader](option.maxBatchFiles), knownFiles: knownFiles, pollsToArchive: option.pollsToArchive, archiveIndex: 0, @@ -208,12 +208,16 @@ type noStateTracker struct { currentPollFiles *fileset.Fileset[*reader.Reader] } -func NewNoStateTracker(set component.TelemetrySettings, maxBatchFiles int) Tracker { +func NewNoStateTracker(set component.TelemetrySettings, opts ...optionFunc) Tracker { + option := &option{} + for _, opt := range opts { + opt(option) + } set.Logger = set.Logger.With(zap.String("tracker", "noStateTracker")) return &noStateTracker{ set: set, - maxBatchFiles: maxBatchFiles, - currentPollFiles: fileset.New[*reader.Reader](maxBatchFiles), + maxBatchFiles: option.maxBatchFiles, + currentPollFiles: fileset.New[*reader.Reader](option.maxBatchFiles), } } From 39d9e86357c1999a4eed1e85ff5f6ff354c2254a Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Fri, 27 Sep 2024 17:03:46 +0530 Subject: [PATCH 13/45] fix: lint --- .../fileconsumer/internal/checkpoint/checkpoint.go | 4 ---- pkg/stanza/fileconsumer/internal/tracker/tracker.go | 13 ++++++------- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/pkg/stanza/fileconsumer/internal/checkpoint/checkpoint.go b/pkg/stanza/fileconsumer/internal/checkpoint/checkpoint.go index 5d638c1720aa..e1ba151dbd4a 100644 --- a/pkg/stanza/fileconsumer/internal/checkpoint/checkpoint.go +++ b/pkg/stanza/fileconsumer/internal/checkpoint/checkpoint.go @@ -47,10 +47,6 @@ func SaveKey(ctx context.Context, persister operator.Persister, rmds []*reader.M // Load loads the most recent set of files to the database func Load(ctx context.Context, persister operator.Persister) ([]*reader.Metadata, error) { - return LoadKey(ctx, persister, knownFilesKey) -} - -func LoadKey(ctx context.Context, persister operator.Persister, key string) ([]*reader.Metadata, error) { encoded, err := persister.Get(ctx, knownFilesKey) if err != nil { return nil, err diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker.go b/pkg/stanza/fileconsumer/internal/tracker/tracker.go index 689c95d6751d..85a6db4663e6 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker.go @@ -57,21 +57,21 @@ type option struct { pollsToArchive int } -type optionFunc func(*option) +type OptionFunc func(*option) -func WithMaxBatchFiles(maxBatchFiles int) optionFunc { +func WithMaxBatchFiles(maxBatchFiles int) OptionFunc { return func(fto *option) { fto.maxBatchFiles = maxBatchFiles } } -func WithPollsToArchive(pollsToArchive int) optionFunc { +func WithPollsToArchive(pollsToArchive int) OptionFunc { return func(fto *option) { fto.pollsToArchive = pollsToArchive } } -func NewFileTracker(set component.TelemetrySettings, opts ...optionFunc) Tracker { +func NewFileTracker(set component.TelemetrySettings, opts ...OptionFunc) Tracker { option := &option{} for _, opt := range opts { opt(option) @@ -195,8 +195,7 @@ func (t *fileTracker) archive(metadata *fileset.Fileset[*reader.Metadata]) { if err := checkpoint.SaveKey(context.Background(), t.persister, metadata.Get(), key); err != nil { t.set.Logger.Error("error faced while saving to the archive", zap.Error(err)) } - t.archiveIndex += 1 // increment the index - t.archiveIndex %= t.pollsToArchive // ring buffer + t.archiveIndex = (t.archiveIndex + 1) % t.pollsToArchive // increment the index } // noStateTracker only tracks the current polled files. Once the poll is @@ -208,7 +207,7 @@ type noStateTracker struct { currentPollFiles *fileset.Fileset[*reader.Reader] } -func NewNoStateTracker(set component.TelemetrySettings, opts ...optionFunc) Tracker { +func NewNoStateTracker(set component.TelemetrySettings, opts ...OptionFunc) Tracker { option := &option{} for _, opt := range opts { opt(option) From 3490e7d28b4d5d6cf5534805135aebc8abd945b5 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Fri, 27 Sep 2024 17:41:58 +0530 Subject: [PATCH 14/45] fix: bug --- pkg/stanza/fileconsumer/internal/checkpoint/checkpoint.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/stanza/fileconsumer/internal/checkpoint/checkpoint.go b/pkg/stanza/fileconsumer/internal/checkpoint/checkpoint.go index e1ba151dbd4a..b9476fb3d5e9 100644 --- a/pkg/stanza/fileconsumer/internal/checkpoint/checkpoint.go +++ b/pkg/stanza/fileconsumer/internal/checkpoint/checkpoint.go @@ -38,7 +38,7 @@ func SaveKey(ctx context.Context, persister operator.Persister, rmds []*reader.M } } - if err := persister.Set(ctx, knownFilesKey, buf.Bytes()); err != nil { + if err := persister.Set(ctx, key, buf.Bytes()); err != nil { errs = append(errs, fmt.Errorf("persist known files: %w", err)) } From 87c2d2a4dcc8ca5d214c19363696255347e4ae4d Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Fri, 27 Sep 2024 17:46:23 +0530 Subject: [PATCH 15/45] chore: rename function --- pkg/stanza/fileconsumer/file.go | 2 +- pkg/stanza/fileconsumer/internal/tracker/tracker.go | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/stanza/fileconsumer/file.go b/pkg/stanza/fileconsumer/file.go index f849be667b7a..4f8c1408e874 100644 --- a/pkg/stanza/fileconsumer/file.go +++ b/pkg/stanza/fileconsumer/file.go @@ -59,7 +59,7 @@ func (m *Manager) Start(persister operator.Persister) error { m.readerFactory.FromBeginning = true m.tracker.LoadMetadata(offsets) } - m.tracker.SetPersister(persister) + m.tracker.EnableArchiving(persister) } else if m.pollsToArchive > 0 { return fmt.Errorf("archiving is not supported in memory, please use a storage extension") } diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker.go b/pkg/stanza/fileconsumer/internal/tracker/tracker.go index 85a6db4663e6..51a2be351acf 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker.go @@ -31,7 +31,7 @@ type Tracker interface { EndPoll() EndConsume() int TotalReaders() int - SetPersister(persister operator.Persister) + EnableArchiving(persister operator.Persister) } // fileTracker tracks known offsets for files that are being consumed by the manager. @@ -148,7 +148,7 @@ func (t *fileTracker) ClosePreviousFiles() (filesClosed int) { return } -func (t *fileTracker) SetPersister(persister operator.Persister) { +func (t *fileTracker) EnableArchiving(persister operator.Persister) { t.persister = persister } @@ -257,4 +257,4 @@ func (t *noStateTracker) EndPoll() {} func (t *noStateTracker) TotalReaders() int { return 0 } -func (t *noStateTracker) SetPersister(operator.Persister) {} +func (t *noStateTracker) EnableArchiving(operator.Persister) {} From b668554f3e37c2f39a1374e01ded7046cfea9f7e Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Fri, 27 Sep 2024 17:47:25 +0530 Subject: [PATCH 16/45] chore: rename function --- pkg/stanza/fileconsumer/file.go | 4 +++- pkg/stanza/fileconsumer/internal/tracker/tracker.go | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pkg/stanza/fileconsumer/file.go b/pkg/stanza/fileconsumer/file.go index 4f8c1408e874..3f1a1723214a 100644 --- a/pkg/stanza/fileconsumer/file.go +++ b/pkg/stanza/fileconsumer/file.go @@ -59,7 +59,9 @@ func (m *Manager) Start(persister operator.Persister) error { m.readerFactory.FromBeginning = true m.tracker.LoadMetadata(offsets) } - m.tracker.EnableArchiving(persister) + if m.pollsToArchive > 0 { + m.tracker.EnableArchiving(persister) + } } else if m.pollsToArchive > 0 { return fmt.Errorf("archiving is not supported in memory, please use a storage extension") } diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker.go b/pkg/stanza/fileconsumer/internal/tracker/tracker.go index 51a2be351acf..a20cb5b69823 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker.go @@ -188,7 +188,7 @@ func (t *fileTracker) archive(metadata *fileset.Fileset[*reader.Metadata]) { // start // index - if t.pollsToArchive == 0 { + if t.pollsToArchive == 0 || t.persister == nil { return } key := fmt.Sprintf("knownFiles%d", t.archiveIndex) From 3f762ed453d4ed9ee61a222e82cdd19f8a7bb00e Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Mon, 30 Sep 2024 20:12:37 +0530 Subject: [PATCH 17/45] chore: cleanup tracker and use options --- pkg/stanza/fileconsumer/config.go | 10 +--- pkg/stanza/fileconsumer/file.go | 8 +++ .../fileconsumer/internal/tracker/tracker.go | 51 ++++++++++++++----- 3 files changed, 46 insertions(+), 23 deletions(-) diff --git a/pkg/stanza/fileconsumer/config.go b/pkg/stanza/fileconsumer/config.go index 2eec1b6b2e79..cbc708ae5f03 100644 --- a/pkg/stanza/fileconsumer/config.go +++ b/pkg/stanza/fileconsumer/config.go @@ -25,7 +25,6 @@ import ( "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/metadata" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/reader" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/scanner" - "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/tracker" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/matcher" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/operator" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/operator/helper" @@ -174,13 +173,6 @@ func (c Config) Build(set component.TelemetrySettings, emit emit.Callback, opts AcquireFSLock: c.AcquireFSLock, } - var t tracker.Tracker - if o.noTracking { - t = tracker.NewNoStateTracker(set, c.MaxConcurrentFiles/2) - } else { - t = tracker.NewFileTracker(set, c.MaxConcurrentFiles/2) - } - telemetryBuilder, err := metadata.NewTelemetryBuilder(set) if err != nil { return nil, err @@ -192,8 +184,8 @@ func (c Config) Build(set component.TelemetrySettings, emit emit.Callback, opts pollInterval: c.PollInterval, maxBatchFiles: c.MaxConcurrentFiles / 2, maxBatches: c.MaxBatches, - tracker: t, telemetryBuilder: telemetryBuilder, + noTracking: o.noTracking, }, nil } diff --git a/pkg/stanza/fileconsumer/file.go b/pkg/stanza/fileconsumer/file.go index d46507ecf3eb..fe32b05aa6dc 100644 --- a/pkg/stanza/fileconsumer/file.go +++ b/pkg/stanza/fileconsumer/file.go @@ -30,6 +30,7 @@ type Manager struct { readerFactory reader.Factory fileMatcher *matcher.Matcher tracker tracker.Tracker + noTracking bool pollInterval time.Duration persister operator.Persister @@ -60,6 +61,9 @@ func (m *Manager) Start(persister operator.Persister) error { } } + // instantiate the tracker + m.instantiateTracker() + // Start polling goroutine m.startPoller(ctx) @@ -261,3 +265,7 @@ func (m *Manager) newReader(ctx context.Context, file *os.File, fp *fingerprint. m.telemetryBuilder.FileconsumerOpenFiles.Add(ctx, 1) return r, nil } + +func (m *Manager) instantiateTracker() { + m.tracker = tracker.NewFileTracker(m.set, tracker.WithMaxBatchFiles(m.maxBatchFiles), tracker.WithNoTracking(m.noTracking)) +} diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker.go b/pkg/stanza/fileconsumer/internal/tracker/tracker.go index 5039003a36ed..43fe724ead30 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker.go @@ -39,17 +39,49 @@ type fileTracker struct { knownFiles []*fileset.Fileset[*reader.Metadata] } -func NewFileTracker(set component.TelemetrySettings, maxBatchFiles int) Tracker { +type option struct { + maxBatchFiles int + pollsToArchive int + noTracking bool +} + +type OptionFunc func(*option) + +func WithMaxBatchFiles(maxBatchFiles int) OptionFunc { + return func(fto *option) { + fto.maxBatchFiles = maxBatchFiles + } +} + +func WithNoTracking(noTracking bool) OptionFunc { + return func(fto *option) { + fto.noTracking = noTracking + } +} + +func NewFileTracker(set component.TelemetrySettings, opts ...OptionFunc) Tracker { + option := &option{} + for _, opt := range opts { + opt(option) + } + if option.noTracking { + return &noStateTracker{ + set: set, + maxBatchFiles: option.maxBatchFiles, + currentPollFiles: fileset.New[*reader.Reader](option.maxBatchFiles), + } + } + knownFiles := make([]*fileset.Fileset[*reader.Metadata], 3) for i := 0; i < len(knownFiles); i++ { - knownFiles[i] = fileset.New[*reader.Metadata](maxBatchFiles) + knownFiles[i] = fileset.New[*reader.Metadata](option.maxBatchFiles) } set.Logger = set.Logger.With(zap.String("tracker", "fileTracker")) return &fileTracker{ set: set, - maxBatchFiles: maxBatchFiles, - currentPollFiles: fileset.New[*reader.Reader](maxBatchFiles), - previousPollFiles: fileset.New[*reader.Reader](maxBatchFiles), + maxBatchFiles: option.maxBatchFiles, + currentPollFiles: fileset.New[*reader.Reader](option.maxBatchFiles), + previousPollFiles: fileset.New[*reader.Reader](option.maxBatchFiles), knownFiles: knownFiles, } } @@ -134,15 +166,6 @@ type noStateTracker struct { currentPollFiles *fileset.Fileset[*reader.Reader] } -func NewNoStateTracker(set component.TelemetrySettings, maxBatchFiles int) Tracker { - set.Logger = set.Logger.With(zap.String("tracker", "noStateTracker")) - return &noStateTracker{ - set: set, - maxBatchFiles: maxBatchFiles, - currentPollFiles: fileset.New[*reader.Reader](maxBatchFiles), - } -} - func (t *noStateTracker) Add(reader *reader.Reader) { // add a new reader for tracking t.currentPollFiles.Add(reader) From 66824c63149e52d540f263280bc11aa2dd851bc1 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Mon, 30 Sep 2024 20:17:12 +0530 Subject: [PATCH 18/45] fix: move function before loading --- pkg/stanza/fileconsumer/file.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/stanza/fileconsumer/file.go b/pkg/stanza/fileconsumer/file.go index fe32b05aa6dc..dc1520a80371 100644 --- a/pkg/stanza/fileconsumer/file.go +++ b/pkg/stanza/fileconsumer/file.go @@ -48,6 +48,9 @@ func (m *Manager) Start(persister operator.Persister) error { m.set.Logger.Warn("finding files", zap.Error(err)) } + // instantiate the tracker + m.instantiateTracker() + if persister != nil { m.persister = persister offsets, err := checkpoint.Load(ctx, m.persister) @@ -61,9 +64,6 @@ func (m *Manager) Start(persister operator.Persister) error { } } - // instantiate the tracker - m.instantiateTracker() - // Start polling goroutine m.startPoller(ctx) From ef2e53aebe3afcaba12017fcd3e70d2cc7be66c1 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Tue, 1 Oct 2024 00:31:18 +0530 Subject: [PATCH 19/45] chore: log the error --- pkg/stanza/fileconsumer/file.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/stanza/fileconsumer/file.go b/pkg/stanza/fileconsumer/file.go index 3f1a1723214a..c3fa0ee3b987 100644 --- a/pkg/stanza/fileconsumer/file.go +++ b/pkg/stanza/fileconsumer/file.go @@ -63,7 +63,7 @@ func (m *Manager) Start(persister operator.Persister) error { m.tracker.EnableArchiving(persister) } } else if m.pollsToArchive > 0 { - return fmt.Errorf("archiving is not supported in memory, please use a storage extension") + m.set.Logger.Error("archiving is not supported in memory, please use a storage extension") } // Start polling goroutine From f009e71b9a340f1b9fad11c45538d9eddc8ccce6 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Tue, 1 Oct 2024 00:24:27 +0530 Subject: [PATCH 20/45] chore: lint, ci --- pkg/stanza/fileconsumer/file.go | 4 +++- pkg/stanza/fileconsumer/internal/tracker/tracker.go | 5 ++--- pkg/stanza/fileconsumer/util_test.go | 2 ++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pkg/stanza/fileconsumer/file.go b/pkg/stanza/fileconsumer/file.go index dc1520a80371..4f590237a713 100644 --- a/pkg/stanza/fileconsumer/file.go +++ b/pkg/stanza/fileconsumer/file.go @@ -77,7 +77,9 @@ func (m *Manager) Stop() error { m.cancel = nil } m.wg.Wait() - m.telemetryBuilder.FileconsumerOpenFiles.Add(context.TODO(), int64(0-m.tracker.ClosePreviousFiles())) + if m.tracker != nil { + m.telemetryBuilder.FileconsumerOpenFiles.Add(context.TODO(), int64(0-m.tracker.ClosePreviousFiles())) + } if m.persister != nil { if err := checkpoint.Save(context.Background(), m.persister, m.tracker.GetMetadata()); err != nil { m.set.Logger.Error("save offsets", zap.Error(err)) diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker.go b/pkg/stanza/fileconsumer/internal/tracker/tracker.go index 43fe724ead30..4a5d1fd9ff3c 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker.go @@ -40,9 +40,8 @@ type fileTracker struct { } type option struct { - maxBatchFiles int - pollsToArchive int - noTracking bool + maxBatchFiles int + noTracking bool } type OptionFunc func(*option) diff --git a/pkg/stanza/fileconsumer/util_test.go b/pkg/stanza/fileconsumer/util_test.go index 19d465a6e43b..46d0e842839c 100644 --- a/pkg/stanza/fileconsumer/util_test.go +++ b/pkg/stanza/fileconsumer/util_test.go @@ -10,6 +10,7 @@ import ( "go.opentelemetry.io/collector/component/componenttest" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/emittest" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/tracker" ) func testManager(t *testing.T, cfg *Config, opts ...Option) (*Manager, *emittest.Sink) { @@ -20,6 +21,7 @@ func testManager(t *testing.T, cfg *Config, opts ...Option) (*Manager, *emittest func testManagerWithSink(t *testing.T, cfg *Config, sink *emittest.Sink, opts ...Option) *Manager { set := componenttest.NewNopTelemetrySettings() input, err := cfg.Build(set, sink.Callback, opts...) + input.tracker = tracker.NewFileTracker(set) require.NoError(t, err) t.Cleanup(func() { input.tracker.ClosePreviousFiles() }) return input From 79ce0e3adbadd27b534649a0bfed2a98f67fdb6b Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Tue, 1 Oct 2024 01:24:26 +0530 Subject: [PATCH 21/45] chore: remove redundant argument --- pkg/stanza/fileconsumer/file.go | 6 +++++- pkg/stanza/fileconsumer/internal/tracker/tracker.go | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pkg/stanza/fileconsumer/file.go b/pkg/stanza/fileconsumer/file.go index 4f590237a713..42386d0c82f7 100644 --- a/pkg/stanza/fileconsumer/file.go +++ b/pkg/stanza/fileconsumer/file.go @@ -269,5 +269,9 @@ func (m *Manager) newReader(ctx context.Context, file *os.File, fp *fingerprint. } func (m *Manager) instantiateTracker() { - m.tracker = tracker.NewFileTracker(m.set, tracker.WithMaxBatchFiles(m.maxBatchFiles), tracker.WithNoTracking(m.noTracking)) + opts := []tracker.OptionFunc{tracker.WithMaxBatchFiles(m.maxBatchFiles)} + if m.noTracking { + opts = append(opts, tracker.WithNoTracking()) + } + m.tracker = tracker.NewFileTracker(m.set, opts...) } diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker.go b/pkg/stanza/fileconsumer/internal/tracker/tracker.go index 4a5d1fd9ff3c..2c92ec095f4a 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker.go @@ -52,9 +52,9 @@ func WithMaxBatchFiles(maxBatchFiles int) OptionFunc { } } -func WithNoTracking(noTracking bool) OptionFunc { +func WithNoTracking() OptionFunc { return func(fto *option) { - fto.noTracking = noTracking + fto.noTracking = true } } From cae05c7c7a6e8b526301309257833a2a1baf20d4 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Thu, 3 Oct 2024 01:23:06 +0530 Subject: [PATCH 22/45] chore: remove redundant code --- pkg/stanza/fileconsumer/file.go | 7 ++----- pkg/stanza/fileconsumer/internal/tracker/tracker.go | 3 ++- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/pkg/stanza/fileconsumer/file.go b/pkg/stanza/fileconsumer/file.go index 2c72104bb172..51e30fc89b3d 100644 --- a/pkg/stanza/fileconsumer/file.go +++ b/pkg/stanza/fileconsumer/file.go @@ -63,9 +63,6 @@ func (m *Manager) Start(persister operator.Persister) error { m.readerFactory.FromBeginning = true m.tracker.LoadMetadata(offsets) } - if m.pollsToArchive > 0 { - m.tracker.EnableArchiving(persister) - } } else if m.pollsToArchive > 0 { m.set.Logger.Error("archiving is not supported in memory, please use a storage extension") } @@ -276,8 +273,8 @@ func (m *Manager) newReader(ctx context.Context, file *os.File, fp *fingerprint. func (m *Manager) instantiateTracker() { opts := []tracker.OptionFunc{tracker.WithMaxBatchFiles(m.maxBatchFiles)} - if m.noTracking { - opts = append(opts, tracker.WithNoTracking()) + if m.pollsToArchive > 0 { + opts = append(opts, tracker.WithPollsToArchive(m.pollsToArchive), tracker.WithPersister(m.persister)) } m.tracker = tracker.NewFileTracker(m.set, opts...) } diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker.go b/pkg/stanza/fileconsumer/internal/tracker/tracker.go index 3df8c3667068..60e6fed9db21 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker.go @@ -94,6 +94,7 @@ func NewFileTracker(set component.TelemetrySettings, opts ...OptionFunc) Tracker previousPollFiles: fileset.New[*reader.Reader](option.maxBatchFiles), knownFiles: knownFiles, pollsToArchive: option.pollsToArchive, + persister: option.persister, archiveIndex: 0, } } @@ -190,7 +191,7 @@ func (t *fileTracker) archive(metadata *fileset.Fileset[*reader.Metadata]) { // start // index - if t.pollsToArchive == 0 || t.persister == nil { + if t.pollsToArchive <= 0 || t.persister == nil { return } key := fmt.Sprintf("knownFiles%d", t.archiveIndex) From 25d923b489f5a147cff2b06c9828da961164b6fc Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Thu, 3 Oct 2024 01:32:55 +0530 Subject: [PATCH 23/45] chore: add new no tracking --- pkg/stanza/fileconsumer/file.go | 3 +++ .../fileconsumer/internal/tracker/tracker.go | 27 ++++++++++--------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/pkg/stanza/fileconsumer/file.go b/pkg/stanza/fileconsumer/file.go index 51e30fc89b3d..b15fae540895 100644 --- a/pkg/stanza/fileconsumer/file.go +++ b/pkg/stanza/fileconsumer/file.go @@ -273,6 +273,9 @@ func (m *Manager) newReader(ctx context.Context, file *os.File, fp *fingerprint. func (m *Manager) instantiateTracker() { opts := []tracker.OptionFunc{tracker.WithMaxBatchFiles(m.maxBatchFiles)} + if m.noTracking { + opts = append(opts, tracker.WithNoTracking()) + } if m.pollsToArchive > 0 { opts = append(opts, tracker.WithPollsToArchive(m.pollsToArchive), tracker.WithPersister(m.persister)) } diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker.go b/pkg/stanza/fileconsumer/internal/tracker/tracker.go index 60e6fed9db21..d5034d8357b8 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker.go @@ -52,6 +52,7 @@ type fileTracker struct { } type option struct { + noTracking bool maxBatchFiles int pollsToArchive int persister operator.Persister @@ -77,11 +78,24 @@ func WithPersister(persister operator.Persister) OptionFunc { } } +func WithNoTracking() OptionFunc { + return func(fto *option) { + fto.noTracking = true + } +} + func NewFileTracker(set component.TelemetrySettings, opts ...OptionFunc) Tracker { option := &option{} for _, opt := range opts { opt(option) } + if option.noTracking { + return &noStateTracker{ + set: set, + maxBatchFiles: option.maxBatchFiles, + currentPollFiles: fileset.New[*reader.Reader](option.maxBatchFiles), + } + } knownFiles := make([]*fileset.Fileset[*reader.Metadata], 3) for i := 0; i < len(knownFiles); i++ { knownFiles[i] = fileset.New[*reader.Metadata](option.maxBatchFiles) @@ -210,19 +224,6 @@ type noStateTracker struct { currentPollFiles *fileset.Fileset[*reader.Reader] } -func NewNoStateTracker(set component.TelemetrySettings, opts ...OptionFunc) Tracker { - option := &option{} - for _, opt := range opts { - opt(option) - } - set.Logger = set.Logger.With(zap.String("tracker", "noStateTracker")) - return &noStateTracker{ - set: set, - maxBatchFiles: option.maxBatchFiles, - currentPollFiles: fileset.New[*reader.Reader](option.maxBatchFiles), - } -} - func (t *noStateTracker) Add(reader *reader.Reader) { // add a new reader for tracking t.currentPollFiles.Add(reader) From 4bc21507c7a81f20b15b06e0f2f4777492d329b5 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Thu, 3 Oct 2024 02:13:45 +0530 Subject: [PATCH 24/45] fix: pass persister instead of m.persister --- pkg/stanza/fileconsumer/file.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/stanza/fileconsumer/file.go b/pkg/stanza/fileconsumer/file.go index b15fae540895..8d12388b86f1 100644 --- a/pkg/stanza/fileconsumer/file.go +++ b/pkg/stanza/fileconsumer/file.go @@ -50,7 +50,7 @@ func (m *Manager) Start(persister operator.Persister) error { } // instantiate the tracker - m.instantiateTracker() + m.instantiateTracker(persister) if persister != nil { m.persister = persister @@ -271,13 +271,13 @@ func (m *Manager) newReader(ctx context.Context, file *os.File, fp *fingerprint. return r, nil } -func (m *Manager) instantiateTracker() { +func (m *Manager) instantiateTracker(persister operator.Persister) { opts := []tracker.OptionFunc{tracker.WithMaxBatchFiles(m.maxBatchFiles)} if m.noTracking { opts = append(opts, tracker.WithNoTracking()) } if m.pollsToArchive > 0 { - opts = append(opts, tracker.WithPollsToArchive(m.pollsToArchive), tracker.WithPersister(m.persister)) + opts = append(opts, tracker.WithPollsToArchive(m.pollsToArchive), tracker.WithPersister(persister)) } m.tracker = tracker.NewFileTracker(m.set, opts...) } From 9006e3f793ccf8aff60e0004a6a7df66c44892c5 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Thu, 3 Oct 2024 19:11:13 +0530 Subject: [PATCH 25/45] chore: remove options --- pkg/stanza/fileconsumer/file.go | 11 ++- .../fileconsumer/internal/tracker/tracker.go | 67 +++++-------------- 2 files changed, 21 insertions(+), 57 deletions(-) diff --git a/pkg/stanza/fileconsumer/file.go b/pkg/stanza/fileconsumer/file.go index 8d12388b86f1..c5675a90a063 100644 --- a/pkg/stanza/fileconsumer/file.go +++ b/pkg/stanza/fileconsumer/file.go @@ -272,12 +272,11 @@ func (m *Manager) newReader(ctx context.Context, file *os.File, fp *fingerprint. } func (m *Manager) instantiateTracker(persister operator.Persister) { - opts := []tracker.OptionFunc{tracker.WithMaxBatchFiles(m.maxBatchFiles)} + var t tracker.Tracker if m.noTracking { - opts = append(opts, tracker.WithNoTracking()) + t = tracker.NewNoStateTracker(m.set, m.maxBatchFiles) + } else { + t = tracker.NewFileTracker(m.set, m.maxBatchFiles, m.pollsToArchive, persister) } - if m.pollsToArchive > 0 { - opts = append(opts, tracker.WithPollsToArchive(m.pollsToArchive), tracker.WithPersister(persister)) - } - m.tracker = tracker.NewFileTracker(m.set, opts...) + m.tracker = t } diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker.go b/pkg/stanza/fileconsumer/internal/tracker/tracker.go index d5034d8357b8..54bf5e9e12c1 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker.go @@ -51,64 +51,20 @@ type fileTracker struct { archiveIndex int } -type option struct { - noTracking bool - maxBatchFiles int - pollsToArchive int - persister operator.Persister -} - -type OptionFunc func(*option) - -func WithMaxBatchFiles(maxBatchFiles int) OptionFunc { - return func(fto *option) { - fto.maxBatchFiles = maxBatchFiles - } -} - -func WithPollsToArchive(pollsToArchive int) OptionFunc { - return func(fto *option) { - fto.pollsToArchive = pollsToArchive - } -} - -func WithPersister(persister operator.Persister) OptionFunc { - return func(fto *option) { - fto.persister = persister - } -} - -func WithNoTracking() OptionFunc { - return func(fto *option) { - fto.noTracking = true - } -} - -func NewFileTracker(set component.TelemetrySettings, opts ...OptionFunc) Tracker { - option := &option{} - for _, opt := range opts { - opt(option) - } - if option.noTracking { - return &noStateTracker{ - set: set, - maxBatchFiles: option.maxBatchFiles, - currentPollFiles: fileset.New[*reader.Reader](option.maxBatchFiles), - } - } +func NewFileTracker(set component.TelemetrySettings, maxBatchFiles int, pollsToArchive int, persister operator.Persister) Tracker { knownFiles := make([]*fileset.Fileset[*reader.Metadata], 3) for i := 0; i < len(knownFiles); i++ { - knownFiles[i] = fileset.New[*reader.Metadata](option.maxBatchFiles) + knownFiles[i] = fileset.New[*reader.Metadata](maxBatchFiles) } set.Logger = set.Logger.With(zap.String("tracker", "fileTracker")) return &fileTracker{ set: set, - maxBatchFiles: option.maxBatchFiles, - currentPollFiles: fileset.New[*reader.Reader](option.maxBatchFiles), - previousPollFiles: fileset.New[*reader.Reader](option.maxBatchFiles), + maxBatchFiles: maxBatchFiles, + currentPollFiles: fileset.New[*reader.Reader](maxBatchFiles), + previousPollFiles: fileset.New[*reader.Reader](maxBatchFiles), knownFiles: knownFiles, - pollsToArchive: option.pollsToArchive, - persister: option.persister, + pollsToArchive: pollsToArchive, + persister: persister, archiveIndex: 0, } } @@ -224,6 +180,15 @@ type noStateTracker struct { currentPollFiles *fileset.Fileset[*reader.Reader] } +func NewNoStateTracker(set component.TelemetrySettings, maxBatchFiles int) Tracker { + set.Logger = set.Logger.With(zap.String("tracker", "noStateTracker")) + return &noStateTracker{ + set: set, + maxBatchFiles: maxBatchFiles, + currentPollFiles: fileset.New[*reader.Reader](maxBatchFiles), + } +} + func (t *noStateTracker) Add(reader *reader.Reader) { // add a new reader for tracking t.currentPollFiles.Add(reader) From 4413322ada66cf9df6e48600abcb8e2e8aed3fad Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Thu, 3 Oct 2024 19:13:26 +0530 Subject: [PATCH 26/45] chore: fix tests --- pkg/stanza/fileconsumer/util_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/stanza/fileconsumer/util_test.go b/pkg/stanza/fileconsumer/util_test.go index 46d0e842839c..69bb92ca26cd 100644 --- a/pkg/stanza/fileconsumer/util_test.go +++ b/pkg/stanza/fileconsumer/util_test.go @@ -11,6 +11,7 @@ import ( "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/emittest" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/tracker" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/testutil" ) func testManager(t *testing.T, cfg *Config, opts ...Option) (*Manager, *emittest.Sink) { @@ -21,7 +22,7 @@ func testManager(t *testing.T, cfg *Config, opts ...Option) (*Manager, *emittest func testManagerWithSink(t *testing.T, cfg *Config, sink *emittest.Sink, opts ...Option) *Manager { set := componenttest.NewNopTelemetrySettings() input, err := cfg.Build(set, sink.Callback, opts...) - input.tracker = tracker.NewFileTracker(set) + input.tracker = tracker.NewFileTracker(set, cfg.MaxBatches, cfg.PollsToArchive, testutil.NewUnscopedMockPersister()) require.NoError(t, err) t.Cleanup(func() { input.tracker.ClosePreviousFiles() }) return input From 79b48c1f796024524aee3fc977416536b409ab09 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Tue, 8 Oct 2024 21:58:56 +0530 Subject: [PATCH 27/45] initial read commit --- pkg/stanza/fileconsumer/file.go | 1 + .../internal/checkpoint/checkpoint.go | 7 +++- .../fileconsumer/internal/reader/factory.go | 7 +++- .../fileconsumer/internal/reader/reader.go | 9 ++++ .../fileconsumer/internal/tracker/tracker.go | 41 +++++++++++++++++++ 5 files changed, 63 insertions(+), 2 deletions(-) diff --git a/pkg/stanza/fileconsumer/file.go b/pkg/stanza/fileconsumer/file.go index c5675a90a063..e351452063bd 100644 --- a/pkg/stanza/fileconsumer/file.go +++ b/pkg/stanza/fileconsumer/file.go @@ -230,6 +230,7 @@ func (m *Manager) makeReaders(ctx context.Context, paths []string) { m.tracker.Add(r) } + m.tracker.SyncOffsets() } func (m *Manager) newReader(ctx context.Context, file *os.File, fp *fingerprint.Fingerprint) (*reader.Reader, error) { diff --git a/pkg/stanza/fileconsumer/internal/checkpoint/checkpoint.go b/pkg/stanza/fileconsumer/internal/checkpoint/checkpoint.go index b9476fb3d5e9..73f3c55c12b2 100644 --- a/pkg/stanza/fileconsumer/internal/checkpoint/checkpoint.go +++ b/pkg/stanza/fileconsumer/internal/checkpoint/checkpoint.go @@ -46,8 +46,13 @@ func SaveKey(ctx context.Context, persister operator.Persister, rmds []*reader.M } // Load loads the most recent set of files to the database + +// Save syncs the most recent set of files to the database func Load(ctx context.Context, persister operator.Persister) ([]*reader.Metadata, error) { - encoded, err := persister.Get(ctx, knownFilesKey) + return LoadKey(ctx, persister, knownFilesKey) +} +func LoadKey(ctx context.Context, persister operator.Persister, key string) ([]*reader.Metadata, error) { + encoded, err := persister.Get(ctx, key) if err != nil { return nil, err } diff --git a/pkg/stanza/fileconsumer/internal/reader/factory.go b/pkg/stanza/fileconsumer/internal/reader/factory.go index 7287ca40dae7..97f286db796d 100644 --- a/pkg/stanza/fileconsumer/internal/reader/factory.go +++ b/pkg/stanza/fileconsumer/internal/reader/factory.go @@ -60,7 +60,12 @@ func (f *Factory) NewReader(file *os.File, fp *fingerprint.Fingerprint) (*Reader if f.FlushTimeout > 0 { m.FlushState = &flush.State{LastDataChange: time.Now()} } - return f.NewReaderFromMetadata(file, m) + r, err := f.NewReaderFromMetadata(file, m) + if err != nil { + return nil, err + } + r.new = true // indicates that a reader is new (no previously known offset) + return r, nil } func (f *Factory) NewReaderFromMetadata(file *os.File, m *Metadata) (r *Reader, err error) { diff --git a/pkg/stanza/fileconsumer/internal/reader/reader.go b/pkg/stanza/fileconsumer/internal/reader/reader.go index a0c93a63a0f9..5a3c0414339f 100644 --- a/pkg/stanza/fileconsumer/internal/reader/reader.go +++ b/pkg/stanza/fileconsumer/internal/reader/reader.go @@ -53,6 +53,7 @@ type Reader struct { includeFileRecordNum bool compression string acquireFSLock bool + new bool // indicates that a reader is new (no previously known offset) } // ReadToEnd will read until the end of the file @@ -239,6 +240,9 @@ func (r *Reader) GetFileName() string { return r.fileName } +func (r *Reader) IsNew() bool { + return r.new +} func (m Metadata) GetFingerprint() *fingerprint.Fingerprint { return m.Fingerprint } @@ -257,3 +261,8 @@ func (r *Reader) updateFingerprint() { } r.Fingerprint = refreshedFingerprint } + +func (r *Reader) SyncMetadata(m *Metadata) { + r.Metadata = m + r.new = false +} diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker.go b/pkg/stanza/fileconsumer/internal/tracker/tracker.go index 54bf5e9e12c1..bc6c31b31663 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker.go @@ -31,6 +31,7 @@ type Tracker interface { EndPoll() EndConsume() int TotalReaders() int + SyncOffsets() } // fileTracker tracks known offsets for files that are being consumed by the manager. @@ -171,6 +172,44 @@ func (t *fileTracker) archive(metadata *fileset.Fileset[*reader.Metadata]) { t.archiveIndex = (t.archiveIndex + 1) % t.pollsToArchive // increment the index } +func (t *fileTracker) readArchive(readIndex int) (*fileset.Fileset[*reader.Metadata], error) { + key := fmt.Sprintf("knownFiles%d", readIndex) + metadata, err := checkpoint.LoadKey(context.Background(), t.persister, key) + if err != nil { + return nil, err + } + f := fileset.New[*reader.Metadata](len(metadata)) + f.Add(metadata...) + return f, nil +} + +func (t *fileTracker) updateArchive(readIndex int, rmds *fileset.Fileset[*reader.Metadata]) error { + key := fmt.Sprintf("knownFiles%d", readIndex) + return checkpoint.SaveKey(context.Background(), t.persister, rmds.Get(), key) +} + +func (t *fileTracker) SyncOffsets() { + archiveReadIndex := 0 + for i := 0; i < t.pollsToArchive; i++ { + newFound := false + data, _ := t.readArchive(archiveReadIndex) + for _, v := range t.currentPollFiles.Get() { + if v.IsNew() { + newFound = true + if md := data.Match(v.GetFingerprint(), fileset.StartsWith); md != nil { + v.SyncMetadata(md) + } + } + } + if !newFound { + // no new reader exists. No need to walk through archive. Just exit to save time + break + } + t.updateArchive(archiveReadIndex, data) + } + +} + // noStateTracker only tracks the current polled files. Once the poll is // complete and telemetry is consumed, the tracked files are closed. The next // poll will create fresh readers with no previously tracked offsets. @@ -225,3 +264,5 @@ func (t *noStateTracker) ClosePreviousFiles() int { return 0 } func (t *noStateTracker) EndPoll() {} func (t *noStateTracker) TotalReaders() int { return 0 } + +func (t *noStateTracker) SyncOffsets() {} From e284eb175e91ed3a99100db5e927eb7c0502dd18 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Mon, 14 Oct 2024 18:52:13 +0530 Subject: [PATCH 28/45] improve readablity --- pkg/stanza/fileconsumer/internal/tracker/tracker.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker.go b/pkg/stanza/fileconsumer/internal/tracker/tracker.go index bc6c31b31663..97ae089690a6 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker.go @@ -165,8 +165,7 @@ func (t *fileTracker) archive(metadata *fileset.Fileset[*reader.Metadata]) { if t.pollsToArchive <= 0 || t.persister == nil { return } - key := fmt.Sprintf("knownFiles%d", t.archiveIndex) - if err := checkpoint.SaveKey(context.Background(), t.persister, metadata.Get(), key); err != nil { + if err := t.updateArchive(t.archiveIndex, metadata); err != nil { t.set.Logger.Error("error faced while saving to the archive", zap.Error(err)) } t.archiveIndex = (t.archiveIndex + 1) % t.pollsToArchive // increment the index @@ -183,8 +182,8 @@ func (t *fileTracker) readArchive(readIndex int) (*fileset.Fileset[*reader.Metad return f, nil } -func (t *fileTracker) updateArchive(readIndex int, rmds *fileset.Fileset[*reader.Metadata]) error { - key := fmt.Sprintf("knownFiles%d", readIndex) +func (t *fileTracker) updateArchive(index int, rmds *fileset.Fileset[*reader.Metadata]) error { + key := fmt.Sprintf("knownFiles%d", index) return checkpoint.SaveKey(context.Background(), t.persister, rmds.Get(), key) } From 0b8a00d32f030c6a22914aeb8fa38827c13bb9b5 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Mon, 14 Oct 2024 18:56:04 +0530 Subject: [PATCH 29/45] improve readablity --- .../internal/checkpoint/checkpoint.go | 2 -- .../fileconsumer/internal/tracker/tracker.go | 16 ++++++++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/pkg/stanza/fileconsumer/internal/checkpoint/checkpoint.go b/pkg/stanza/fileconsumer/internal/checkpoint/checkpoint.go index 73f3c55c12b2..4f48d820ad9b 100644 --- a/pkg/stanza/fileconsumer/internal/checkpoint/checkpoint.go +++ b/pkg/stanza/fileconsumer/internal/checkpoint/checkpoint.go @@ -46,8 +46,6 @@ func SaveKey(ctx context.Context, persister operator.Persister, rmds []*reader.M } // Load loads the most recent set of files to the database - -// Save syncs the most recent set of files to the database func Load(ctx context.Context, persister operator.Persister) ([]*reader.Metadata, error) { return LoadKey(ctx, persister, knownFilesKey) } diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker.go b/pkg/stanza/fileconsumer/internal/tracker/tracker.go index 97ae089690a6..0339c3b79cbc 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker.go @@ -171,8 +171,8 @@ func (t *fileTracker) archive(metadata *fileset.Fileset[*reader.Metadata]) { t.archiveIndex = (t.archiveIndex + 1) % t.pollsToArchive // increment the index } -func (t *fileTracker) readArchive(readIndex int) (*fileset.Fileset[*reader.Metadata], error) { - key := fmt.Sprintf("knownFiles%d", readIndex) +func (t *fileTracker) readArchive(index int) (*fileset.Fileset[*reader.Metadata], error) { + key := fmt.Sprintf("knownFiles%d", index) metadata, err := checkpoint.LoadKey(context.Background(), t.persister, key) if err != nil { return nil, err @@ -188,7 +188,12 @@ func (t *fileTracker) updateArchive(index int, rmds *fileset.Fileset[*reader.Met } func (t *fileTracker) SyncOffsets() { - archiveReadIndex := 0 + // SyncOffsets goes through all new (unmatched) readers and updates the metadata, if found on archive. + + // To minimize disk access, we first access the index, then review unmatched readers and synchronize their metadata if a match is found. + // We exit if no new reader exists. + + archiveReadIndex := t.archiveIndex - 1 // try loading most recently written index and iterate backwards for i := 0; i < t.pollsToArchive; i++ { newFound := false data, _ := t.readArchive(archiveReadIndex) @@ -201,10 +206,13 @@ func (t *fileTracker) SyncOffsets() { } } if !newFound { - // no new reader exists. No need to walk through archive. Just exit to save time + // No new reader is available, so there’s no need to go through the rest of the archive. + // Just exit to save time. break } t.updateArchive(archiveReadIndex, data) + + archiveReadIndex = (archiveReadIndex - 1) % t.pollsToArchive } } From d00b376240498262e43ac0ec16ec2ca49d28487f Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Tue, 15 Oct 2024 19:27:21 +0530 Subject: [PATCH 30/45] comments --- pkg/stanza/fileconsumer/internal/tracker/tracker.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker.go b/pkg/stanza/fileconsumer/internal/tracker/tracker.go index 0339c3b79cbc..f91098d6c933 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker.go @@ -172,6 +172,7 @@ func (t *fileTracker) archive(metadata *fileset.Fileset[*reader.Metadata]) { } func (t *fileTracker) readArchive(index int) (*fileset.Fileset[*reader.Metadata], error) { + // readArchive loads data from the archive for a given index and returns a fileset.Filset. key := fmt.Sprintf("knownFiles%d", index) metadata, err := checkpoint.LoadKey(context.Background(), t.persister, key) if err != nil { @@ -183,6 +184,7 @@ func (t *fileTracker) readArchive(index int) (*fileset.Fileset[*reader.Metadata] } func (t *fileTracker) updateArchive(index int, rmds *fileset.Fileset[*reader.Metadata]) error { + // updateArchive saves data to the archive for a given index and returns an error, if encountered. key := fmt.Sprintf("knownFiles%d", index) return checkpoint.SaveKey(context.Background(), t.persister, rmds.Get(), key) } From 2936928b5cc24042b25e3152433f611d7f16c7f6 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Tue, 22 Oct 2024 23:17:23 +0530 Subject: [PATCH 31/45] lint --- pkg/stanza/fileconsumer/internal/tracker/tracker.go | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker.go b/pkg/stanza/fileconsumer/internal/tracker/tracker.go index f91098d6c933..25041419f495 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker.go @@ -198,7 +198,11 @@ func (t *fileTracker) SyncOffsets() { archiveReadIndex := t.archiveIndex - 1 // try loading most recently written index and iterate backwards for i := 0; i < t.pollsToArchive; i++ { newFound := false - data, _ := t.readArchive(archiveReadIndex) + data, err := t.readArchive(archiveReadIndex) + if err != nil { + t.set.Logger.Error("error while opening archive", zap.Error(err)) + continue + } for _, v := range t.currentPollFiles.Get() { if v.IsNew() { newFound = true @@ -212,7 +216,10 @@ func (t *fileTracker) SyncOffsets() { // Just exit to save time. break } - t.updateArchive(archiveReadIndex, data) + if err := t.updateArchive(archiveReadIndex, data); err != nil { + t.set.Logger.Error("error while opening archive", zap.Error(err)) + continue + } archiveReadIndex = (archiveReadIndex - 1) % t.pollsToArchive } From fbf35c59d78e5e4e101c67a142353ea6233aa045 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Sat, 2 Nov 2024 04:09:14 +0530 Subject: [PATCH 32/45] chore: improve logic --- pkg/stanza/fileconsumer/file.go | 1 - .../fileconsumer/internal/reader/factory.go | 7 +- .../fileconsumer/internal/reader/reader.go | 9 --- .../fileconsumer/internal/tracker/tracker.go | 67 ++++++++++--------- 4 files changed, 38 insertions(+), 46 deletions(-) diff --git a/pkg/stanza/fileconsumer/file.go b/pkg/stanza/fileconsumer/file.go index e351452063bd..c5675a90a063 100644 --- a/pkg/stanza/fileconsumer/file.go +++ b/pkg/stanza/fileconsumer/file.go @@ -230,7 +230,6 @@ func (m *Manager) makeReaders(ctx context.Context, paths []string) { m.tracker.Add(r) } - m.tracker.SyncOffsets() } func (m *Manager) newReader(ctx context.Context, file *os.File, fp *fingerprint.Fingerprint) (*reader.Reader, error) { diff --git a/pkg/stanza/fileconsumer/internal/reader/factory.go b/pkg/stanza/fileconsumer/internal/reader/factory.go index 97f286db796d..7287ca40dae7 100644 --- a/pkg/stanza/fileconsumer/internal/reader/factory.go +++ b/pkg/stanza/fileconsumer/internal/reader/factory.go @@ -60,12 +60,7 @@ func (f *Factory) NewReader(file *os.File, fp *fingerprint.Fingerprint) (*Reader if f.FlushTimeout > 0 { m.FlushState = &flush.State{LastDataChange: time.Now()} } - r, err := f.NewReaderFromMetadata(file, m) - if err != nil { - return nil, err - } - r.new = true // indicates that a reader is new (no previously known offset) - return r, nil + return f.NewReaderFromMetadata(file, m) } func (f *Factory) NewReaderFromMetadata(file *os.File, m *Metadata) (r *Reader, err error) { diff --git a/pkg/stanza/fileconsumer/internal/reader/reader.go b/pkg/stanza/fileconsumer/internal/reader/reader.go index 5a3c0414339f..a0c93a63a0f9 100644 --- a/pkg/stanza/fileconsumer/internal/reader/reader.go +++ b/pkg/stanza/fileconsumer/internal/reader/reader.go @@ -53,7 +53,6 @@ type Reader struct { includeFileRecordNum bool compression string acquireFSLock bool - new bool // indicates that a reader is new (no previously known offset) } // ReadToEnd will read until the end of the file @@ -240,9 +239,6 @@ func (r *Reader) GetFileName() string { return r.fileName } -func (r *Reader) IsNew() bool { - return r.new -} func (m Metadata) GetFingerprint() *fingerprint.Fingerprint { return m.Fingerprint } @@ -261,8 +257,3 @@ func (r *Reader) updateFingerprint() { } r.Fingerprint = refreshedFingerprint } - -func (r *Reader) SyncMetadata(m *Metadata) { - r.Metadata = m - r.new = false -} diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker.go b/pkg/stanza/fileconsumer/internal/tracker/tracker.go index 25041419f495..08960990ebba 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker.go @@ -6,6 +6,7 @@ package tracker // import "github.com/open-telemetry/opentelemetry-collector-con import ( "context" "fmt" + "os" "go.opentelemetry.io/collector/component" "go.uber.org/zap" @@ -17,6 +18,14 @@ import ( "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/operator" ) +// A convenience struct that holds a file fingerprint and its associated metadata. +// This will be used during reader creation after the FindFiles() method. +type Record struct { + File *os.File + Fingerprint *fingerprint.Fingerprint + Metadata *reader.Metadata +} + // Interface for tracking files that are being consumed. type Tracker interface { Add(reader *reader.Reader) @@ -31,7 +40,7 @@ type Tracker interface { EndPoll() EndConsume() int TotalReaders() int - SyncOffsets() + FindFiles(records []*Record) } // fileTracker tracks known offsets for files that are being consumed by the manager. @@ -165,14 +174,14 @@ func (t *fileTracker) archive(metadata *fileset.Fileset[*reader.Metadata]) { if t.pollsToArchive <= 0 || t.persister == nil { return } - if err := t.updateArchive(t.archiveIndex, metadata); err != nil { + if err := t.writeArchive(t.archiveIndex, metadata); err != nil { t.set.Logger.Error("error faced while saving to the archive", zap.Error(err)) } t.archiveIndex = (t.archiveIndex + 1) % t.pollsToArchive // increment the index } +// readArchive loads data from the archive for a given index and returns a fileset.Filset. func (t *fileTracker) readArchive(index int) (*fileset.Fileset[*reader.Metadata], error) { - // readArchive loads data from the archive for a given index and returns a fileset.Filset. key := fmt.Sprintf("knownFiles%d", index) metadata, err := checkpoint.LoadKey(context.Background(), t.persister, key) if err != nil { @@ -183,47 +192,45 @@ func (t *fileTracker) readArchive(index int) (*fileset.Fileset[*reader.Metadata] return f, nil } -func (t *fileTracker) updateArchive(index int, rmds *fileset.Fileset[*reader.Metadata]) error { - // updateArchive saves data to the archive for a given index and returns an error, if encountered. +// writeArchive saves data to the archive for a given index and returns an error, if encountered. +func (t *fileTracker) writeArchive(index int, rmds *fileset.Fileset[*reader.Metadata]) error { key := fmt.Sprintf("knownFiles%d", index) return checkpoint.SaveKey(context.Background(), t.persister, rmds.Get(), key) } -func (t *fileTracker) SyncOffsets() { - // SyncOffsets goes through all new (unmatched) readers and updates the metadata, if found on archive. +func (t *fileTracker) FindFiles(records []*Record) { + // FindFiles goes through archive, one fileset at a time and tries to match all fingerprints agains that loaded set. - // To minimize disk access, we first access the index, then review unmatched readers and synchronize their metadata if a match is found. + // To minimize disk access, we first access the index, then review unmatched files and update the metadata, if found. // We exit if no new reader exists. - archiveReadIndex := t.archiveIndex - 1 // try loading most recently written index and iterate backwards - for i := 0; i < t.pollsToArchive; i++ { - newFound := false - data, err := t.readArchive(archiveReadIndex) + mostRecentIndex := t.archiveIndex - 1 + foundRecords := 0 + + // continue executing the loop until either all records are matched or all archive sets have been processed. + for i := 0; i < t.pollsToArchive && foundRecords < len(records); i++ { + modified := false + data, err := t.readArchive(mostRecentIndex) if err != nil { t.set.Logger.Error("error while opening archive", zap.Error(err)) continue } - for _, v := range t.currentPollFiles.Get() { - if v.IsNew() { - newFound = true - if md := data.Match(v.GetFingerprint(), fileset.StartsWith); md != nil { - v.SyncMetadata(md) - } + for _, record := range records { + if md := data.Match(record.Fingerprint, fileset.StartsWith); md != nil && record.Metadata != nil { + // update a record's metadata with the matched metadata. + modified = true + record.Metadata = md + foundRecords++ } } - if !newFound { - // No new reader is available, so there’s no need to go through the rest of the archive. - // Just exit to save time. - break - } - if err := t.updateArchive(archiveReadIndex, data); err != nil { - t.set.Logger.Error("error while opening archive", zap.Error(err)) - continue + if modified { + if err := t.writeArchive(mostRecentIndex, data); err != nil { + t.set.Logger.Error("error while opening archive", zap.Error(err)) + continue + } } - - archiveReadIndex = (archiveReadIndex - 1) % t.pollsToArchive + mostRecentIndex = (mostRecentIndex - 1) % t.pollsToArchive } - } // noStateTracker only tracks the current polled files. Once the poll is @@ -281,4 +288,4 @@ func (t *noStateTracker) EndPoll() {} func (t *noStateTracker) TotalReaders() int { return 0 } -func (t *noStateTracker) SyncOffsets() {} +func (t *noStateTracker) FindFiles([]*Record) {} From cb59bc4454c20f8cc71d2e0f20428eee53dc156d Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Sat, 2 Nov 2024 04:37:51 +0530 Subject: [PATCH 33/45] comments --- pkg/stanza/fileconsumer/internal/tracker/tracker.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker.go b/pkg/stanza/fileconsumer/internal/tracker/tracker.go index 08960990ebba..bcad5225b2d4 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker.go @@ -210,7 +210,7 @@ func (t *fileTracker) FindFiles(records []*Record) { // continue executing the loop until either all records are matched or all archive sets have been processed. for i := 0; i < t.pollsToArchive && foundRecords < len(records); i++ { modified := false - data, err := t.readArchive(mostRecentIndex) + data, err := t.readArchive(mostRecentIndex) // we load one fileset atmost once per poll if err != nil { t.set.Logger.Error("error while opening archive", zap.Error(err)) continue @@ -224,6 +224,7 @@ func (t *fileTracker) FindFiles(records []*Record) { } } if modified { + // we save one fileset atmost once per poll if err := t.writeArchive(mostRecentIndex, data); err != nil { t.set.Logger.Error("error while opening archive", zap.Error(err)) continue From 5560f6a4d3eaffbc882811b0974fb608ac3cd358 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Sat, 2 Nov 2024 04:47:03 +0530 Subject: [PATCH 34/45] lint --- pkg/stanza/fileconsumer/internal/tracker/tracker.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker.go b/pkg/stanza/fileconsumer/internal/tracker/tracker.go index bcad5225b2d4..7e432d7c4b11 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker.go @@ -199,16 +199,16 @@ func (t *fileTracker) writeArchive(index int, rmds *fileset.Fileset[*reader.Meta } func (t *fileTracker) FindFiles(records []*Record) { - // FindFiles goes through archive, one fileset at a time and tries to match all fingerprints agains that loaded set. + // FindFiles goes through archive, one fileset at a time and tries to match all fingerprints against that loaded set. // To minimize disk access, we first access the index, then review unmatched files and update the metadata, if found. - // We exit if no new reader exists. + // We exit if all fingerprints are matched. mostRecentIndex := t.archiveIndex - 1 - foundRecords := 0 + matchedRecords := 0 // continue executing the loop until either all records are matched or all archive sets have been processed. - for i := 0; i < t.pollsToArchive && foundRecords < len(records); i++ { + for i := 0; i < t.pollsToArchive && matchedRecords < len(records); i++ { modified := false data, err := t.readArchive(mostRecentIndex) // we load one fileset atmost once per poll if err != nil { @@ -217,10 +217,10 @@ func (t *fileTracker) FindFiles(records []*Record) { } for _, record := range records { if md := data.Match(record.Fingerprint, fileset.StartsWith); md != nil && record.Metadata != nil { - // update a record's metadata with the matched metadata. + // populate record's metadata with the matched metadata, to indicate a successful match. modified = true record.Metadata = md - foundRecords++ + matchedRecords++ } } if modified { From b45a4c0c8b94a034e09a189226908024fe1f7ad9 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Thu, 7 Nov 2024 03:07:50 +0530 Subject: [PATCH 35/45] use modulo --- pkg/stanza/fileconsumer/internal/tracker/tracker.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker.go b/pkg/stanza/fileconsumer/internal/tracker/tracker.go index 7e432d7c4b11..78891754e40e 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker.go @@ -198,13 +198,13 @@ func (t *fileTracker) writeArchive(index int, rmds *fileset.Fileset[*reader.Meta return checkpoint.SaveKey(context.Background(), t.persister, rmds.Get(), key) } +// FindFiles goes through archive, one fileset at a time and tries to match all fingerprints against that loaded set. func (t *fileTracker) FindFiles(records []*Record) { - // FindFiles goes through archive, one fileset at a time and tries to match all fingerprints against that loaded set. // To minimize disk access, we first access the index, then review unmatched files and update the metadata, if found. // We exit if all fingerprints are matched. - mostRecentIndex := t.archiveIndex - 1 + mostRecentIndex := (t.archiveIndex - 1) % t.pollsToArchive matchedRecords := 0 // continue executing the loop until either all records are matched or all archive sets have been processed. From 200eadb6f862a65b987c1bc79ce2d03a6fde428d Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Sat, 9 Nov 2024 16:16:11 +0530 Subject: [PATCH 36/45] chore: remove record and update tests --- .../internal/fingerprint/fingerprint.go | 4 ++ .../fileconsumer/internal/tracker/tracker.go | 45 +++++++------- .../internal/tracker/tracker_test.go | 60 +++++++++++++++++++ pkg/stanza/fileconsumer/internal/util/util.go | 4 ++ 4 files changed, 90 insertions(+), 23 deletions(-) create mode 100644 pkg/stanza/fileconsumer/internal/tracker/tracker_test.go diff --git a/pkg/stanza/fileconsumer/internal/fingerprint/fingerprint.go b/pkg/stanza/fileconsumer/internal/fingerprint/fingerprint.go index a61346d8db1c..7584afc92f03 100644 --- a/pkg/stanza/fileconsumer/internal/fingerprint/fingerprint.go +++ b/pkg/stanza/fileconsumer/internal/fingerprint/fingerprint.go @@ -96,6 +96,10 @@ func (f *Fingerprint) UnmarshalJSON(data []byte) error { return nil } +func (f *Fingerprint) GetFingerprint() *Fingerprint { + return f +} + type marshal struct { FirstBytes []byte `json:"first_bytes"` } diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker.go b/pkg/stanza/fileconsumer/internal/tracker/tracker.go index 78891754e40e..b0c6431c6c06 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker.go @@ -6,7 +6,6 @@ package tracker // import "github.com/open-telemetry/opentelemetry-collector-con import ( "context" "fmt" - "os" "go.opentelemetry.io/collector/component" "go.uber.org/zap" @@ -15,17 +14,10 @@ import ( "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/fileset" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/fingerprint" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/reader" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/util" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/operator" ) -// A convenience struct that holds a file fingerprint and its associated metadata. -// This will be used during reader creation after the FindFiles() method. -type Record struct { - File *os.File - Fingerprint *fingerprint.Fingerprint - Metadata *reader.Metadata -} - // Interface for tracking files that are being consumed. type Tracker interface { Add(reader *reader.Reader) @@ -40,7 +32,7 @@ type Tracker interface { EndPoll() EndConsume() int TotalReaders() int - FindFiles(records []*Record) + FindFiles([]*fingerprint.Fingerprint) []fileset.Matchable } // fileTracker tracks known offsets for files that are being consumed by the manager. @@ -199,39 +191,46 @@ func (t *fileTracker) writeArchive(index int, rmds *fileset.Fileset[*reader.Meta } // FindFiles goes through archive, one fileset at a time and tries to match all fingerprints against that loaded set. -func (t *fileTracker) FindFiles(records []*Record) { - +func (t *fileTracker) FindFiles(fps []*fingerprint.Fingerprint) []fileset.Matchable { // To minimize disk access, we first access the index, then review unmatched files and update the metadata, if found. // We exit if all fingerprints are matched. - mostRecentIndex := (t.archiveIndex - 1) % t.pollsToArchive - matchedRecords := 0 + mostRecentIndex := util.Mod(t.archiveIndex-1, t.pollsToArchive) + matchedMetadata := make([]fileset.Matchable, len(fps)) + indices := make(map[int]bool) // Track fp indices of original fps slice + + for i := 0; i < len(fps); i++ { + indices[i] = true + } // continue executing the loop until either all records are matched or all archive sets have been processed. - for i := 0; i < t.pollsToArchive && matchedRecords < len(records); i++ { + for i := 0; i < t.pollsToArchive && len(indices) > 0; i, mostRecentIndex = i+1, util.Mod(mostRecentIndex-1, t.pollsToArchive) { modified := false data, err := t.readArchive(mostRecentIndex) // we load one fileset atmost once per poll if err != nil { t.set.Logger.Error("error while opening archive", zap.Error(err)) continue } - for _, record := range records { - if md := data.Match(record.Fingerprint, fileset.StartsWith); md != nil && record.Metadata != nil { - // populate record's metadata with the matched metadata, to indicate a successful match. + for index := range indices { + if md := data.Match(fps[index], fileset.StartsWith); md != nil { + // append the matched metadata/file pair to the new array + matchedMetadata[index] = md modified = true - record.Metadata = md - matchedRecords++ + delete(indices, index) } } if modified { // we save one fileset atmost once per poll if err := t.writeArchive(mostRecentIndex, data); err != nil { t.set.Logger.Error("error while opening archive", zap.Error(err)) - continue } } - mostRecentIndex = (mostRecentIndex - 1) % t.pollsToArchive } + // append remaining files + for index := range indices { + matchedMetadata[index] = fps[index] + } + return matchedMetadata } // noStateTracker only tracks the current polled files. Once the poll is @@ -289,4 +288,4 @@ func (t *noStateTracker) EndPoll() {} func (t *noStateTracker) TotalReaders() int { return 0 } -func (t *noStateTracker) FindFiles([]*Record) {} +func (t *noStateTracker) FindFiles([]*fingerprint.Fingerprint) []fileset.Matchable { return nil } diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker_test.go b/pkg/stanza/fileconsumer/internal/tracker/tracker_test.go new file mode 100644 index 000000000000..2f988b69ec03 --- /dev/null +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker_test.go @@ -0,0 +1,60 @@ +package tracker + +import ( + "context" + "math/rand/v2" + "testing" + + "github.com/google/uuid" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/checkpoint" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/fingerprint" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/reader" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/operator" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/testutil" + "github.com/stretchr/testify/require" + "go.opentelemetry.io/collector/component/componenttest" +) + +func TestFindFilesOrder(t *testing.T) { + fps := make([]*fingerprint.Fingerprint, 0) + for i := 0; i < 100; i++ { + fps = append(fps, fingerprint.New([]byte(uuid.NewString()))) + } + persister := testutil.NewUnscopedMockPersister() + fpInStorage := populatedPersisterData(persister, fps) + + tracker := NewFileTracker(componenttest.NewNopTelemetrySettings(), 0, 100, persister) + matchables := tracker.FindFiles(fps) + + require.Equal(t, len(fps), len(matchables), "return slice should be of same length as input slice") + + for i := 0; i < len(matchables); i++ { + require.Truef(t, fps[i].Equal(matchables[i].GetFingerprint()), "fingerprint at index %d is not equal to corresponding return value") + if fpInStorage[i] { + // if current fingerprint is present in storage, the corresponding return type should be a "Metadata" + _, ok := matchables[i].(*reader.Metadata) + require.True(t, ok, "index %d should be of reader.Metadata type") + } else { + // if current fingerprint is absent from storage, the corresponding return type should be a "Fingerprint" + _, ok := matchables[i].(*fingerprint.Fingerprint) + require.True(t, ok, "index %d should be of fingerprint.Fingerprint type") + } + } +} + +func populatedPersisterData(persister operator.Persister, fps []*fingerprint.Fingerprint) []bool { + md := make([]*reader.Metadata, 0) + + fpInStorage := make([]bool, len(fps)) + for i, fp := range fps { + // 50-50 chance that a fingerprint exists in the storage + if rand.Float32() < 0.5 { + md = append(md, &reader.Metadata{Fingerprint: fp}) + fpInStorage[i] = true // mark the fingerprint at index i in storage + } + } + // save half keys in knownFiles0 and other half in knownFiles1 + _ = checkpoint.SaveKey(context.Background(), persister, md[:len(md)/2], "knownFiles0") + _ = checkpoint.SaveKey(context.Background(), persister, md[len(md)/2:], "knownFiles1") + return fpInStorage +} diff --git a/pkg/stanza/fileconsumer/internal/util/util.go b/pkg/stanza/fileconsumer/internal/util/util.go index 3d700cf1b3ad..46a490dc0907 100644 --- a/pkg/stanza/fileconsumer/internal/util/util.go +++ b/pkg/stanza/fileconsumer/internal/util/util.go @@ -18,3 +18,7 @@ func MapCopy(m map[string]any) map[string]any { } return newMap } + +func Mod(x, y int) int { + return ((x % y) + y) % y +} From ad46f94b11ea1592a33768eb332adc5b1d7cdc33 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Sat, 9 Nov 2024 16:20:10 +0530 Subject: [PATCH 37/45] comments --- pkg/stanza/fileconsumer/internal/tracker/tracker.go | 6 +++--- pkg/stanza/fileconsumer/internal/tracker/tracker_test.go | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker.go b/pkg/stanza/fileconsumer/internal/tracker/tracker.go index b0c6431c6c06..abbcb03a8100 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker.go @@ -213,10 +213,10 @@ func (t *fileTracker) FindFiles(fps []*fingerprint.Fingerprint) []fileset.Matcha } for index := range indices { if md := data.Match(fps[index], fileset.StartsWith); md != nil { - // append the matched metadata/file pair to the new array + // update the matched metadata for this index matchedMetadata[index] = md - modified = true delete(indices, index) + modified = true } } if modified { @@ -226,7 +226,7 @@ func (t *fileTracker) FindFiles(fps []*fingerprint.Fingerprint) []fileset.Matcha } } } - // append remaining files + // add remaining fingerprints i.e. unmatched fingerprints for index := range indices { matchedMetadata[index] = fps[index] } diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker_test.go b/pkg/stanza/fileconsumer/internal/tracker/tracker_test.go index 2f988b69ec03..8c0a4ff851b3 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker_test.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker_test.go @@ -1,4 +1,7 @@ -package tracker +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package tracker // import "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/tracker" import ( "context" From b9f65f3097710105d80c11054bbeeae2b99f8019 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Sun, 10 Nov 2024 14:18:20 +0530 Subject: [PATCH 38/45] lint and check --- pkg/stanza/go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/stanza/go.mod b/pkg/stanza/go.mod index 880799e19aef..b19d3f948b6c 100644 --- a/pkg/stanza/go.mod +++ b/pkg/stanza/go.mod @@ -8,6 +8,7 @@ require ( github.com/expr-lang/expr v1.16.9 github.com/fsnotify/fsnotify v1.7.0 github.com/goccy/go-json v0.10.3 + github.com/google/uuid v1.6.0 github.com/jonboulle/clockwork v0.4.0 github.com/jpillora/backoff v1.0.0 github.com/json-iterator/go v1.1.12 @@ -48,7 +49,6 @@ require ( github.com/go-logr/stdr v1.2.2 // indirect github.com/go-viper/mapstructure/v2 v2.1.0 // indirect github.com/gogo/protobuf v1.3.2 // indirect - github.com/google/uuid v1.6.0 // indirect github.com/hashicorp/go-version v1.7.0 // indirect github.com/knadh/koanf/maps v0.1.1 // indirect github.com/knadh/koanf/providers/confmap v0.1.0 // indirect From 90dadb0bb994a798d74ff8623332b96b53e63688 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Sun, 10 Nov 2024 14:40:36 +0530 Subject: [PATCH 39/45] lint and check --- pkg/stanza/fileconsumer/internal/tracker/tracker_test.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker_test.go b/pkg/stanza/fileconsumer/internal/tracker/tracker_test.go index 8c0a4ff851b3..0a68451637d2 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker_test.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker_test.go @@ -32,15 +32,15 @@ func TestFindFilesOrder(t *testing.T) { require.Equal(t, len(fps), len(matchables), "return slice should be of same length as input slice") for i := 0; i < len(matchables); i++ { - require.Truef(t, fps[i].Equal(matchables[i].GetFingerprint()), "fingerprint at index %d is not equal to corresponding return value") + require.Truef(t, fps[i].Equal(matchables[i].GetFingerprint()), "fingerprint at index %d is not equal to corresponding return value", i) if fpInStorage[i] { // if current fingerprint is present in storage, the corresponding return type should be a "Metadata" _, ok := matchables[i].(*reader.Metadata) - require.True(t, ok, "index %d should be of reader.Metadata type") + require.True(t, ok, "resulting index %d should be of reader.Metadata type", i) } else { // if current fingerprint is absent from storage, the corresponding return type should be a "Fingerprint" _, ok := matchables[i].(*fingerprint.Fingerprint) - require.True(t, ok, "index %d should be of fingerprint.Fingerprint type") + require.True(t, ok, "resulting index %d should be of fingerprint.Fingerprint type", i) } } } From 2dac57f6ce49e20335d1d16a577c32120f36c3ec Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Mon, 11 Nov 2024 12:21:39 +0530 Subject: [PATCH 40/45] gci --- pkg/stanza/fileconsumer/internal/tracker/tracker_test.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker_test.go b/pkg/stanza/fileconsumer/internal/tracker/tracker_test.go index 0a68451637d2..87b485addf2a 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker_test.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker_test.go @@ -9,13 +9,14 @@ import ( "testing" "github.com/google/uuid" + "github.com/stretchr/testify/require" + "go.opentelemetry.io/collector/component/componenttest" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/checkpoint" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/fingerprint" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/reader" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/operator" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/testutil" - "github.com/stretchr/testify/require" - "go.opentelemetry.io/collector/component/componenttest" ) func TestFindFilesOrder(t *testing.T) { From 9981c783fc17fc633e161049270d418bb2ffe27d Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Mon, 2 Dec 2024 21:38:02 +0530 Subject: [PATCH 41/45] chore: cleanup, test cases --- .../internal/fingerprint/fingerprint.go | 4 --- .../fileconsumer/internal/tracker/tracker.go | 30 ++++++++----------- .../internal/tracker/tracker_test.go | 12 ++++---- 3 files changed, 17 insertions(+), 29 deletions(-) diff --git a/pkg/stanza/fileconsumer/internal/fingerprint/fingerprint.go b/pkg/stanza/fileconsumer/internal/fingerprint/fingerprint.go index 7584afc92f03..a61346d8db1c 100644 --- a/pkg/stanza/fileconsumer/internal/fingerprint/fingerprint.go +++ b/pkg/stanza/fileconsumer/internal/fingerprint/fingerprint.go @@ -96,10 +96,6 @@ func (f *Fingerprint) UnmarshalJSON(data []byte) error { return nil } -func (f *Fingerprint) GetFingerprint() *Fingerprint { - return f -} - type marshal struct { FirstBytes []byte `json:"first_bytes"` } diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker.go b/pkg/stanza/fileconsumer/internal/tracker/tracker.go index abbcb03a8100..7094ecb115da 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker.go @@ -32,7 +32,7 @@ type Tracker interface { EndPoll() EndConsume() int TotalReaders() int - FindFiles([]*fingerprint.Fingerprint) []fileset.Matchable + FindFiles([]*fingerprint.Fingerprint) []*reader.Metadata } // fileTracker tracks known offsets for files that are being consumed by the manager. @@ -191,31 +191,29 @@ func (t *fileTracker) writeArchive(index int, rmds *fileset.Fileset[*reader.Meta } // FindFiles goes through archive, one fileset at a time and tries to match all fingerprints against that loaded set. -func (t *fileTracker) FindFiles(fps []*fingerprint.Fingerprint) []fileset.Matchable { +func (t *fileTracker) FindFiles(fps []*fingerprint.Fingerprint) []*reader.Metadata { // To minimize disk access, we first access the index, then review unmatched files and update the metadata, if found. // We exit if all fingerprints are matched. mostRecentIndex := util.Mod(t.archiveIndex-1, t.pollsToArchive) - matchedMetadata := make([]fileset.Matchable, len(fps)) - indices := make(map[int]bool) // Track fp indices of original fps slice - - for i := 0; i < len(fps); i++ { - indices[i] = true - } + matchedMetadata := make([]*reader.Metadata, len(fps)) // continue executing the loop until either all records are matched or all archive sets have been processed. - for i := 0; i < t.pollsToArchive && len(indices) > 0; i, mostRecentIndex = i+1, util.Mod(mostRecentIndex-1, t.pollsToArchive) { + for i := 0; i < t.pollsToArchive; i, mostRecentIndex = i+1, util.Mod(mostRecentIndex-1, t.pollsToArchive) { modified := false data, err := t.readArchive(mostRecentIndex) // we load one fileset atmost once per poll if err != nil { t.set.Logger.Error("error while opening archive", zap.Error(err)) continue } - for index := range indices { - if md := data.Match(fps[index], fileset.StartsWith); md != nil { - // update the matched metadata for this index + for index, fp := range fps { + if matchedMetadata[index] != nil { + // we've already found a match for this index, continue + continue + } + if md := data.Match(fp, fileset.StartsWith); md != nil { + // update the matched metada for the index matchedMetadata[index] = md - delete(indices, index) modified = true } } @@ -226,10 +224,6 @@ func (t *fileTracker) FindFiles(fps []*fingerprint.Fingerprint) []fileset.Matcha } } } - // add remaining fingerprints i.e. unmatched fingerprints - for index := range indices { - matchedMetadata[index] = fps[index] - } return matchedMetadata } @@ -288,4 +282,4 @@ func (t *noStateTracker) EndPoll() {} func (t *noStateTracker) TotalReaders() int { return 0 } -func (t *noStateTracker) FindFiles([]*fingerprint.Fingerprint) []fileset.Matchable { return nil } +func (t *noStateTracker) FindFiles([]*fingerprint.Fingerprint) []*reader.Metadata { return nil } diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker_test.go b/pkg/stanza/fileconsumer/internal/tracker/tracker_test.go index 87b485addf2a..f16e2d647032 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker_test.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker_test.go @@ -33,15 +33,13 @@ func TestFindFilesOrder(t *testing.T) { require.Equal(t, len(fps), len(matchables), "return slice should be of same length as input slice") for i := 0; i < len(matchables); i++ { - require.Truef(t, fps[i].Equal(matchables[i].GetFingerprint()), "fingerprint at index %d is not equal to corresponding return value", i) if fpInStorage[i] { - // if current fingerprint is present in storage, the corresponding return type should be a "Metadata" - _, ok := matchables[i].(*reader.Metadata) - require.True(t, ok, "resulting index %d should be of reader.Metadata type", i) + // if current fingerprint is present in storage, the corresponding return type should not be nil + require.NotNilf(t, matchables[i], "resulting index %d should be not be nil type", i) + require.Truef(t, fps[i].Equal(matchables[i].GetFingerprint()), "fingerprint at index %d is not equal to corresponding return value", i) } else { - // if current fingerprint is absent from storage, the corresponding return type should be a "Fingerprint" - _, ok := matchables[i].(*fingerprint.Fingerprint) - require.True(t, ok, "resulting index %d should be of fingerprint.Fingerprint type", i) + // if current fingerprint is absent from storage, the corresponding index should be empty i.e. "nil" + require.Nil(t, matchables[i], "resulting index %d should be of nil type", i) } } } From fa48e7688a5c929174a6ed97715c906f93d730fd Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Tue, 3 Dec 2024 00:34:10 +0530 Subject: [PATCH 42/45] lint --- pkg/stanza/fileconsumer/internal/checkpoint/checkpoint.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/stanza/fileconsumer/internal/checkpoint/checkpoint.go b/pkg/stanza/fileconsumer/internal/checkpoint/checkpoint.go index 4f48d820ad9b..8a5a60b7d734 100644 --- a/pkg/stanza/fileconsumer/internal/checkpoint/checkpoint.go +++ b/pkg/stanza/fileconsumer/internal/checkpoint/checkpoint.go @@ -49,6 +49,7 @@ func SaveKey(ctx context.Context, persister operator.Persister, rmds []*reader.M func Load(ctx context.Context, persister operator.Persister) ([]*reader.Metadata, error) { return LoadKey(ctx, persister, knownFilesKey) } + func LoadKey(ctx context.Context, persister operator.Persister, key string) ([]*reader.Metadata, error) { encoded, err := persister.Get(ctx, key) if err != nil { From 95199f2cec77d91fa899a88fc9a67f0eaad0670e Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Wed, 4 Dec 2024 16:53:30 +0530 Subject: [PATCH 43/45] chore: simplify --- .../fileconsumer/internal/tracker/tracker.go | 20 +++++++++++-------- pkg/stanza/fileconsumer/internal/util/util.go | 4 ---- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker.go b/pkg/stanza/fileconsumer/internal/tracker/tracker.go index 7094ecb115da..ae33b41e1d3a 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker.go @@ -14,7 +14,6 @@ import ( "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/fileset" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/fingerprint" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/reader" - "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/util" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/operator" ) @@ -195,13 +194,18 @@ func (t *fileTracker) FindFiles(fps []*fingerprint.Fingerprint) []*reader.Metada // To minimize disk access, we first access the index, then review unmatched files and update the metadata, if found. // We exit if all fingerprints are matched. - mostRecentIndex := util.Mod(t.archiveIndex-1, t.pollsToArchive) + mostRecentIndex := (t.archiveIndex - 1 + t.pollsToArchive) % t.pollsToArchive matchedMetadata := make([]*reader.Metadata, len(fps)) // continue executing the loop until either all records are matched or all archive sets have been processed. - for i := 0; i < t.pollsToArchive; i, mostRecentIndex = i+1, util.Mod(mostRecentIndex-1, t.pollsToArchive) { - modified := false - data, err := t.readArchive(mostRecentIndex) // we load one fileset atmost once per poll + for i := 0; i < t.pollsToArchive; i++ { + metadataUpdated := false + + // Update the mostRecentIndex + currentIndex := mostRecentIndex + mostRecentIndex = (mostRecentIndex - 1 + t.pollsToArchive) % t.pollsToArchive + + data, err := t.readArchive(currentIndex) // we load one fileset atmost once per poll if err != nil { t.set.Logger.Error("error while opening archive", zap.Error(err)) continue @@ -214,12 +218,12 @@ func (t *fileTracker) FindFiles(fps []*fingerprint.Fingerprint) []*reader.Metada if md := data.Match(fp, fileset.StartsWith); md != nil { // update the matched metada for the index matchedMetadata[index] = md - modified = true + metadataUpdated = true } } - if modified { + if metadataUpdated { // we save one fileset atmost once per poll - if err := t.writeArchive(mostRecentIndex, data); err != nil { + if err := t.writeArchive(currentIndex, data); err != nil { t.set.Logger.Error("error while opening archive", zap.Error(err)) } } diff --git a/pkg/stanza/fileconsumer/internal/util/util.go b/pkg/stanza/fileconsumer/internal/util/util.go index 46a490dc0907..3d700cf1b3ad 100644 --- a/pkg/stanza/fileconsumer/internal/util/util.go +++ b/pkg/stanza/fileconsumer/internal/util/util.go @@ -18,7 +18,3 @@ func MapCopy(m map[string]any) map[string]any { } return newMap } - -func Mod(x, y int) int { - return ((x % y) + y) % y -} From 08e3aad1e113339cef1111297fc0491420a20937 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Thu, 5 Dec 2024 00:11:01 +0530 Subject: [PATCH 44/45] chore: improve readability --- .../fileconsumer/internal/tracker/tracker.go | 37 ++++++++++++------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker.go b/pkg/stanza/fileconsumer/internal/tracker/tracker.go index ae33b41e1d3a..95b61c289926 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker.go @@ -194,38 +194,47 @@ func (t *fileTracker) FindFiles(fps []*fingerprint.Fingerprint) []*reader.Metada // To minimize disk access, we first access the index, then review unmatched files and update the metadata, if found. // We exit if all fingerprints are matched. - mostRecentIndex := (t.archiveIndex - 1 + t.pollsToArchive) % t.pollsToArchive + // Track number of matched fingerprints so we can exit if all matched. + var numMatched int + + // Determine the index for reading archive, starting from the most recent and moving towards the oldest + nextIndex := (t.archiveIndex - 1 + t.pollsToArchive) % t.pollsToArchive matchedMetadata := make([]*reader.Metadata, len(fps)) // continue executing the loop until either all records are matched or all archive sets have been processed. for i := 0; i < t.pollsToArchive; i++ { - metadataUpdated := false - // Update the mostRecentIndex - currentIndex := mostRecentIndex - mostRecentIndex = (mostRecentIndex - 1 + t.pollsToArchive) % t.pollsToArchive + currentIndex := nextIndex + nextIndex = (nextIndex - 1 + t.pollsToArchive) % t.pollsToArchive data, err := t.readArchive(currentIndex) // we load one fileset atmost once per poll if err != nil { t.set.Logger.Error("error while opening archive", zap.Error(err)) continue } - for index, fp := range fps { - if matchedMetadata[index] != nil { + archiveModified := false + for j, fp := range fps { + if matchedMetadata[j] != nil { // we've already found a match for this index, continue continue } if md := data.Match(fp, fileset.StartsWith); md != nil { // update the matched metada for the index - matchedMetadata[index] = md - metadataUpdated = true + matchedMetadata[j] = md + archiveModified = true + numMatched++ } } - if metadataUpdated { - // we save one fileset atmost once per poll - if err := t.writeArchive(currentIndex, data); err != nil { - t.set.Logger.Error("error while opening archive", zap.Error(err)) - } + if !archiveModified { + continue + } + // we save one fileset atmost once per poll + if err := t.writeArchive(currentIndex, data); err != nil { + t.set.Logger.Error("error while opening archive", zap.Error(err)) + } + // Check if all metadata have been found + if numMatched == len(fps) { + return matchedMetadata } } return matchedMetadata From df5fd1108cdf0fe05de27613c3930ec3e8f045b8 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Thu, 5 Dec 2024 03:20:34 +0530 Subject: [PATCH 45/45] fix: set nextIndex to t.archiveIndex --- pkg/stanza/fileconsumer/internal/tracker/tracker.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pkg/stanza/fileconsumer/internal/tracker/tracker.go b/pkg/stanza/fileconsumer/internal/tracker/tracker.go index 95b61c289926..c784d1c3485a 100644 --- a/pkg/stanza/fileconsumer/internal/tracker/tracker.go +++ b/pkg/stanza/fileconsumer/internal/tracker/tracker.go @@ -198,16 +198,15 @@ func (t *fileTracker) FindFiles(fps []*fingerprint.Fingerprint) []*reader.Metada var numMatched int // Determine the index for reading archive, starting from the most recent and moving towards the oldest - nextIndex := (t.archiveIndex - 1 + t.pollsToArchive) % t.pollsToArchive + nextIndex := t.archiveIndex matchedMetadata := make([]*reader.Metadata, len(fps)) // continue executing the loop until either all records are matched or all archive sets have been processed. for i := 0; i < t.pollsToArchive; i++ { // Update the mostRecentIndex - currentIndex := nextIndex nextIndex = (nextIndex - 1 + t.pollsToArchive) % t.pollsToArchive - data, err := t.readArchive(currentIndex) // we load one fileset atmost once per poll + data, err := t.readArchive(nextIndex) // we load one fileset atmost once per poll if err != nil { t.set.Logger.Error("error while opening archive", zap.Error(err)) continue @@ -229,7 +228,7 @@ func (t *fileTracker) FindFiles(fps []*fingerprint.Fingerprint) []*reader.Metada continue } // we save one fileset atmost once per poll - if err := t.writeArchive(currentIndex, data); err != nil { + if err := t.writeArchive(nextIndex, data); err != nil { t.set.Logger.Error("error while opening archive", zap.Error(err)) } // Check if all metadata have been found