From 22d850fd05cb92184bfd32d117bd4c45d899f1d2 Mon Sep 17 00:00:00 2001 From: Steven Chung Date: Tue, 21 Nov 2023 21:59:11 -0500 Subject: [PATCH] extractor number pad file names and do not override cache --- pkg/anki/anki_test.go | 6 ++- pkg/extractor/extractor.go | 31 ++++--------- pkg/extractor/extractor_test.go | 40 ++++++++++------- pkg/extractor/extractortest/extractortest.go | 11 ++++- pkg/extractor/instagram/instagram.go | 46 ++++++++++++++++++-- pkg/extractor/instagram/instagram_test.go | 45 ++++++++++++++++++- pkg/util/ioutil/ioutil.go | 18 ++++++++ 7 files changed, 149 insertions(+), 48 deletions(-) diff --git a/pkg/anki/anki_test.go b/pkg/anki/anki_test.go index 3a361045..0e0ac586 100644 --- a/pkg/anki/anki_test.go +++ b/pkg/anki/anki_test.go @@ -5,7 +5,6 @@ import ( "context" "encoding/json" "fmt" - "log/slog" "os" "path" "testing" @@ -16,13 +15,16 @@ import ( "github.com/s12chung/text2anki/pkg/util/logg" "github.com/s12chung/text2anki/pkg/util/test" "github.com/s12chung/text2anki/pkg/util/test/fixture" + "github.com/s12chung/text2anki/pkg/util/test/fixture/flog" ) +var plog = flog.FixtureUpdateNoWrite() + func init() { dir := path.Join(os.TempDir(), test.GenerateName("anki.TestMain")) c := Config{ExportPrefix: "t2a-", NotesCacheDir: dir} if err := os.MkdirAll(dir, ioutil.OwnerRWXGroupRX); err != nil { - slog.Error("anki.init()", logg.Err(err)) //nolint:forbidigo // used in init only + plog.Error("anki.init()", logg.Err(err)) //nolint:forbidigo // used in init only os.Exit(-1) } SetConfig(c) diff --git a/pkg/extractor/extractor.go b/pkg/extractor/extractor.go index 1a758e66..5b739404 100644 --- a/pkg/extractor/extractor.go +++ b/pkg/extractor/extractor.go @@ -72,18 +72,19 @@ func (e Extractor) Extract(s string) (SourceExtraction, error) { hash := source.ID() cacheDir := filepath.Join(e.cacheDir, hash) - if err := os.MkdirAll(cacheDir, ioutil.OwnerRWXGroupRX); err != nil { - return SourceExtraction{}, err - } - if err := source.ExtractToDir(cacheDir); err != nil { - return SourceExtraction{}, err + if _, err := os.Stat(cacheDir); os.IsNotExist(err) { + if err := os.MkdirAll(cacheDir, ioutil.OwnerRWXGroupRX); err != nil { + return SourceExtraction{}, err + } + if err := source.ExtractToDir(cacheDir); err != nil { + return SourceExtraction{}, err + } } - entries, err := os.ReadDir(cacheDir) if err != nil { return SourceExtraction{}, err } - filenames := filenamesWithExtensions(entries, e.factory.Extensions()) + filenames := ioutil.FilenamesWithExtensions(entries, e.factory.Extensions()) if len(filenames) == 0 { return SourceExtraction{}, fmt.Errorf("no filenames that match extensions extracted: %v", strings.Join(e.factory.Extensions(), ", ")) } @@ -103,22 +104,6 @@ func (e Extractor) Extract(s string) (SourceExtraction, error) { return SourceExtraction{Info: info, Parts: parts}, nil } -func filenamesWithExtensions(entries []os.DirEntry, extensions []string) []string { - filenames := make([]string, 0, len(entries)) - for _, file := range entries { - if file.IsDir() { - continue - } - for _, ext := range extensions { - if strings.HasSuffix(file.Name(), ext) { - filenames = append(filenames, file.Name()) - break - } - } - } - return filenames -} - // Map is a map of extractor name to Extractor type Map map[string]Extractor diff --git a/pkg/extractor/extractor_test.go b/pkg/extractor/extractor_test.go index c415069d..02001979 100644 --- a/pkg/extractor/extractor_test.go +++ b/pkg/extractor/extractor_test.go @@ -53,30 +53,40 @@ func TestExtractor_Extract(t *testing.T) { t.Run(tc.name, func(t *testing.T) { require := require.New(t) - source, err := NewExtractor(cacheDir, extractortest.NewFactory(testName)).Extract(tc.s) + extractor := NewExtractor(cacheDir, extractortest.NewFactory(testName)) + source, err := extractor.Extract(tc.s) if tc.err != nil { require.Equal(tc.err, err) return } require.NoError(err) - fixture.CompareReadOrUpdate(t, path.Join(testName, tc.name+"_info.json"), fixture.JSON(t, source.Info)) - - partMap := map[string]string{} - for _, part := range source.Parts { - require.Nil(part.AudioFile) - - file := part.ImageFile - info, err := file.Stat() - require.NoError(err) - bytes, err := io.ReadAll(file) - require.NoError(err) - partMap[info.Name()] = string(bytes) - } - fixture.CompareReadOrUpdate(t, path.Join(testName, tc.name+"_parts.json"), fixture.JSON(t, partMap)) + testSource(t, testName, tc.name, source) + + secondSource, err := extractor.Extract(tc.s) + require.NoError(err) + testSource(t, testName, tc.name, secondSource) }) } } +func testSource(t *testing.T, testName, name string, src SourceExtraction) { + require := require.New(t) + fixture.CompareReadOrUpdate(t, path.Join(testName, name+"_info.json"), fixture.JSON(t, src.Info)) + + partMap := map[string]string{} + for _, part := range src.Parts { + require.Nil(part.AudioFile) + + file := part.ImageFile + info, err := file.Stat() + require.NoError(err) + bytes, err := io.ReadAll(file) + require.NoError(err) + partMap[info.Name()] = string(bytes) + } + fixture.CompareReadOrUpdate(t, path.Join(testName, name+"_parts.json"), fixture.JSON(t, partMap)) +} + func TestVerify(t *testing.T) { testName := "TestVerify" cacheDir := path.Join(os.TempDir(), test.GenerateName(testName)) diff --git a/pkg/extractor/extractortest/extractortest.go b/pkg/extractor/extractortest/extractortest.go index ef990229..e75e37af 100644 --- a/pkg/extractor/extractortest/extractortest.go +++ b/pkg/extractor/extractortest/extractortest.go @@ -2,6 +2,7 @@ package extractortest import ( + "fmt" "os" "path/filepath" @@ -50,7 +51,14 @@ func (t Source) ExtractToDir(cacheDir string) error { if t.s == SkipExtractString { return nil } - err := filepath.Walk(t.fixturePath, func(path string, info os.FileInfo, err error) error { + items, err := os.ReadDir(cacheDir) + if err != nil { + return err + } + if len(items) != 0 { + return fmt.Errorf("extracting to non-empty cacheDir") + } + return filepath.Walk(t.fixturePath, func(path string, info os.FileInfo, err error) error { if err != nil { return err } @@ -59,7 +67,6 @@ func (t Source) ExtractToDir(cacheDir string) error { } return ioutil.CopyFile(filepath.Join(cacheDir, info.Name()), path, ioutil.OwnerGroupR) }) - return err } // Info returns the info from the extraction diff --git a/pkg/extractor/instagram/instagram.go b/pkg/extractor/instagram/instagram.go index 8bb9bfa3..25829c6f 100644 --- a/pkg/extractor/instagram/instagram.go +++ b/pkg/extractor/instagram/instagram.go @@ -8,13 +8,17 @@ import ( "os" "os/exec" "path/filepath" + "strconv" "strings" "github.com/s12chung/text2anki/db/pkg/db" "github.com/s12chung/text2anki/pkg/extractor" "github.com/s12chung/text2anki/pkg/util/archive/xz" + "github.com/s12chung/text2anki/pkg/util/ioutil" ) +var extensions = []string{".jpg"} + // GetLoginFromEnv gets the login from the default ENV var func GetLoginFromEnv() string { return os.Getenv("INSTAGRAM_LOGIN") } @@ -28,7 +32,7 @@ type Factory struct{ login string } func (f Factory) NewSource(url string) extractor.Source { return &Post{login: f.login, url: url} } // Extensions returns the extensions the extractor returns -func (f Factory) Extensions() []string { return []string{".jpg"} } +func (f Factory) Extensions() []string { return append([]string{}, extensions...) } // Post represents an instagram post type Post struct { @@ -67,14 +71,48 @@ func (s *Post) ID() string { return s.id } +var extractToDirArgs = func(login, id string) []string { + return []string{"instaloader", "--login", login, "--dirname-pattern", ".", "--", "-" + id} +} + // ExtractToDir extracts the post to the directory func (s *Post) ExtractToDir(cacheDir string) error { if ok := s.Verify(); !ok { - return fmt.Errorf("url is not vertified for instagram: %v", s.url) + return fmt.Errorf("url is not verified for instagram: %v", s.url) } - cmd := exec.Command("instaloader", "--login", s.login, "--dirname-pattern", ".", "--", "-"+s.ID()) //nolint:gosec //this is how it works + args := extractToDirArgs(s.login, s.ID()) + cmd := exec.Command(args[0], args[1:]...) //nolint:gosec //this is how it works cmd.Dir = cacheDir - return cmd.Run() + if err := cmd.Run(); err != nil { + return err + } + return numberPadFilenames(cacheDir) +} + +func numberPadFilenames(cacheDir string) error { + entries, err := os.ReadDir(cacheDir) + if err != nil { + return err + } + filenames := ioutil.FilenamesWithExtensions(entries, extensions) + + for _, filename := range filenames { + parts := strings.Split(filename, "_") + if len(parts) < 2 { + return fmt.Errorf("file found with no underscore") + } + numberPart := strings.Split(parts[len(parts)-1], ".")[0] + number, err := strconv.Atoi(numberPart) + if err != nil { + return err + } + newFilename := strings.Join(parts[:len(parts)-1], "_") + "_" + fmt.Sprintf("%03d", number) + filepath.Ext(filename) + + if err = os.Rename(filepath.Join(cacheDir, filename), filepath.Join(cacheDir, newFilename)); err != nil { + return err + } + } + return nil } const infoGlob = "*.xz" diff --git a/pkg/extractor/instagram/instagram_test.go b/pkg/extractor/instagram/instagram_test.go index e448d4f4..8792c43e 100644 --- a/pkg/extractor/instagram/instagram_test.go +++ b/pkg/extractor/instagram/instagram_test.go @@ -2,13 +2,18 @@ package instagram import ( "fmt" + "os" + "path" "path/filepath" + "strconv" "strings" "testing" "github.com/stretchr/testify/require" "github.com/s12chung/text2anki/pkg/extractor" + "github.com/s12chung/text2anki/pkg/util/ioutil" + "github.com/s12chung/text2anki/pkg/util/test" "github.com/s12chung/text2anki/pkg/util/test/fixture" ) @@ -60,19 +65,55 @@ func TestSource_ID(t *testing.T) { } } +var extractToDirSuffixes = []int{0, 1, 2, 9, 10, 11, 99, 100, 101, 110} + +const extractToDirPrefix = "2023-11-21_10-42-44_UTC_" + +func init() { + args := make([]string, len(extractToDirSuffixes)+1) + args[0] = "touch" + for i, suffix := range extractToDirSuffixes { + args[i+1] = extractToDirPrefix + strconv.Itoa(suffix) + extensions[0] + } + + extractToDirArgs = func(login, id string) []string { return args } +} + func TestPost_ExtractToDir(t *testing.T) { + cacheDir := path.Join(os.TempDir(), test.GenerateName("Instagram")) + require.NoError(t, os.MkdirAll(cacheDir, ioutil.OwnerRWXGroupRX)) + testCases := []struct { name string url string err error }{ - {name: "broken", url: "https://waka.com", err: fmt.Errorf("url is not vertified for instagram: https://waka.com")}, + {name: "broken", url: "https://waka.com", err: fmt.Errorf("url is not verified for instagram: https://waka.com")}, + {name: "fake", url: testURL}, } for _, tc := range testCases { tc := tc t.Run(tc.name, func(t *testing.T) { require := require.New(t) - require.Equal(tc.err, NewPost(tc.url).ExtractToDir("")) + err := NewPost(tc.url).ExtractToDir(cacheDir) + if tc.err != nil { + require.Equal(tc.err, err) + return + } + require.NoError(err) + + entries, err := os.ReadDir(cacheDir) + require.NoError(err) + entryNames := make([]string, len(entries)) + for i, entry := range entries { + entryNames[i] = entry.Name() + } + + filenames := make([]string, len(extractToDirSuffixes)) + for i, suffix := range extractToDirSuffixes { + filenames[i] = extractToDirPrefix + fmt.Sprintf("%03d", suffix) + extensions[0] + } + require.Equal(filenames, entryNames) }) } } diff --git a/pkg/util/ioutil/ioutil.go b/pkg/util/ioutil/ioutil.go index 58e744c1..7e6de43f 100644 --- a/pkg/util/ioutil/ioutil.go +++ b/pkg/util/ioutil/ioutil.go @@ -5,6 +5,7 @@ import ( "io" "os" "path/filepath" + "strings" ) // OwnerGroupR is the file mode for Owners and Group read @@ -53,3 +54,20 @@ func CopyFile(dst, src string, perm os.FileMode) error { } return os.Rename(tmp.Name(), dst) } + +// FilenamesWithExtensions returns the file names with the given extensions in the directory (non-recursive) +func FilenamesWithExtensions(entries []os.DirEntry, extensions []string) []string { + filenames := make([]string, 0, len(entries)) + for _, file := range entries { + if file.IsDir() { + continue + } + for _, ext := range extensions { + if strings.HasSuffix(file.Name(), ext) { + filenames = append(filenames, file.Name()) + break + } + } + } + return filenames +}