From 83ba49b76874600ace6dae1036b5e1e5de3dcda7 Mon Sep 17 00:00:00 2001 From: Artem Zakharov Date: Tue, 16 Jul 2019 14:30:38 +0300 Subject: [PATCH] initial commit --- .dockerignore | 10 +++ .gitignore | 1 + Dockerfile | 18 ++++ LICENSE | 21 +++++ Makefile | 15 ++++ README.md | 97 +++++++++++++++++++++ VERSION | 1 + main.go | 182 +++++++++++++++++++++++++++++++++++++++ main_test.go | 229 ++++++++++++++++++++++++++++++++++++++++++++++++++ 9 files changed, 574 insertions(+) create mode 100644 .dockerignore create mode 100644 .gitignore create mode 100644 Dockerfile create mode 100644 LICENSE create mode 100644 Makefile create mode 100644 README.md create mode 100644 VERSION create mode 100644 main.go create mode 100644 main_test.go diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..1e2666f --- /dev/null +++ b/.dockerignore @@ -0,0 +1,10 @@ +.env +.editorconfig +.git +.github +.gitignore +.cache +*.md +LICENSE +VERSION +Makefile diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e43b0f9 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.DS_Store diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..2d7e9d3 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,18 @@ +FROM golang:1.12 AS build +ARG APP_VER +WORKDIR /stem +ADD http://download.cdn.yandex.net/mystem/mystem-3.1-linux-64bit.tar.gz ./ +RUN tar -xzf mystem-3.1-linux-64bit.tar.gz +COPY *.go ./ +RUN CGO_ENABLED=0 go build -o app -ldflags "-X main.version=$APP_VER -s -w" ./ + +FROM ubuntu:18.04 +RUN groupadd -r stem && useradd --no-log-init -r -g stem stem +WORKDIR /stem +RUN mkdir dict && chown stem:stem dict +COPY --chown=stem:stem --from=build /stem/mystem ./ +COPY --chown=stem:stem --from=build /stem/app ./ + +USER stem +EXPOSE 8080 +ENTRYPOINT ["/stem/app"] \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..efb365b --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 Artem Zakharov + 
+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..703b0ec --- /dev/null +++ b/Makefile @@ -0,0 +1,15 @@ +NAME := azzzak/mystem +VER := `cat VERSION` +GIT_VER := v${VER} +IMAGE := ${NAME}:${VER} +LATEST := ${NAME}:latest + +image: + @docker build --rm --build-arg APP_VER=${GIT_VER} -t ${IMAGE} -t ${LATEST} . + +push: + @docker push ${NAME} + +tag: + @git tag -a ${GIT_VER} -m "Version ${GIT_VER}" + @git push origin ${GIT_VER} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..aecffa3 --- /dev/null +++ b/README.md @@ -0,0 +1,97 @@ +# mystem-docker + +Программа [MyStem](https://yandex.ru/dev/mystem/) — морфологический анализатор русского текста от Яндекса. Задача mystem-docker: упаковать MyStem в Docker-контейнер и работать с программой по http-протоколу. 
+ +Перед использованием необходимо прочитать и принять [лицензионное соглашение](https://yandex.ru/legal/mystem/) MyStem. + +## Настройки + +| Параметр | Тип | Значение по умолчанию | Комментарий | +| :----------------- | :------: | :-------------------- | ---------------------------------------------------------------- | +| USER_DICT | _string_ | - | Подключить [пользовательский словарь](#пользовательский-словарь) | +| GLUE_GRAMMEMES | _bool_ | false | Объединить словоформы при одной лемме | +| HOMONYMS_DETECTION | _bool_ | false | Применить контекстное снятие омонимии | +| TIMEOUT | _int_ | 1000 | Ограничить время обработки каждого запроса (в миллисекундах) | + +Таймаут необходим в силу однопоточной работы приложения, чтобы не допустить его зависания при ошибке. + +## Примеры запуска + +Установить таймаут на 800 миллисекунд: + +`docker run -p 2345:8080 -e TIMEOUT=800 azzzak/mystem` + +Подключить словарь: + +`docker run -v ~/dict:/stem/dict -p 2345:8080 -e USER_DICT=dict.txt azzzak/mystem` + +Пример запуска в проде: + +`docker run -d --restart always -v ~/dict:/stem/dict -p 127.0.0.1:2345:8080 -e USER_DICT=dict.txt -e HOMONYMS_DETECTION=true -e GLUE_GRAMMEMES=true -e TIMEOUT=800 --name mystem azzzak/mystem` + +## Проверка + +`curl -i -d "text=съешь еще этих мягких французских булок" -X POST http://localhost:2345/mystem` + +## Использование + +Для получения морфологического анализа надо отправить `POST` запрос с полем `text=[текст для анализа]` на `/mystem`. Ответ приходит в формате json. Об ошибке сигнализирует ответ со статус-кодом, отличным от `200`. 
+ +## JSON Schema ответа + +``` +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "MyStem JSON Schema", + "type": "array", + "items": { + "type": "object", + "properties": { + "analysis": { + "type": "array", + "items": { + "type": "object", + "properties": { + "lex": { + "description": "лемма", + "type": "string" + }, + "wt": { + "description": "бесконтекстная вероятность леммы", + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "qual": { + "description": "особые отметки", + "type": "string" + }, + "gr": { + "description": "граммемы", + "type": "string" + } + }, + "required": [ + "lex", "wt", "gr" + ] + } + }, + "text": { + "description": "исходная словоформа", + "type": "string" + } + }, + "required": [ + "analysis", "text" + ] + } +} +``` + +[Расшифровка](https://yandex.ru/dev/mystem/doc/grammemes-values-docpage/) обозначений граммем. + +## Пользовательский словарь + +В случае некорректной работы с неологизмами словарь можно дополнить. Формат пользовательского словаря описан [в документации](https://yandex.ru/dev/mystem/doc/usage-examples-docpage/#usage-examples__dicts) MyStem. 
+ +**После изменения словаря нужно перезапустить контейнер.** diff --git a/VERSION b/VERSION new file mode 100644 index 0000000..afaf360 --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +1.0.0 \ No newline at end of file diff --git a/main.go b/main.go new file mode 100644 index 0000000..3e112d9 --- /dev/null +++ b/main.go @@ -0,0 +1,182 @@ +package main + +import ( + "bufio" + "context" + "errors" + "fmt" + "io" + "log" + "net/http" + "os" + "os/exec" + "strconv" + "strings" + "time" +) + +// default timeout for request in milliseconds +var timeout time.Duration = 1000 + +var ( + stdin io.WriteCloser + stdout io.ReadCloser +) + +var errEmptyString = errors.New("Empty string") + +var version = "unknown" + +func main() { + fmt.Printf("mystem-docker %s\n", version) + args := []string{"-i", "--eng-gr", "--weight", "--format=json"} + + homonyms, exists := os.LookupEnv("HOMONYMS_DETECTION") + if exists && isTrue(homonyms) { + args = append(args, "-d") + } + + glue, exists := os.LookupEnv("GLUE_GRAMMEMES") + if exists && isTrue(glue) { + args = append(args, "-g") + } + + dict, exists := os.LookupEnv("USER_DICT") + if exists { + file := fmt.Sprintf("/stem/dict/%s", dict) + if !isDictExist(file) { + log.Fatalf("Can't find user dictionary \"%s\"\n", dict) + } + + d := fmt.Sprintf("--fixlist=%s", file) + args = append(args, d) + } + + timeLimit, exists := os.LookupEnv("TIMEOUT") + if exists { + v, err := strconv.Atoi(timeLimit) + if err != nil { + log.Fatalln("Timeout must be integer") + } + timeout = time.Duration(v) + } + + var err error + if stdin, stdout, err = runMystem(args); err != nil { + log.Fatalf("Error: %v\n", err) + } + + defer stdin.Close() + defer stdout.Close() + + http.HandleFunc("/mystem", limit(listen)) + http.ListenAndServe(":8080", nil) +} + +func runMystem(args []string) (io.WriteCloser, io.ReadCloser, error) { + cmd := exec.Command("./mystem", args...) 
+ + var err error + if stdin, err = cmd.StdinPipe(); err != nil { + return nil, nil, err + } + + if stdout, err = cmd.StdoutPipe(); err != nil { + return nil, nil, err + } + + if err = cmd.Start(); err != nil { + return nil, nil, err + } + + return stdin, stdout, nil +} + +func isTrue(s string) bool { + s = strings.ToLower(s) + if s == "true" || s == "yes" { + return true + } + return false +} + +func isDictExist(file string) bool { + if _, err := os.Stat(file); err != nil { + if os.IsNotExist(err) { + return false + } + } + return true +} + +func process(ctx context.Context, text string) (string, error) { + if len(text) == 0 { + return "", errEmptyString + } + + resChan := make(chan string) + errChan := make(chan error) + + go func() { + defer close(resChan) + defer close(errChan) + + fmt.Fprintln(stdin, text) + buf := bufio.NewReader(stdout) + + str, err := buf.ReadString('\n') + if err != nil { + errChan <- err + } + resChan <- strings.TrimSuffix(str, "\n") + }() + + select { + case <-ctx.Done(): + return "", ctx.Err() + case r := <-resChan: + return r, nil + case err := <-errChan: + return "", err + } +} + +func limit(f http.HandlerFunc) http.HandlerFunc { + sema := make(chan struct{}, 1) + return func(w http.ResponseWriter, r *http.Request) { + sema <- struct{}{} + f(w, r) + <-sema + } +} + +func listen(w http.ResponseWriter, r *http.Request) { + defer r.Body.Close() + + err := r.ParseForm() + if err != nil { + log.Printf("Error: %v\n", err) + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + s := r.Form.Get("text") + + ctx := r.Context() + ctx, cancel := context.WithTimeout(ctx, timeout*time.Millisecond) + p, err := process(ctx, s) + cancel() + + if err != nil { + switch err { + case errEmptyString: + http.Error(w, err.Error(), http.StatusNoContent) + default: + http.Error(w, err.Error(), http.StatusInternalServerError) + } + log.Printf("Error: %v\n", err) + return + } + + w.Header().Set("Content-Type", "application/json") + 
w.Write([]byte(p)) +} diff --git a/main_test.go b/main_test.go new file mode 100644 index 0000000..835a9c7 --- /dev/null +++ b/main_test.go @@ -0,0 +1,229 @@ +package main + +import ( + "bufio" + "bytes" + "context" + "fmt" + "io" + "io/ioutil" + "net/http" + "net/http/httptest" + "net/url" + "strings" + "testing" +) + +func Test_process(t *testing.T) { + type args struct { + ctx context.Context + text string + } + tests := []struct { + name string + args args + want string + wantErr bool + }{ + { + name: "Try word", + args: args{context.Background(), "слон"}, + want: genOutput("слон"), + wantErr: false, + }, + { + name: "Try sentence", + args: args{context.Background(), "Съешь еще этих мягких французских булок"}, + want: genOutput("Съешь еще этих мягких французских булок"), + wantErr: false, + }, + { + name: "Try empty string", + args: args{context.Background(), ""}, + want: "", + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + b := new(bytes.Buffer) + stdin = &helperWriteCloser{ + io.Writer(b), + } + + fmt.Fprintln(stdin, tt.args.text) + + buf := bufio.NewReader(b) + str, err := buf.ReadString('\n') + if err != nil { + t.Log(err) + } + + s := fmt.Sprintln(genOutput(str)) + bout := bytes.NewBufferString(s) + stdout = ioutil.NopCloser(bout) + + got, err := process(tt.args.ctx, tt.args.text) + if (err != nil) != tt.wantErr { + t.Errorf("process() error = %v, wantErr %v", err, tt.wantErr) + return + } + if got != tt.want { + t.Errorf("process() = %v, want %v", got, tt.want) + } + }) + } +} + +func Test_listen(t *testing.T) { + tests := []struct { + name string + text string + want string + wantErr bool + }{ + { + name: "POST слон", + text: "слон", + want: genOutput("слон"), + wantErr: false, + }, + { + name: "POST sentence", + text: "Съешь еще этих мягких французских булок", + want: genOutput("Съешь еще этих мягких французских булок"), + wantErr: false, + }, + { + name: "POST empty string", + text: "", + want: "", + 
wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + b := new(bytes.Buffer) + stdin = &helperWriteCloser{ + io.Writer(b), + } + fmt.Fprintln(stdin, tt.text) + + buf := bufio.NewReader(b) + str, err := buf.ReadString('\n') + if err != nil { + t.Log(err) + } + + res := fmt.Sprintln(genOutput(str)) + bout := bytes.NewBufferString(res) + stdout = ioutil.NopCloser(bout) + + data := url.Values{} + data.Set("text", tt.text) + + req, err := http.NewRequest("POST", "/stem", strings.NewReader(data.Encode())) + req.Header.Set("Content-Type", "application/x-www-form-urlencoded") + if err != nil { + t.Fatal(err) + } + + rr := httptest.NewRecorder() + handler := http.HandlerFunc(limit(listen)) + handler.ServeHTTP(rr, req) + status := rr.Code + + if tt.wantErr && status != http.StatusOK { + return + } + + if status != http.StatusOK { + t.Errorf("handler returned wrong status code: got %v want %v", status, http.StatusOK) + } + + if rr.Body.String() != tt.want { + t.Errorf("handler returned unexpected body: got %v want %v", rr.Body.String(), tt.want) + } + }) + } +} + +func Test_isTrue(t *testing.T) { + type args struct { + s string + } + tests := []struct { + name string + args args + want bool + }{ + { + name: "true", + args: args{s: "true"}, + want: true, + }, + { + name: "yes", + args: args{s: "yes"}, + want: true, + }, + { + name: "TrUe", + args: args{s: "TrUe"}, + want: true, + }, + { + name: "YES", + args: args{s: "YES"}, + want: true, + }, + { + name: "false", + args: args{s: "false"}, + want: false, + }, + { + name: "no", + args: args{s: "no"}, + want: false, + }, + { + name: "Empty string", + args: args{s: ""}, + want: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := isTrue(tt.args.s); got != tt.want { + t.Errorf("isTrue() = %v, want %v", got, tt.want) + } + }) + } +} + +type helperWriteCloser struct { + io.Writer +} + +func (mwc *helperWriteCloser) Close() error { + return nil +} + 
// genOutput fabricates the JSON document the tests use as the stubbed
// analyzer reply for sentence.
//
// An empty sentence yields an empty string. Otherwise a single trailing
// newline is dropped, the sentence is split on single spaces, and every
// resulting token (including empty ones produced by consecutive spaces)
// becomes one object whose "lex" and "text" are the token itself, with a
// fixed weight of 1 and empty grammemes. The objects are comma-joined and
// wrapped in a JSON array. Tokens are inserted verbatim, without JSON
// escaping.
func genOutput(sentence string) string {
	if sentence == "" {
		return ""
	}

	tokens := strings.Split(strings.TrimSuffix(sentence, "\n"), " ")
	items := make([]string, len(tokens))
	for i := range tokens {
		items[i] = fmt.Sprintf("{\"analysis\":[{\"lex\":\"%s\",\"wt\":1,\"gr\":\"\"}],\"text\":\"%s\"}", tokens[i], tokens[i])
	}
	return "[" + strings.Join(items, ",") + "]"
}