Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
rain1024 committed Oct 26, 2024
1 parent e9e304a commit 6b3f6f0
Showing 1 changed file with 7 additions and 1 deletion.
8 changes: 7 additions & 1 deletion apps/languagesv2/scripts/script_extract_2000_words.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json


class Word:
def __init__(self, id, text, freq):
self.id = id
Expand All @@ -9,9 +10,11 @@ def __init__(self, id, text, freq):
def __repr__(self):
return f"Word(id={self.id}, text='{self.text}', freq={self.freq})"


def normalize_text(text):
    """Return *text* converted to lowercase.

    Args:
        text: The string to normalize.

    Returns:
        A lowercased copy of ``text``.
    """
    return text.lower()


def extract_words_from_freq_file(n=2000, data_folder="/workspaces/data", filename="freq_vie_1M_2018-freq.txt"):
"""Extracts words from a frequency file and returns a list of Word objects."""
words = []
Expand All @@ -27,6 +30,7 @@ def extract_words_from_freq_file(n=2000, data_folder="/workspaces/data", filenam
break
return words


def export_words_to_js_file(words, output_file="data/VietnameseWords.js"):
"""Exports a list of Word objects to a JavaScript file in the specified format."""
with open(output_file, "w", encoding="utf-8") as file:
Expand All @@ -36,6 +40,7 @@ def export_words_to_js_file(words, output_file="data/VietnameseWords.js"):
file.write("];\n")
print(f"File '{output_file}' created successfully with {len(words)} entries.")


def export_words_to_jsonl_file(words, output_file="data/VietnameseWords.jsonl"):
"""Exports a list of Word objects to a JSONL file."""
with open(output_file, "w", encoding="utf-8") as file:
Expand All @@ -44,10 +49,11 @@ def export_words_to_jsonl_file(words, output_file="data/VietnameseWords.jsonl"):
file.write(json.dumps(word_data, ensure_ascii=False) + "\n")
print(f"File '{output_file}' created successfully with {len(words)} entries.")


if __name__ == "__main__":
    # Extract words from the frequency file
    words = extract_words_from_freq_file(n=2000)

    # Export words to JavaScript and JSONL files. Each exporter is called
    # exactly once; a second JSONL export would only rewrite the same file.
    export_words_to_js_file(words)
    export_words_to_jsonl_file(words)

0 comments on commit 6b3f6f0

Please sign in to comment.