-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
12 changed files
with
703 additions
and
160 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,156 @@ | ||
import * as Path from "node:path"; | ||
import { readdir, open, mkdir } from "node:fs/promises"; | ||
import { createGzip } from "node:zlib"; | ||
import { createWriteStream, createReadStream } from "node:fs"; | ||
import { Readable, Duplex, compose } from "node:stream"; | ||
import { pipeline } from "node:stream/promises"; | ||
import sqlLint from "sql-lint"; | ||
import csvtojson from "csvtojson"; | ||
|
||
const SEGMENT_LENGTH = Math.pow(10, 3); | ||
|
||
main(process.argv.slice(2)).catch(async (e) => { | ||
console.error(e); | ||
process.exit(1); | ||
}); | ||
|
||
function formatRawCSV(rawCSV) { | ||
return rawCSV | ||
.map((value, index) => { | ||
if ((0 === index || 2 === index) && value.match(/^\d+$/)) { | ||
return value; | ||
} | ||
if ("" === value) { | ||
return "NULL"; | ||
} | ||
return `'${ | ||
value | ||
.replaceAll(/'/g, it => `'${it}`) | ||
}'`; | ||
}) | ||
.join(","); | ||
} | ||
|
||
function formatChunk(rawCSVList) { | ||
return rawCSVList.map(formatRawCSV).join("),\n ("); | ||
} | ||
|
||
function processNoauiTable(sourceName, tableName) { | ||
return async function* () { | ||
let headerList; | ||
const csv = csvtojson({ | ||
output: "csv", | ||
}); | ||
csv.on("header", (header) => { | ||
headerList = header; | ||
}); | ||
const chunkRowCSVList = await csv.fromFile( | ||
Path.resolve("init_sources", sourceName), | ||
"utf-8" | ||
); | ||
const sql = `INSERT INTO ${tableName} | ||
("${headerList.join(`","`)}") | ||
VALUES | ||
(${formatChunk(chunkRowCSVList)});\n`; | ||
|
||
const errors = await sqlLint.default({ | ||
sql, | ||
}); | ||
if (errors.length) { | ||
console.error(errors); | ||
setTimeout(() => { | ||
throw new Error(sql); | ||
}); | ||
} | ||
yield sql; | ||
}; | ||
} | ||
|
||
function processChhoeTaigiTable({ supressError }) { | ||
let headerList; | ||
const csv = csvtojson({ | ||
output: "csv", | ||
}); | ||
csv.on("header", (header) => { | ||
headerList = header; | ||
}); | ||
return compose( | ||
createReadStream(Path.resolve("init_sources", "JOINED.csv"), "utf-8"), | ||
csv, | ||
async function* (source) { | ||
let rowCSVList = []; | ||
async function flush() { | ||
if (!headerList) { | ||
console.error(kind, detail, headerList, rowCSVList); | ||
console.flush(); | ||
return; | ||
} | ||
const chunkRowCSVList = rowCSVList.splice(0, SEGMENT_LENGTH); | ||
const sql = `INSERT INTO "public"."ChhoeTaigi" | ||
("${headerList.join(`","`)}") | ||
VALUES | ||
(${formatChunk(chunkRowCSVList)});\n`; | ||
|
||
if (true !== supressError) { | ||
const errors = await sqlLint.default({ | ||
sql, | ||
}); | ||
if (errors.length) { | ||
console.error(errors); | ||
setTimeout(() => { | ||
throw new Error(sql); | ||
}); | ||
} | ||
} | ||
return sql; | ||
} | ||
for await (const chunk of source) { | ||
const chunkStr = chunk.toString("utf-8").trim(); | ||
rowCSVList.push( | ||
...JSON.parse(`[ | ||
${ | ||
"," === chunkStr[chunkStr.length - 1] | ||
? chunkStr.substr(0, chunkStr.length - 1) | ||
: chunkStr | ||
} | ||
]`) | ||
); | ||
while (SEGMENT_LENGTH <= rowCSVList.length) { | ||
yield flush(); | ||
} | ||
} | ||
if (rowCSVList.length) { | ||
yield flush(); | ||
} | ||
} | ||
); | ||
} | ||
|
||
async function main() { | ||
const entriesSQLName = `init_entries.sql.gz`; | ||
const writeTo = compose( | ||
createGzip(), | ||
createWriteStream(Path.resolve(entriesSQLName), { | ||
autoClose: false, | ||
flush: true, | ||
}) | ||
); | ||
await pipeline( | ||
Readable.from([ | ||
processNoauiTable( | ||
"Chu-liāu Nôa-ūi Pió (To Gí-giân) - JoinedDictColumnName.csv", | ||
`"public"."NoauiChhoeTaigi"` | ||
), | ||
processNoauiTable( | ||
"Chu-liāu Nôa-ūi Pió (To Gí-giân) - DictCode.csv", | ||
`"public"."NoauiDict"` | ||
), | ||
processChhoeTaigiTable({ supressError: true }), | ||
]).flatMap((it) => Duplex.from(it)), | ||
writeTo | ||
); | ||
writeTo.end(); | ||
console.log(`Done!`); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
FROM library/postgres | ||
COPY init_tables.sql /docker-entrypoint-initdb.d/0001_init_tables.sql | ||
COPY init_entries.sql.gz /docker-entrypoint-initdb.d/0002_init_entries.sql.gz |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
CREATE TABLE "public"."NoauiChhoeTaigi" ( | ||
"id" SERIAL PRIMARY KEY, | ||
"JoinedDictColumnName" varchar(40), | ||
"漢Lô" varchar(40), | ||
"Pe̍h-ōe-jī" varchar(40), | ||
"English" varchar(40), | ||
"日本語" varchar(40), | ||
"中文" varchar(40), | ||
CONSTRAINT "JoinedDictColumnName_key" UNIQUE ("JoinedDictColumnName") | ||
); | ||
CREATE TABLE "public"."NoauiDict" ( | ||
"id" SERIAL PRIMARY KEY, | ||
"DictCode" varchar(40), | ||
"漢Lô" varchar(40), | ||
"Pe̍h-ōe-jī" varchar(40), | ||
"English" varchar(40), | ||
"日本語" varchar(40), | ||
"中文" varchar(40), | ||
"noauiChhoeTaigiIdList" integer [], | ||
CONSTRAINT "DictCode_key" UNIQUE ("DictCode") | ||
); | ||
CREATE INDEX "NoauiDict_DictCode_idx" ON "public"."NoauiDict" USING btree ("DictCode"); | ||
CREATE TABLE "public"."ChhoeTaigi" ( | ||
"JoinedWordID" integer PRIMARY KEY, | ||
"DictCode" varchar(40) REFERENCES "NoauiDict"("DictCode"), | ||
"DictWordID" integer NOT NULL, | ||
"SuBe" text, | ||
"PojUnicode" text, | ||
"PojUnicodeOthers" text, | ||
"PojInput" text, | ||
"PojInputOthers" text, | ||
"KipUnicode" text, | ||
"KipUnicodeOthers" text, | ||
"KipInput" text, | ||
"KipInputOthers" text, | ||
"LmjUnicode" text, | ||
"LmjUnicodeOthers" text, | ||
"HanLoTaibunPoj" text, | ||
"HanLoTaibunKip" text, | ||
"KaisoehPoj" text, | ||
"KaisoehKip" text, | ||
"KaisoehHanLoPoj" text, | ||
"KaisoehHanLoKip" text, | ||
"KaisoehEngbun" text, | ||
"KaisoehJitbunPoj" text, | ||
"LekuPoj" text, | ||
"LekuKip" text, | ||
"LekuHanLoPoj" text, | ||
"LekuHanLoKip" text, | ||
"LekuEngbun" text, | ||
"LekuHoabun" text, | ||
"LesuPoj" text, | ||
"LekuJitbunPoj" text, | ||
"HoaBun" text, | ||
"EngBun" text, | ||
"JitBun" text, | ||
"KhehBun" text, | ||
"HanbunImPojUnicode" text, | ||
"HanbunImPojInput" text, | ||
"HanbunImKipUnicode" text, | ||
"HanbunImKipInput" text, | ||
"KahToo" text, | ||
"Abbreviation" text, | ||
"NounClassifier" text, | ||
"Reduplication" text, | ||
"Synonym" text, | ||
"Opposite" text, | ||
"Confer" text, | ||
"Hunlui" text, | ||
"HunluiJitbun" text, | ||
"KipDictDialects" text, | ||
"KipDictWordProperty" text, | ||
"KipDictHanjiTaibunOthers" text, | ||
"KipDictKaisoehKhehbun" text, | ||
"SoanntengMuitheSekinPoochhiongChuliau" text, | ||
"SoanntengMuitheSekinHongsangJitki" text, | ||
"SoanntengMuitheSekinChute" text, | ||
"PageNumber" text, | ||
"PageNumberChianoePan" text, | ||
"SuBeSoatbeng" text, | ||
"StoreLink" text, | ||
"LaigoanMia" text, | ||
"LaigoanBangchi" text, | ||
"ChhehMia" text, | ||
"ChhoetaigiPhianChu" text, | ||
CONSTRAINT "ChhoeTaigi_DictCode_DictWordID_key" UNIQUE ("DictCode", "DictWordID") | ||
) WITH (oids = false); | ||
CREATE INDEX "ChhoeTaigi_DictCode_idx" ON "public"."ChhoeTaigi" USING btree ("DictCode"); | ||
CREATE INDEX "ChhoeTaigi_EngBun_idx" ON "public"."ChhoeTaigi" USING btree ("EngBun"); | ||
CREATE INDEX "ChhoeTaigi_HoaBun_idx" ON "public"."ChhoeTaigi" USING btree ("HoaBun"); | ||
CREATE INDEX "ChhoeTaigi_JitBun_idx" ON "public"."ChhoeTaigi" USING btree ("JitBun"); | ||
CREATE INDEX "ChhoeTaigi_KipInput_idx" ON "public"."ChhoeTaigi" USING btree ("KipInput"); | ||
CREATE INDEX "ChhoeTaigi_KipUnicode_idx" ON "public"."ChhoeTaigi" USING btree ("KipUnicode"); | ||
CREATE INDEX "ChhoeTaigi_PojInput_idx" ON "public"."ChhoeTaigi" USING btree ("PojInput"); | ||
CREATE INDEX "ChhoeTaigi_PojUnicode_idx" ON "public"."ChhoeTaigi" USING btree ("PojUnicode"); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
{ | ||
"name": "db", | ||
"version": "1.0.0", | ||
"description": "", | ||
"scripts": { | ||
"build": "node build.mjs", | ||
"test": "echo \"Error: no test specified\" && exit 1" | ||
}, | ||
"keywords": [], | ||
"author": "", | ||
"dependencies": { | ||
"csvtojson": "^2.0.10", | ||
"sql-lint": "^1.0.0", | ||
"uuid": "^9.0.1" | ||
} | ||
} |
Oops, something went wrong.