@ -1,14 +1,30 @@ | |||
-- One row per WaniKani subject, keyed by "id".
-- NOTE: "wanikani.subjects" is a single quoted identifier, so the table is
-- literally named wanikani.subjects in the current schema; it is NOT a table
-- "subjects" inside a schema "wanikani".
CREATE TABLE "wanikani.subjects" (
  "id"        INT         NOT NULL,
  "updatedAt" TIMESTAMPTZ NOT NULL,
  "object"    TEXT        NOT NULL,
  "url"       TEXT        NOT NULL,
  "data"      JSONB       NOT NULL,
  -- Documented keys of "data" (informational only; nothing here is enforced
  -- by the schema):
  --   slug: string
  --   characters?: string
  --   character_images?: { url: string; content_type: string; metadata: any }[]
  --   amalgamation_subject_ids: int[]
  --   visually_similar_subject_ids?: int[]
  --   component_subject_ids?: int[]
  --   meanings: { meaning: string; accepted_answer: boolean }[]
  --   readings: { reading: string; type: string; accepted_answer: boolean; primary: boolean }[]
  --   context_sentences?: { en: string; ja: string }[]
  PRIMARY KEY ("id")
);
-- Index on the "object" column, using the default index method.
CREATE INDEX "idx_wanikani.subjects_object"
  ON "wanikani.subjects" ("object");
-- PGroonga index over the whole "data" JSONB column, configured with the
-- TokenFilterStem token filter (provided by the token_filters/stem plugin).
CREATE INDEX "idx_wanikani.subjects_data"
  ON "wanikani.subjects"
  USING pgroonga ("data")
  WITH (plugins = 'token_filters/stem', token_filters = 'TokenFilterStem');
CREATE INDEX "idx_wanikani.subjects_data_kana" ON "wanikani.subjects" | |||
USING pgroonga((identity("data"))) | |||
WITH ( | |||
tokenizer='TokenMecab', | |||
normalizer='NormalizerNFKC100("unify_kana", true)' | |||
@ -0,0 +1,100 @@ | |||
import createConnectionPool, { ConnectionPool, sql } from '@databases/pg' | |||
import sqlite3 from 'better-sqlite3' | |||
import { PythonShell } from 'python-shell' | |||
/**
 * Imports the kanji stored in `./assets/kanjidic.db` into PostgreSQL.
 *
 * 1. Reads every row of the SQLite "kanji" table.
 * 2. Sends each kanji that has no row in "frequency" yet to
 *    `./scripts/freq.py` and upserts the `<entry>=<frequency>` lines the
 *    script prints back. The script is not started when nothing is missing.
 * 3. Inserts one 'character' row per kanji into "entry", skipping conflicts.
 *
 * Both inserts run sequentially in batches of 5000 rows and log the offset of
 * each batch.
 *
 * @param db - Pool used for every PostgreSQL query; it is not disposed here.
 * @returns Resolves once all batches have been written.
 * @throws Rejects when the SQLite asset cannot be opened or read, the Python
 *   helper fails, a JSON column cannot be parsed, or a query fails.
 */
export async function main(db: ConnectionPool): Promise<void> {
  /**
   * Columns selected from the SQLite "kanji" table. Everything except
   * `kanji` is JSON text (it is passed through `JSON.parse` below).
   */
  interface KanjiRow {
    kanji: string
    onyomi: string
    kunyomi: string
    nanori: string
    english: string
  }

  const batchSize = 5000

  // fileMustExist: fail fast instead of letting SQLite create an empty
  // database file when the asset is missing.
  const s3 = sqlite3('./assets/kanjidic.db', { fileMustExist: true })
  let rows: KanjiRow[]
  try {
    rows = s3
      .prepare(
        /* sql */ `
        SELECT "kanji", "onyomi", "kunyomi", "nanori", "english" FROM "kanji"
        `
      )
      .all() as KanjiRow[]
  } finally {
    // All rows are in memory now, so the handle can be released right away.
    s3.close()
  }

  const known = await db.query(sql`
    SELECT "entry" FROM "frequency"
  `)
  const fPrev = new Set<string>(known.map((r) => r.entry))

  // Distinct kanji that do not have a stored frequency yet.
  const pending = [...new Set(rows.map((r) => r.kanji))].filter(
    (v) => !fPrev.has(v)
  )

  const fMap = new Map<string, number>()
  if (pending.length > 0) {
    const py = new PythonShell('./scripts/freq.py', {
      mode: 'text',
      pythonOptions: ['-u'],
    })

    py.on('message', (msg: string) => {
      const [w = '', f = ''] = msg.split('=')
      const freq = Number(f)
      // Ignore lines that are not `<entry>=<number>`; otherwise an empty
      // entry or a NaN frequency would be upserted below.
      if (w === '' || f.trim() === '' || !Number.isFinite(freq)) {
        console.warn(
          `Ignoring malformed line from freq.py: ${JSON.stringify(msg)}`
        )
        return
      }
      fMap.set(w, freq)
    })

    await new Promise<void>((resolve, reject) => {
      // Without a listener, an 'error' event (e.g. the interpreter could not
      // be spawned) would be thrown as an uncaught exception.
      py.once('error', reject)
      for (const v of pending) {
        py.send(v)
      }
      py.end((err) => (err ? reject(err) : resolve()))
    })
  }

  const fRow = [...fMap]
  for (let i = 0; i < fRow.length; i += batchSize) {
    console.log(i)
    await db.query(sql`
      INSERT INTO "frequency" ("entry", "frequency")
      VALUES ${sql.join(
        fRow.slice(i, i + batchSize).map(([k, v]) => sql`(${k}, ${v})`),
        ','
      )}
      ON CONFLICT ("entry")
      DO UPDATE SET "frequency" = EXCLUDED."frequency";
    `)
  }

  for (let i = 0; i < rows.length; i += batchSize) {
    console.log(i)
    const sublot = rows.slice(i, i + batchSize)
    await db.query(sql`
      INSERT INTO "entry" ("userId", "type", "entry", "key", "reading", "translation", "_meta")
      VALUES ${sql.join(
        sublot.map((r) => {
          const onyomi: unknown = JSON.parse(r.onyomi)
          const kunyomiRaw: string[] = JSON.parse(r.kunyomi)
          // A reading containing "." contributes two values: the reading with
          // its first "." removed and the part before that "."
          // (e.g. "た.べる" -> "たべる", "た"). Other readings pass through.
          const kunyomi = kunyomiRaw.flatMap((v) => {
            const dot = v.indexOf('.')
            return dot === -1 ? v : [v.replace('.', ''), v.slice(0, dot)]
          })
          const nanori: unknown = JSON.parse(r.nanori)
          const english: unknown = JSON.parse(r.english)

          // The unmodified kunyomi strings are kept in "_meta".
          return sql`(uuid_nil(), 'character', ${[
            r.kanji,
          ]}, ${`kanjidic:${r.kanji}`}, ${{
            onyomi,
            kunyomi,
            nanori,
          }}, ${{
            en: english,
          }}, ${{
            kunyomi: kunyomiRaw,
          }})`
        }),
        ','
      )}
      ON CONFLICT DO NOTHING;
    `)
  }
}
// Run the import only when this file is executed directly, not when it is
// imported as a module.
if (require.main === module) {
  const db = createConnectionPool({ bigIntMode: 'number' })
  void main(db)
    // Always release the pool; otherwise its open connections keep the
    // process from exiting.
    .finally(() => db.dispose())
    // Report failures (from main or from dispose) instead of leaving an
    // unhandled promise rejection, and signal failure to the caller.
    .catch((err: unknown) => {
      console.error(err)
      process.exitCode = 1
    })
}
@ -0,0 +1,98 @@ | |||
import createConnectionPool, { ConnectionPool, sql } from '@databases/pg' | |||
import sqlite3 from 'better-sqlite3' | |||
import { PythonShell } from 'python-shell' | |||
/**
 * Imports the rows of `./assets/radical.db` into PostgreSQL.
 *
 * 1. Reads every row of the SQLite "radical" table.
 * 2. Collects every Han-script character found in the "entry", "sub", "sup"
 *    and "var" columns, sends those that have no row in "frequency" yet to
 *    `./scripts/freq.py`, and upserts the `<entry>=<frequency>` lines the
 *    script prints back. The script is not started when nothing is missing.
 * 3. Inserts one row per source row into "character", storing "sub", "sup"
 *    and "var" as arrays of their Han characters and skipping conflicts.
 *
 * Both inserts run sequentially in batches of 5000 rows and log the offset of
 * each batch.
 *
 * @param db - Pool used for every PostgreSQL query; it is not disposed here.
 * @returns Resolves once all batches have been written.
 * @throws Rejects when the SQLite asset cannot be opened or read, the Python
 *   helper fails, or a query fails.
 */
export async function main(db: ConnectionPool): Promise<void> {
  /**
   * Columns selected from the SQLite "radical" table.
   * NOTE(review): "sub"/"sup"/"var" are typed as nullable defensively; the
   * SQLite schema is not visible here - confirm whether NULLs can occur.
   */
  interface RadicalRow {
    entry: string
    sub: string | null
    sup: string | null
    var: string | null
  }

  const batchSize = 5000

  /** Returns every Han-script character in `text`, in order ([] for null/empty). */
  const hanChars = (text: string | null | undefined): string[] =>
    text?.match(/\p{sc=Han}/gu) ?? []

  // fileMustExist: fail fast instead of letting SQLite create an empty
  // database file when the asset is missing.
  const s3 = sqlite3('./assets/radical.db', { fileMustExist: true })
  let rows: RadicalRow[]
  try {
    rows = s3
      .prepare(
        /* sql */ `
        SELECT "entry", "sub", "sup", "var" FROM "radical"
        `
      )
      .all() as RadicalRow[]
  } finally {
    // All rows are in memory now, so the handle can be released right away.
    s3.close()
  }

  const known = await db.query(sql`
    SELECT "entry" FROM "frequency"
  `)
  const fPrev = new Set<string>(known.map((r) => r.entry))

  // Distinct Han characters across all four columns that do not have a
  // stored frequency yet.
  const pending = [
    ...new Set(
      rows.flatMap((r) => [r.entry, r.sub, r.sup, r.var].flatMap(hanChars))
    ),
  ].filter((v) => !fPrev.has(v))

  const fMap = new Map<string, number>()
  if (pending.length > 0) {
    const py = new PythonShell('./scripts/freq.py', {
      mode: 'text',
      pythonOptions: ['-u'],
    })

    py.on('message', (msg: string) => {
      const [w = '', f = ''] = msg.split('=')
      const freq = Number(f)
      // Ignore lines that are not `<entry>=<number>`; otherwise an empty
      // entry or a NaN frequency would be upserted below.
      if (w === '' || f.trim() === '' || !Number.isFinite(freq)) {
        console.warn(
          `Ignoring malformed line from freq.py: ${JSON.stringify(msg)}`
        )
        return
      }
      fMap.set(w, freq)
    })

    await new Promise<void>((resolve, reject) => {
      // Without a listener, an 'error' event (e.g. the interpreter could not
      // be spawned) would be thrown as an uncaught exception.
      py.once('error', reject)
      for (const v of pending) {
        py.send(v)
      }
      py.end((err) => (err ? reject(err) : resolve()))
    })
  }

  const fRow = [...fMap]
  for (let i = 0; i < fRow.length; i += batchSize) {
    console.log(i)
    await db.query(sql`
      INSERT INTO "frequency" ("entry", "frequency")
      VALUES ${sql.join(
        fRow.slice(i, i + batchSize).map(([k, v]) => sql`(${k}, ${v})`),
        ','
      )}
      ON CONFLICT ("entry")
      DO UPDATE SET "frequency" = EXCLUDED."frequency";
    `)
  }

  for (let i = 0; i < rows.length; i += batchSize) {
    console.log(i)
    const sublot = rows.slice(i, i + batchSize)
    await db.query(sql`
      INSERT INTO "character" ("entry", "sub", "sup", "var")
      VALUES ${sql.join(
        sublot.map(
          (r) =>
            sql`(${r.entry}, ${hanChars(r.sub)}, ${hanChars(
              r.sup
            )}, ${hanChars(r.var)})`
        ),
        ','
      )}
      ON CONFLICT DO NOTHING;
    `)
  }
}
// Run the import only when this file is executed directly, not when it is
// imported as a module.
if (require.main === module) {
  const db = createConnectionPool({ bigIntMode: 'number' })
  void main(db)
    // Always release the pool; otherwise its open connections keep the
    // process from exiting.
    .finally(() => db.dispose())
    // Report failures (from main or from dispose) instead of leaving an
    // unhandled promise rejection, and signal failure to the caller.
    .catch((err: unknown) => {
      console.error(err)
      process.exitCode = 1
    })
}