Browse Source

add kanjidic and radicals

main
parent
commit
6e2c025e2a
6 changed files with 224 additions and 10 deletions
  1. +5
    -0
      initdb.d/01-function.sql
  2. +18
    -2
      initdb.d/10-wanikani.sql
  3. +1
    -4
      initdb.d/26-character.sql
  4. +2
    -4
      scripts/edict.ts
  5. +100
    -0
      scripts/kanjidic.ts
  6. +98
    -0
      scripts/radical.ts

+ 5
- 0
initdb.d/01-function.sql View File

@ -19,3 +19,8 @@ BEGIN
RETURN ARRAY[split_part($1,'.',1), replace($1,'.','')];
END;
$func$ LANGUAGE plpgsql IMMUTABLE;
-- Identity function over JSONB: returns its argument unchanged.
-- Exists so a second pgroonga index with a different tokenizer/normalizer can
-- be declared on the same column via a distinct index expression — see the
-- "idx_wanikani.subjects_data_kana" index, which uses pgroonga((identity("data"))).
CREATE OR REPLACE FUNCTION identity (JSONB) RETURNS JSONB AS
$func$
SELECT $1;
$func$ LANGUAGE SQL IMMUTABLE;

+ 18
- 2
initdb.d/10-wanikani.sql View File

@ -1,14 +1,30 @@
CREATE TABLE "wanikani.subjects" (
"id" INT PRIMARY KEY NOT NULL,
"id" INT NOT NULL,
"updatedAt" TIMESTAMPTZ NOT NULL,
"object" TEXT NOT NULL,
"url" TEXT NOT NULL,
"data" JSONB NOT NULL
"data" JSONB NOT NULL,
-- slug: string
-- characters?: string
-- character_images?: { url: string; content_type: string; metadata: any }[]
-- amalgamation_subject_ids: int[]
-- visually_similar_subject_ids?: int[]
-- component_subject_ids?: int[]
-- meanings: { meaning: string; accepted_answer: boolean }[]
-- readings: { reading: string; type: string; accepted_answer: boolean; primary: boolean }[]
-- context_sentences?: { en: string; ja: string }[]
PRIMARY KEY ("id")
);
CREATE INDEX "idx_wanikani.subjects_object" ON "wanikani.subjects" ("object");
CREATE INDEX "idx_wanikani.subjects_data" ON "wanikani.subjects"
USING pgroonga("data")
WITH (
plugins='token_filters/stem',
token_filters='TokenFilterStem'
);
CREATE INDEX "idx_wanikani.subjects_data_kana" ON "wanikani.subjects"
USING pgroonga((identity("data")))
WITH (
tokenizer='TokenMecab',
normalizer='NormalizerNFKC100("unify_kana", true)'

+ 1
- 4
initdb.d/26-character.sql View File

@ -2,10 +2,7 @@ CREATE TABLE "character" (
"entry" TEXT NOT NULL PRIMARY KEY,
"sub" TEXT[] NOT NULL,
"sup" TEXT[] NOT NULL,
"var" TEXT[] NOT NULL,
"amalgamation" TEXT[] NOT NULL,
"component" TEXT[] NOT NULL,
"visual" TEXT[] NOT NULL
"var" TEXT[] NOT NULL
);
CREATE INDEX idx_character_search ON "character"

+ 2
- 4
scripts/edict.ts View File

@ -88,11 +88,9 @@ export async function main(db: ConnectionPool) {
}
export async function constraint(db: ConnectionPool) {
const sReading = S.object().additionalProperties(
S.list(S.string()).minItems(1)
)
const sReading = S.object().additionalProperties(S.list(S.string()))
const colName = sql`translation`
const colName = sql`reading`
const constraintName = sql`c_entry_${colName}`
await db.query(sql`

+ 100
- 0
scripts/kanjidic.ts View File

@ -0,0 +1,100 @@
import createConnectionPool, { ConnectionPool, sql } from '@databases/pg'
import sqlite3 from 'better-sqlite3'
import { PythonShell } from 'python-shell'
/**
 * Import kanji from the bundled KANJIDIC SQLite dump into the "entry" table.
 *
 * Steps:
 *  1. Read every kanji row from ./assets/kanjidic.db.
 *  2. For kanji not already present in "frequency", ask scripts/freq.py for a
 *     frequency (one "word=frequency" reply line per query) and upsert results.
 *  3. Batch-insert the kanji as 'character' entries owned by uuid_nil(),
 *     preserving the raw (dotted) kunyomi strings in "_meta".
 *
 * @param db - open PostgreSQL connection pool; the caller owns its disposal
 */
export async function main(db: ConnectionPool) {
  const s3 = sqlite3('./assets/kanjidic.db')
  try {
    const rows = s3
      .prepare(
        /* sql */ `
      SELECT "kanji", "onyomi", "kunyomi", "nanori", "english" FROM "kanji"
      `
      )
      .all()

    // Kanji whose frequency is already stored — skip re-querying those.
    const fPrev = new Set<string>(
      await db
        .query(
          sql`
          SELECT "entry" FROM "frequency"
          `
        )
        .then((rs) => rs.map((r) => r.entry))
    )

    const py = new PythonShell('./scripts/freq.py', {
      mode: 'text',
      pythonOptions: ['-u'], // unbuffered, so replies stream back immediately
    })

    const fMap = new Map<string, number>()
    py.on('message', (msg: string) => {
      // Reply lines are "word=frequency"; drop malformed or non-numeric lines
      // so a bad reply can never inject NaN into the SQL insert below.
      const [w = '', f] = msg.split('=')
      const freq = Number(f)
      if (w && Number.isFinite(freq)) {
        fMap.set(w, freq)
      }
    })

    ;[...new Set(rows.map((r) => r.kanji as string))]
      .filter((v) => !fPrev.has(v))
      .map((v) => py.send(v))

    // Wait for the Python helper to flush all replies and exit.
    await new Promise<void>((resolve, reject) => {
      py.end((err) => (err ? reject(err) : resolve()))
    })

    const batchSize = 5000

    if (fMap.size) {
      const fRow = [...fMap]
      for (let i = 0; i < fRow.length; i += batchSize) {
        console.log(i)
        await db.query(sql`
        INSERT INTO "frequency" ("entry", "frequency")
        VALUES ${sql.join(
          fRow.slice(i, i + batchSize).map(([k, v]) => sql`(${k}, ${v})`),
          ','
        )}
        ON CONFLICT ("entry")
        DO UPDATE SET "frequency" = EXCLUDED."frequency";
        `)
      }
    }

    for (let i = 0; i < rows.length; i += batchSize) {
      console.log(i)
      const sublot = rows.slice(i, i + batchSize)
      await db.query(sql`
      INSERT INTO "entry" ("userId", "type", "entry", "key", "reading", "translation", "_meta")
      VALUES ${sql.join(
        sublot.map((r) => {
          const onyomi = JSON.parse(r.onyomi)
          const kunyomiRaw: string[] = JSON.parse(r.kunyomi)
          // A dotted reading like "ta.beru" is indexed both joined ("taberu")
          // and as its stem before the okurigana dot ("ta").
          const kunyomi = kunyomiRaw.flatMap((v) =>
            v.includes('.') ? [v.replace('.', ''), v.split('.')[0]] : v
          )
          const nanori = JSON.parse(r.nanori)
          const english = JSON.parse(r.english)
          return sql`(uuid_nil(), 'character', ${[
            r.kanji,
          ]}, ${`kanjidic:${r.kanji}`}, ${{
            onyomi,
            kunyomi,
            nanori,
          }}, ${{
            en: english,
          }}, ${{
            kunyomi: kunyomiRaw,
          }})`
        }),
        ','
      )}
      ON CONFLICT DO NOTHING;
      `)
    }
  } finally {
    // Always release the SQLite handle, even when an insert throws.
    s3.close()
  }
}
if (require.main === module) {
  // Standalone execution: run the import, log any failure instead of leaving
  // an unhandled rejection, and always dispose the pool so the process exits.
  const db = createConnectionPool({ bigIntMode: 'number' })
  main(db)
    .catch((err) => console.error(err))
    .finally(() => db.dispose())
}

+ 98
- 0
scripts/radical.ts View File

@ -0,0 +1,98 @@
import createConnectionPool, { ConnectionPool, sql } from '@databases/pg'
import sqlite3 from 'better-sqlite3'
import { PythonShell } from 'python-shell'
/**
 * Import radical decompositions from ./assets/radical.db into "character".
 *
 * Steps:
 *  1. Read every radical row (entry, sub, sup, var) from the SQLite dump.
 *  2. For each Han character appearing anywhere in a row that is not yet in
 *     "frequency", ask scripts/freq.py for a frequency and upsert the results.
 *  3. Batch-insert the decompositions, keeping only Han codepoints per column.
 *
 * @param db - open PostgreSQL connection pool; the caller owns its disposal
 */
export async function main(db: ConnectionPool) {
  // String.prototype.matchAll clones the regex, so one shared /g instance is safe.
  const HAN_RE = /\p{sc=Han}/gu
  // Every Han (CJK ideograph) codepoint in a string, in order of appearance.
  const han = (s: string): string[] => [...s.matchAll(HAN_RE)].map((m) => m[0]!)

  const s3 = sqlite3('./assets/radical.db')
  try {
    const rows = s3
      .prepare(
        /* sql */ `
      SELECT "entry", "sub", "sup", "var" FROM "radical"
      `
      )
      .all()

    // Characters whose frequency is already stored — skip re-querying those.
    const fPrev = new Set<string>(
      await db
        .query(
          sql`
          SELECT "entry" FROM "frequency"
          `
        )
        .then((rs) => rs.map((r) => r.entry))
    )

    const py = new PythonShell('./scripts/freq.py', {
      mode: 'text',
      pythonOptions: ['-u'], // unbuffered, so replies stream back immediately
    })

    const fMap = new Map<string, number>()
    py.on('message', (msg: string) => {
      // Reply lines are "word=frequency"; drop malformed or non-numeric lines
      // so a bad reply can never inject NaN into the SQL insert below.
      const [w = '', f] = msg.split('=')
      const freq = Number(f)
      if (w && Number.isFinite(freq)) {
        fMap.set(w, freq)
      }
    })

    // Query frequency for every distinct Han character in any column.
    ;[
      ...new Set(
        rows.flatMap((r) => han((r.entry + r.sub + r.sup + r.var) as string))
      ),
    ]
      .filter((v) => !fPrev.has(v))
      .map((v) => py.send(v))

    // Wait for the Python helper to flush all replies and exit.
    await new Promise<void>((resolve, reject) => {
      py.end((err) => (err ? reject(err) : resolve()))
    })

    const batchSize = 5000

    if (fMap.size) {
      const fRow = [...fMap]
      for (let i = 0; i < fRow.length; i += batchSize) {
        console.log(i)
        await db.query(sql`
        INSERT INTO "frequency" ("entry", "frequency")
        VALUES ${sql.join(
          fRow.slice(i, i + batchSize).map(([k, v]) => sql`(${k}, ${v})`),
          ','
        )}
        ON CONFLICT ("entry")
        DO UPDATE SET "frequency" = EXCLUDED."frequency";
        `)
      }
    }

    for (let i = 0; i < rows.length; i += batchSize) {
      console.log(i)
      const sublot = rows.slice(i, i + batchSize)
      await db.query(sql`
      INSERT INTO "character" ("entry", "sub", "sup", "var")
      VALUES ${sql.join(
        sublot.map(
          (r) =>
            sql`(${r.entry}, ${han(r.sub as string)}, ${han(
              r.sup as string
            )}, ${han(r.var as string)})`
        ),
        ','
      )}
      ON CONFLICT DO NOTHING;
      `)
    }
  } finally {
    // Always release the SQLite handle, even when an insert throws.
    s3.close()
  }
}
if (require.main === module) {
  // Standalone execution: run the import, log any failure instead of leaving
  // an unhandled rejection, and always dispose the pool so the process exits.
  const db = createConnectionPool({ bigIntMode: 'number' })
  main(db)
    .catch((err) => console.error(err))
    .finally(() => db.dispose())
}

Loading…
Cancel
Save