Browse Source

add edict

main
parent
commit
8bb5d996b9
11 changed files with 132 additions and 52 deletions
  1. +1
    -0
      .dockerignore
  2. +1
    -0
      .gitignore
  3. +2
    -5
      initdb.d/01-function.sql
  4. +14
    -40
      initdb.d/22-entry.sql
  5. +6
    -6
      initdb.d/24-preset.sql
  6. +7
    -0
      initdb.d/27-freq.sql
  7. +2
    -1
      package.json
  8. +0
    -0
      scripts/.gitkeep
  9. +87
    -0
      scripts/edict.ts
  10. +7
    -0
      scripts/freq.py
  11. +5
    -0
      yarn.lock

+ 1
- 0
.dockerignore View File

@ -5,3 +5,4 @@
!yarn.lock
!/cache
!/wk-api
!/scripts/

+ 1
- 0
.gitignore View File

@ -258,3 +258,4 @@ cython_debug/
/pgdata
/wk-api
/assets/

+ 2
- 5
initdb.d/01-function.sql View File

@ -10,15 +10,12 @@ CREATE FUNCTION array_distinct(anyarray) RETURNS anyarray AS $f$
SELECT array_agg(DISTINCT x) FROM unnest($1) t(x);
$f$ LANGUAGE SQL IMMUTABLE;
CREATE OR REPLACE FUNCTION normalize_reading (TEXT[]) RETURNS TEXT[] AS
CREATE OR REPLACE FUNCTION normalize_reading (TEXT) RETURNS TEXT[] AS
$func$
DECLARE
s TEXT;
new_arr TEXT[] := '{}';
BEGIN
FOREACH s IN ARRAY $1||'{}'::text[] LOOP
new_arr := new_arr||ARRAY[split_part(s,'.',1), replace(s,'.','')];
END LOOP;
RETURN new_arr;
RETURN ARRAY[split_part($1,'.',1), replace($1,'.','')];
END;
$func$ LANGUAGE plpgsql IMMUTABLE;

+ 14
- 40
initdb.d/22-entry.sql View File

@ -1,17 +1,20 @@
CREATE TABLE "entry" (
"id" UUID NOT NULL PRIMARY KEY DEFAULT uuid_generate_v4(),
"id" UUID NOT NULL DEFAULT uuid_generate_v4(),
"createdAt" TIMESTAMPTZ DEFAULT now(),
"updatedAt" TIMESTAMPTZ DEFAULT now(),
"userId" UUID NOT NULL REFERENCES "user"("id") ON DELETE CASCADE,
"type" TEXT NOT NULL,
"entry" TEXT[] NOT NULL CHECK ("entry"[1] IS NOT NULL),
-- "reading" -- backref "entry.reading"
-- "translation" -- backref "entry.translation"
"reading" JSONB NOT NULL DEFAULT '{}', -- TODO: jsonschema
"translation" JSONB NOT NULL DEFAULT '{}', -- TODO: jsonschema
"description" TEXT NOT NULL DEFAULT '',
"tag" TEXT[] NOT NULL DEFAULT '{}',
"level" FLOAT NOT NULL,
"level.kanji" INT NOT NULL,
"frequency" FLOAT
"level" FLOAT,
"level.kanji" INT,
"frequency" FLOAT,
"key" TEXT UNIQUE,
"_meta" JSONB, -- TODO: jsonschema
PRIMARY KEY ("id")
);
CREATE TRIGGER "t_entry_updatedAt"
@ -19,8 +22,6 @@ CREATE TRIGGER "t_entry_updatedAt"
FOR EACH ROW
EXECUTE PROCEDURE "f_updatedAt"();
CREATE UNIQUE INDEX idx_entry_u ON "entry" (("entry"[1]), "type", "userId");
CREATE INDEX "idx_entry_updatedAt" ON "entry" ("updatedAt");
CREATE INDEX "idx_entry_userId" ON "entry" ("userId");
CREATE INDEX "idx_entry_type" ON "entry" ("type");
@ -41,39 +42,12 @@ CREATE INDEX "idx_entry_entry" ON "entry"
CREATE INDEX "idx_entry_entry_gin" ON "entry" USING GIN("entry");
CREATE INDEX "idx_entry_tag" ON "entry" USING pgroonga ("tag");
CREATE TABLE "entry.reading" (
"id" UUID NOT NULL PRIMARY KEY DEFAULT uuid_generate_v4(),
"createdAt" TIMESTAMPTZ DEFAULT now(),
"updatedAt" TIMESTAMPTZ DEFAULT now(),
"userId" UUID NOT NULL REFERENCES "user"("id") ON DELETE CASCADE,
"entryId" UUID NOT NULL REFERENCES "entry"("id") ON DELETE CASCADE,
"type" TEXT NOT NULL,
"reading" TEXT[] NOT NULL DEFAULT '{}' CHECK ("reading"[1] IS NOT NULL)
);
CREATE INDEX "idx_entry.reading_updatedAt" ON "entry.reading" ("updatedAt");
CREATE INDEX "idx_entry.reading_userId" ON "entry.reading" ("userId");
CREATE INDEX "idx_entry.reading_type" ON "entry.reading" ("type");
CREATE INDEX "idx_entry.reading_reading" ON "entry.reading"
USING pgroonga (normalize_reading("reading"))
CREATE INDEX "idx_entry_meta" ON "entry" USING pgroonga("_meta");
CREATE INDEX "idx_entry_reading" ON "entry"
USING pgroonga("reading")
WITH (
tokenizer='TokenMecab',
normalizer='NormalizerNFKC100("unify_kana", true)'
);
CREATE TABLE "entry.translation" (
"id" UUID NOT NULL PRIMARY KEY DEFAULT uuid_generate_v4(),
"createdAt" TIMESTAMPTZ DEFAULT now(),
"updatedAt" TIMESTAMPTZ DEFAULT now(),
"userId" UUID NOT NULL REFERENCES "user"("id") ON DELETE CASCADE,
"entryId" UUID NOT NULL REFERENCES "entry"("id") ON DELETE CASCADE,
"language" TEXT NOT NULL,
"translation" TEXT[] NOT NULL DEFAULT '{}' CHECK ("translation"[1] IS NOT NULL)
);
CREATE INDEX "idx_entry.translation_updatedAt" ON "entry.translation" ("updatedAt");
CREATE INDEX "idx_entry.translation_userId" ON "entry.translation" ("userId");
CREATE INDEX "idx_entry.translation_language" ON "entry.translation" ("language");
CREATE INDEX "idx_entry.translation_translation" ON "entry.translation"
USING pgroonga("translation")
WITH (plugins='token_filters/stem', token_filters='TokenFilterStem');
CREATE INDEX "idx_entry_translation" ON "entry"
USING pgroonga("translation");

+ 6
- 6
initdb.d/24-preset.sql View File

@ -1,4 +1,4 @@
CREATE TABLE "quiz_preset" (
CREATE TABLE "preset.quiz" (
"id" UUID NOT NULL PRIMARY KEY DEFAULT uuid_generate_v4(),
"createdAt" TIMESTAMPTZ DEFAULT now(),
"updatedAt" TIMESTAMPTZ DEFAULT now(),
@ -7,12 +7,12 @@ CREATE TABLE "quiz_preset" (
"settings" JSONB NOT NULL
);
CREATE TRIGGER "t_quiz_preset_updatedAt"
BEFORE UPDATE ON "quiz_preset"
CREATE TRIGGER "t_preset.quiz_updatedAt"
BEFORE UPDATE ON "preset.quiz"
FOR EACH ROW
EXECUTE PROCEDURE "f_updatedAt"();
CREATE INDEX "idx_quiz_preset_updatedAt" ON "quiz_preset" ("updatedAt");
CREATE INDEX "idx_quiz_preset_userId" ON "quiz_preset" ("userId");
CREATE INDEX "idx_quiz_preset_name" ON "quiz_preset"
CREATE INDEX "idx_preset.quiz_updatedAt" ON "preset.quiz" ("updatedAt");
CREATE INDEX "idx_preset.quiz_userId" ON "preset.quiz" ("userId");
CREATE INDEX "idx_preset.quiz_name" ON "preset.quiz"
USING pgroonga ("name");

+ 7
- 0
initdb.d/27-freq.sql View File

@ -0,0 +1,7 @@
-- Per-word frequency cache, populated by scripts/edict.ts via scripts/freq.py.
CREATE TABLE "frequency" (
    "entry"     TEXT  NOT NULL PRIMARY KEY, -- the word itself (natural key)
    "frequency" FLOAT NOT NULL              -- Zipf frequency score for the word
);

-- Supports range scans and ordering by frequency score.
CREATE INDEX "idx_frequency_frequency" ON "frequency" ("frequency");

+ 2
- 1
package.json View File

@ -15,7 +15,8 @@
"better-sqlite3": "^7.4.3",
"fast-glob": "^3.2.7",
"js-yaml": "^4.1.0",
"jsonschema-definer": "^1.3.2"
"jsonschema-definer": "^1.3.2",
"python-shell": "^3.0.1"
},
"devDependencies": {
"@types/better-sqlite3": "^7.4.0",

+ 0
- 0
scripts/.gitkeep View File


+ 87
- 0
scripts/edict.ts View File

@ -0,0 +1,87 @@
import createConnectionPool, { ConnectionPool, sql } from '@databases/pg'
import sqlite3 from 'better-sqlite3'
import { PythonShell } from 'python-shell'
/**
 * Import EDICT dictionary entries from a local SQLite dump into the "entry"
 * table, first computing word frequencies (via scripts/freq.py) for any
 * entry words not already cached in the "frequency" table.
 *
 * @param db - an open Postgres connection pool; the caller owns its lifecycle
 *             (this function does not dispose it).
 */
async function main(db: ConnectionPool) {
  // Source dictionary: pre-built SQLite file; entry/reading/english columns
  // hold JSON-encoded arrays.
  const s3 = sqlite3('./assets/edict.db')
  const rows = s3
    .prepare(
      /* sql */ `
    SELECT "id", "entry", "reading", "english" FROM "edict"
    `
    )
    .all()
  // All rows are materialized by .all(); close the handle instead of leaking it.
  s3.close()

  // Words whose frequency is already stored — skip recomputing those.
  const fPrev = new Set<string>(
    await db
      .query(
        sql`
      SELECT "entry" FROM "frequency"
      `
      )
      .then((rs) => rs.map((r) => r.entry))
  )

  // freq.py reads words on stdin and prints one "word=frequency" line each.
  const py = new PythonShell('./scripts/freq.py', {
    mode: 'text',
    pythonOptions: ['-u'], // unbuffered, so messages stream back promptly
  })
  const fMap = new Map<string, number>()
  py.on('message', (msg: string) => {
    // Split on the LAST '=' so a word that itself contains '=' still parses;
    // a plain split('=') would take the middle segment as the number.
    const sep = msg.lastIndexOf('=')
    if (sep < 0) return // malformed line: skip rather than store NaN
    const w = msg.slice(0, sep)
    const f = Number(msg.slice(sep + 1))
    if (w && Number.isFinite(f)) {
      fMap.set(w, f)
    }
  })

  // Feed every distinct, not-yet-cached word to the Python process.
  ;[...new Set<string>(rows.flatMap((r) => JSON.parse(r.entry)))]
    .filter((v) => !fPrev.has(v))
    .forEach((v) => py.send(v))

  // Close stdin and wait for the Python process to flush all output.
  await new Promise<void>((resolve, reject) => {
    py.end((err) => (err ? reject(err) : resolve()))
  })

  const batchSize = 5000

  // Upsert the newly computed frequencies in batches.
  if (fMap.size) {
    const fRow = [...fMap]
    for (let i = 0; i < fRow.length; i += batchSize) {
      console.log(i)
      await db.query(sql`
      INSERT INTO "frequency" ("entry", "frequency")
      VALUES ${sql.join(
        fRow.slice(i, i + batchSize).map(([k, v]) => sql`(${k}, ${v})`),
        ','
      )}
      ON CONFLICT ("entry")
      DO UPDATE SET "frequency" = EXCLUDED."frequency";
      `)
    }
  }

  // Insert dictionary entries in batches; rows that already exist (same
  // unique key) are left untouched.
  for (let i = 0; i < rows.length; i += batchSize) {
    console.log(i)
    const sublot = rows.slice(i, i + batchSize)
    await db.query(sql`
    INSERT INTO "entry" ("userId", "type", "entry", "key", "reading", "translation")
    VALUES ${sql.join(
      sublot.map(
        (r) =>
          sql`(uuid_nil(), 'vocabulary', ${JSON.parse(
            r.entry
          )}, ${`edict:${r.id}`}, ${{
            _: JSON.parse(r.reading),
          }}, ${{ en: JSON.parse(r.english) }})`
      ),
      ','
    )}
    ON CONFLICT DO NOTHING;
    `)
  }
}
if (require.main === module) {
  const db = createConnectionPool({ bigIntMode: 'number' })
  // Surface failures (and set a non-zero exit code) instead of producing an
  // unhandled promise rejection, and always dispose the pool so the process
  // can exit instead of hanging on open connections.
  main(db)
    .catch((err) => {
      console.error(err)
      process.exitCode = 1
    })
    .finally(() => db.dispose())
}

+ 7
- 0
scripts/freq.py View File

@ -0,0 +1,7 @@
"""Read words from stdin; print a "word=frequency" line for each (Japanese)."""
import sys

from wordfreq import zipf_frequency

for raw in sys.stdin:
    word = raw.rstrip()
    # Output format is parsed by scripts/edict.ts: "<word>=<zipf frequency>"
    print(f"{word}={zipf_frequency(word, 'ja')}")

+ 5
- 0
yarn.lock View File

@ -918,6 +918,11 @@ punycode@^2.1.0:
resolved "https://registry.yarnpkg.com/punycode/-/punycode-2.1.1.tgz#b58b010ac40c22c5657616c8d2c2c02c7bf479ec"
integrity sha512-XRsRjdf+j5ml+y/6GKHPZbrF/8p2Yga0JPtdqTIY2Xe5ohJPD9saDJJLPvp9+NSBprVvevdXZybnj2cv8OEd0A==
python-shell@^3.0.1:
version "3.0.1"
resolved "https://registry.yarnpkg.com/python-shell/-/python-shell-3.0.1.tgz#c3d3b11536e6ebdb8d6a2602482f7180d940bb13"
integrity sha512-TWeotuxe1auhXa5bGRScxnc2J+0r41NBntSa6RYZtMBLtAEsvCboKrEbW6DvASosWQepVkhZZlT3B5Ei766G+Q==
queue-microtask@^1.2.2:
version "1.2.3"
resolved "https://registry.yarnpkg.com/queue-microtask/-/queue-microtask-1.2.3.tgz#4929228bbc724dfac43e0efb058caf7b6cfb6243"

Loading…
Cancel
Save