From 725305a2dfa9269190fdaae6d8b214caec8203ac Mon Sep 17 00:00:00 2001
From: MeiMei <30769358+mei23@users.noreply.github.com>
Date: Sat, 19 Sep 2020 09:21:21 +0900
Subject: [PATCH] =?UTF-8?q?MeCab=E3=81=AB=E5=A4=96=E9=83=A8=E3=83=A9?=
 =?UTF-8?q?=E3=82=A4=E3=83=96=E3=83=A9=E3=83=AA=E3=82=92=E4=BD=BF=E3=82=8F?=
 =?UTF-8?q?=E3=81=AA=E3=81=84=E3=82=88=E3=81=86=E3=81=AB=20(#64)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 package.json                 |  2 +-
 src/modules/keyword/index.ts | 20 +++++-------
 src/modules/keyword/mecab.ts | 63 ++++++++++++++++++++++++++++++++++++
 3 files changed, 72 insertions(+), 13 deletions(-)
 create mode 100644 src/modules/keyword/mecab.ts

diff --git a/package.json b/package.json
index 613f2b6..677c489 100644
--- a/package.json
+++ b/package.json
@@ -20,7 +20,7 @@
 		"canvas": "2.6.1",
 		"chalk": "4.1.0",
 		"lokijs": "1.5.11",
-		"mecab-async": "0.1.2",
+		"memory-streams": "0.1.3",
 		"misskey-reversi": "0.0.5",
 		"promise-retry": "2.0.1",
 		"random-seed": "0.3.0",
diff --git a/src/modules/keyword/index.ts b/src/modules/keyword/index.ts
index 2bf422e..43b302d 100644
--- a/src/modules/keyword/index.ts
+++ b/src/modules/keyword/index.ts
@@ -3,7 +3,7 @@ import * as loki from 'lokijs';
 import Module from '../../module';
 import config from '../../config';
 import serifs from '../../serifs';
-const MeCab = require('mecab-async');
+import { mecab } from './mecab';
 
 function kanaToHira(str: string) {
 	return str.replace(/[\u30a1-\u30f6]/g, match => {
@@ -15,7 +15,6 @@ function kanaToHira(str: string) {
 export default class extends Module {
 	public readonly name = 'keyword';
 
-	private tokenizer: any;
 	private learnedKeywords: loki.Collection<{
 		keyword: string;
 		learnedAt: number;
@@ -29,9 +28,6 @@ export default class extends Module {
 			indices: ['userId']
 		});
 
-		this.tokenizer = new MeCab();
-		this.tokenizer.command = config.mecab;
-
 		setInterval(this.learn, 1000 * 60 * 60);
 
 		return {};
@@ -50,13 +46,13 @@ export default class extends Module {
 
 		let keywords: string[][] = [];
 
-		await Promise.all(interestedNotes.map(note => new Promise((res, rej) => {
-			this.tokenizer.parse(note.text, (err, tokens) => {
-				const keywordsInThisNote = tokens.filter(token => token[2] == '固有名詞' && token[8] != null);
-				keywords = keywords.concat(keywordsInThisNote);
-				res();
-			});
-		})));
+		for (const note of interestedNotes) {
+			const tokens = await mecab(note.text, config.mecab);
+			const keywordsInThisNote = tokens.filter(token => token[2] == '固有名詞' && token[8] != null);
+			keywords = keywords.concat(keywordsInThisNote);
+		}
+
+		if (keywords.length === 0) return;
 
 		const rnd = Math.floor((1 - Math.sqrt(Math.random())) * keywords.length);
 		const keyword = keywords.sort((a, b) => a[0].length < b[0].length ? 1 : -1)[rnd];
diff --git a/src/modules/keyword/mecab.ts b/src/modules/keyword/mecab.ts
new file mode 100644
index 0000000..a19abc5
--- /dev/null
+++ b/src/modules/keyword/mecab.ts
@@ -0,0 +1,63 @@
+import { spawn } from 'child_process';
+import * as util from 'util';
+import * as stream from 'stream';
+import * as memoryStreams from 'memory-streams';
+import { EOL } from 'os';
+
+const pipeline = util.promisify(stream.pipeline);
+
+/**
+ * Run MeCab
+ * @param text Text to analyze
+ * @param mecab mecab bin
+ * @param dic mecab dictionary path
+ * @returns One array per output line up to EOS: the part before the tab, then the comma-separated fields after it
+ */
+export async function mecab(text: string, mecab = 'mecab', dic?: string): Promise<string[][]> {
+	const args: string[] = [];
+	if (dic) args.push('-d', dic);
+
+	// Collapse every whitespace character (newlines included) so the text is sent as a single line
+	const lines = await cmd(mecab, args, `${text.replace(/\s/g, ' ')}\n`);
+
+	const results: string[][] = [];
+
+	for (const line of lines) {
+		if (line === 'EOS') break;
+		// No output at all splits into a single empty string; do not report it as a token
+		if (line === '') continue;
+		const [word, value = ''] = line.split('\t');
+		const array = value.split(',');
+		array.unshift(word);
+		results.push(array);
+	}
+
+	return results;
+}
+
+/**
+ * Run a command, feed it stdin and collect its stdout
+ * @param command Executable to spawn
+ * @param args Command line arguments
+ * @param stdin Text written to the process's standard input
+ * @returns stdout split into lines
+ */
+export async function cmd(command: string, args: string[], stdin: string): Promise<string[]> {
+	const child = spawn(command, args);
+
+	// An 'error' event without a listener is thrown as an uncaught exception,
+	// so turn spawn failures (e.g. missing binary) and stdin write failures into a rejection
+	const failure = new Promise<never>((_, reject) => {
+		child.once('error', reject);
+		child.stdin.once('error', reject);
+	});
+
+	const writable = new memoryStreams.WritableStream();
+
+	child.stdin.write(stdin);
+	child.stdin.end();
+
+	await Promise.race([pipeline(child.stdout, writable), failure]);
+
+	return writable.toString().split(EOL);
+}