MeCabに外部ライブラリを使わないように (#64)

This commit is contained in:
MeiMei 2020-09-19 09:21:21 +09:00 committed by GitHub
parent a2e4bd87fb
commit 725305a2df
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 54 additions and 13 deletions

View file

@ -20,7 +20,7 @@
"canvas": "2.6.1",
"chalk": "4.1.0",
"lokijs": "1.5.11",
"mecab-async": "0.1.2",
"memory-streams": "0.1.3",
"misskey-reversi": "0.0.5",
"promise-retry": "2.0.1",
"random-seed": "0.3.0",

View file

@ -3,7 +3,7 @@ import * as loki from 'lokijs';
import Module from '../../module';
import config from '../../config';
import serifs from '../../serifs';
const MeCab = require('mecab-async');
import { mecab } from './mecab';
function kanaToHira(str: string) {
return str.replace(/[\u30a1-\u30f6]/g, match => {
@ -15,7 +15,6 @@ function kanaToHira(str: string) {
export default class extends Module {
public readonly name = 'keyword';
private tokenizer: any;
private learnedKeywords: loki.Collection<{
keyword: string;
learnedAt: number;
@ -29,9 +28,6 @@ export default class extends Module {
indices: ['userId']
});
this.tokenizer = new MeCab();
this.tokenizer.command = config.mecab;
setInterval(this.learn, 1000 * 60 * 60);
return {};
@ -50,13 +46,13 @@ export default class extends Module {
let keywords: string[][] = [];
await Promise.all(interestedNotes.map(note => new Promise((res, rej) => {
this.tokenizer.parse(note.text, (err, tokens) => {
for (const note of interestedNotes) {
const tokens = await mecab(note.text, config.mecab);
const keywordsInThisNote = tokens.filter(token => token[2] == '固有名詞' && token[8] != null);
keywords = keywords.concat(keywordsInThisNote);
res();
});
})));
}
if (keywords.length === 0) return;
const rnd = Math.floor((1 - Math.sqrt(Math.random())) * keywords.length);
const keyword = keywords.sort((a, b) => a[0].length < b[0].length ? 1 : -1)[rnd];

View file

@ -0,0 +1,45 @@
import { spawn } from 'child_process';
import * as util from 'util';
import * as stream from 'stream';
import * as memoryStreams from 'memory-streams';
import { EOL } from 'os';
// Promise-returning wrapper around `stream.pipeline`, so it can be awaited.
const pipeline = util.promisify(stream.pipeline);
/**
 * Run MeCab on a piece of text and return the parsed output lines.
 *
 * Every whitespace character in `text` (including newlines and tabs) is
 * replaced by a plain space, so the whole input is sent as a single line.
 *
 * @param text Text to analyze
 * @param mecab Name or path of the MeCab executable
 * @param dic Optional MeCab dictionary path, passed to the command via `-d`
 * @returns One array per output line preceding `EOS`: the part before the tab
 *          comes first, followed by the comma-separated fields after the tab
 */
export async function mecab(text: string, mecab = 'mecab', dic?: string): Promise<string[][]> {
	const args: string[] = [];
	if (dic) args.push('-d', dic);

	// `\s` already matches `\n` and `\t`, so a single class is enough.
	const lines = await cmd(mecab, args, `${text.replace(/\s/g, ' ')}\n`);

	const results: string[][] = [];
	for (const line of lines) {
		// `EOS` terminates the analysis of the (single) input line.
		if (line === 'EOS') break;

		// A blank line (e.g. the command printed nothing) carries no token;
		// skip it rather than emitting a bogus `['', '']` entry.
		if (line === '') continue;

		const [word, value = ''] = line.split('\t');
		results.push([word, ...value.split(',')]);
	}

	return results;
}
/**
 * Spawn a command, feed it `stdin`, and return its standard output split
 * into lines on the platform's end-of-line sequence.
 *
 * The exit code of the process is not inspected.
 *
 * @param command Executable to run
 * @param args Arguments passed to the executable
 * @param stdin Text written to the process's standard input before closing it
 * @returns The captured standard output, split on `os.EOL`
 * @throws Rejects if the process cannot be spawned, if writing to its stdin
 *         fails, or if reading its stdout fails
 */
export async function cmd(command: string, args: string[], stdin: string): Promise<string[]> {
	const child = spawn(command, args);
	const writable = new memoryStreams.WritableStream();

	// Without listeners, a spawn failure (e.g. ENOENT) or a broken stdin pipe
	// is emitted as an unhandled 'error' event and takes the whole process
	// down. Turn those events into a rejection of this call instead.
	const failed = new Promise<never>((_resolve, reject) => {
		child.on('error', reject);
		child.stdin.on('error', reject);
	});

	child.stdin.end(stdin);

	// Whichever settles first wins; the race keeps both promises handled, so
	// a late failure never becomes an unhandled rejection.
	await Promise.race([pipeline(child.stdout, writable), failed]);

	return writable.toString().split(EOL);
}