Using Chinese with Orama
At the time of writing, Orama supports Chinese (Mandarin) via a custom tokenizer, which is part of the @orama/tokenizers
package.
To get started, make sure to install all the dependencies you need:
```sh
npm i @orama/orama @orama/tokenizers
```
If you want to add Mandarin stop-words as well, install the @orama/stopwords
package too:
```sh
npm i @orama/stopwords
```
Now you're ready to get started with Orama:
js
import { create, insert, search } from '@orama/orama'
// Each language-specific tokenizer is published under its own subpath of
// @orama/tokenizers, so the Mandarin one is imported from '/mandarin'
// (mirroring the '@orama/stopwords/mandarin' import below).
import { createTokenizer } from '@orama/tokenizers/mandarin'
import { stopwords as mandarinStopwords } from '@orama/stopwords/mandarin'

// Create a database whose only indexed field is `name`, replacing the default
// tokenizer with the Mandarin tokenizer configured with Mandarin stop-words.
const db = await create({
  schema: {
    name: 'string'
  },
  components: {
    tokenizer: await createTokenizer({
      stopWords: mandarinStopwords
    })
  }
})

// Sample documents, listed in the order they are inserted.
const placeNames = [
  '北京', // Beijing
  '上海', // Shanghai
  '广州', // Guangzhou
  '深圳', // Shenzhen
  '成都', // Chengdu
  '杭州', // Hangzhou
  '南京', // Nanjing
  '北京大学', // Peking University
  '上海交通大学', // Shanghai Jiao Tong University
  '广州中医药大学' // Guangzhou University of Chinese Medicine
]

// Insert one document at a time (awaiting each call) so the insertion order
// is exactly the order of the list above.
for (const name of placeNames) {
  await insert(db, { name })
}

// Search for "Guangzhou". NOTE(review): `threshold: 0` is expected to keep
// only documents that match every token of the search term; confirm against
// the Orama threshold documentation.
const results = await search(db, {
  term: '广州',
  threshold: 0
})

console.log(results)
// {
// "elapsed": {
// "raw": 89554625,
// "formatted": "89ms"
// },
// "hits": [
// {
// "id": "36666208-3",
// "score": 4.210224897276653,
// "document": {
// "name": "广州"
// }
// },
// {
// "id": "36666208-10",
// "score": 1.9335268122510698,
// "document": {
// "name": "广州中医药大学"
// }
// }
// ],
// "count": 2
// }