diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c9b731d7..34c74d6b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -40,16 +40,13 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] - python-version: [3.9] + lindera: ["", "_lindera"] + python-version: ["3.10"] include: - os: ubuntu-latest python-version: "3.12" - os: ubuntu-latest python-version: "3.11" - - os: ubuntu-latest - python-version: "3.10" - - os: ubuntu-latest - python-version: 3.8 runs-on: "${{ matrix.os }}" steps: - name: Harden Runner @@ -67,6 +64,7 @@ jobs: index.crates.io:443 static.rust-lang.org:443 objects.githubusercontent.com:443 + download.johtani.info:443 - uses: actions/checkout@61b9e3751b92087fd0b06925ba6dd6314e06f089 - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # 5.0.0 @@ -79,4 +77,4 @@ jobs: - uses: Swatinem/rust-cache@23bce251a8cd2ffc3c1075eaa2367cf899916d84 # 2.7.3 - run: python3 -m pip install nox - - run: nox -s test-${{ matrix.python-version }} + - run: nox -s test${{matrix.lindera}}-${{ matrix.python-version }} diff --git a/Cargo.lock b/Cargo.lock index 791bf53d..5793ec0d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,23 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "aes" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0" +dependencies = [ + "cfg-if", + "cipher", + "cpufeatures", +] + [[package]] name = "ahash" version = "0.8.6" @@ -44,6 +61,12 @@ dependencies = [ "libc", ] +[[package]] +name = "anyhow" +version = "1.0.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "080e9890a082662b09c1ad45f567faeeb47f22b5fb23895fbe1e651e718e25ca" + [[package]] name = "arc-swap" version = "1.6.0" @@ -73,6 +96,21 @@ version = "0.21.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "35636a1494ede3b646cc98f74f8e62c773a38a659ebc777a2cf26b9b74171df9" +[[package]] +name = "base64ct" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" + +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -94,6 +132,15 @@ dependencies = [ "crunchy", ] +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + [[package]] name = "bumpalo" version = "3.14.0" @@ -106,6 +153,27 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" +[[package]] +name = "bzip2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" +dependencies = [ + "bzip2-sys", + "libc", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.11+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "cc" version = "1.0.83" @@ -139,15 +207,40 @@ dependencies = [ "js-sys", "num-traits", "wasm-bindgen", - "windows-targets", + "windows-targets 0.48.5", +] + +[[package]] +name = "cipher" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" +dependencies = [ + "crypto-common", + "inout", ] +[[package]] +name = "constant_time_eq" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" + [[package]] name = "core-foundation-sys" version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" +[[package]] +name = "cpufeatures" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" +dependencies = [ + "libc", +] + [[package]] name = "crc32fast" version = "1.3.2" @@ -206,6 +299,37 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "csv" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" +dependencies = [ + "memchr", +] + [[package]] name = "deranged" version = "0.3.9" @@ -216,6 +340,17 @@ dependencies = [ "serde", ] +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", + "subtle", +] + [[package]] name = "downcast-rs" version = "1.2.0" @@ -228,14 +363,109 @@ version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" +[[package]] +name = "encoding" +version = "0.2.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" +dependencies = [ + "encoding-index-japanese", + "encoding-index-korean", + "encoding-index-simpchinese", + "encoding-index-singlebyte", + "encoding-index-tradchinese", +] + +[[package]] +name = "encoding-index-japanese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-korean" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-simpchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-singlebyte" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-tradchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding_index_tests" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" + +[[package]] +name = "encoding_rs" +version = "0.8.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7268b386296a025e474d5140678f75d6de9493ae55a5d709eeb9dd08149945e1" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "encoding_rs_io" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cc3c5651fb62ab8aa3103998dade57efdd028544bd300516baa31840c252a83" +dependencies = [ + "encoding_rs", +] + +[[package]] +name = "env_logger" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cd405aab171cb85d6735e5c8d9db038c17d3ca007a4d2c25f337935c3d90580" +dependencies = [ + "humantime", + "is-terminal", + "log", + "regex", + "termcolor", +] + [[package]] name = "errno" -version = "0.3.5" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -250,12 +480,43 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" +[[package]] +name = "filetime" +version = "0.2.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ee447700ac8aa0b2f2bd7bc4462ad686ba06baa6727ac149a2d6277f0d240fd" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "windows-sys 0.52.0", +] + +[[package]] +name = "flate2" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + [[package]] name = "fnv" version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "form_urlencoded" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" +dependencies = [ + "percent-encoding", +] + [[package]] name = "fs4" version = "0.6.6" @@ -263,7 +524,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2eeb4ed9e12f43b7fa0baae3f9cdda28352770132ef2e09a23760c29cae8bd47" dependencies = [ "rustix", - "windows-sys", + "windows-sys 0.48.0", ] [[package]] @@ -368,6 +629,16 @@ dependencies = [ "windows", ] +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "getrandom" version = "0.2.10" @@ -379,6 +650,12 @@ dependencies = [ "wasi", ] +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + [[package]] name = "hashbrown" version = "0.14.2" @@ -401,12 +678,27 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7" +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + [[package]] name = "htmlescape" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9025058dae765dee5070ec375f591e2ba14638c63feff74f13805a72e523163" +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + [[package]] name = "iana-time-zone" version = "0.1.58" @@ -430,12 +722,31 @@ dependencies = [ "cc", ] +[[package]] +name = "idna" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" +dependencies = [ + "unicode-bidi", + "unicode-normalization", +] + [[package]] name = "indoc" version = "2.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e186cfbae8084e513daff4240b4797e342f988cecda4fb6c939150f96315fd8" +[[package]] +name = "inout" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0c10553d664a4d0bcff9f4215d0aac67a639cc68ef660840afe309b807bc9f5" +dependencies = [ + "generic-array", +] + [[package]] name = "instant" version = "0.1.12" @@ -448,6 +759,17 @@ dependencies = [ "web-sys", ] +[[package]] +name = "is-terminal" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys 0.52.0", +] + [[package]] name = "itertools" version = "0.11.0" @@ -504,15 +826,273 @@ checksum = "0c2cdeb66e45e9f36bfad5bbdb4d2384e70936afbee843c6f6543f0c551ebb25" [[package]] name = "libc" -version = "0.2.149" +version = "0.2.153" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" + +[[package]] +name = "lindera-cc-cedict" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b6bf27b9e3d76effb160531cda3d11c965616600459a6e17adc61aada8a3df2" +dependencies = [ + "bincode", + "byteorder", + "encoding", + "lindera-cc-cedict-builder", + "lindera-core", + "lindera-decompress", + "once_cell", + "zip", +] + +[[package]] +name = "lindera-cc-cedict-builder" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a90d23f7cef31c6ab7ac0d4f3b23940754207f7b5a80b080c39193caffe99ac2" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + "encoding", + "env_logger", + "glob", + "lindera-compress", + "lindera-core", + "lindera-decompress", + "log", + "yada", +] + +[[package]] +name = "lindera-compress" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1927b7d2bd4ffc19e07691bf8609722663c341f80260a1c636cee8f1ec420dce" +dependencies = [ + "anyhow", + "flate2", + "lindera-decompress", +] + +[[package]] +name = "lindera-core" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3299caa2b81c9a076535a4651a83bf7d624c15f2349f243187fffc64b5a78251" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "encoding_rs", + "log", + "once_cell", + "serde", + "thiserror", + "yada", +] + +[[package]] +name = "lindera-decompress" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b82b8d2323a67dc8ff0c40751d199b7ba94cd5e3c13a5b31622d318acc79e5b" +dependencies = [ + "anyhow", + "flate2", + "serde", +] + +[[package]] +name = "lindera-dictionary" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" +checksum = "cddf783b459d54b130d956889bec052c25fcb478a304e03fa9b2289387572bc5" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "lindera-cc-cedict", + "lindera-cc-cedict-builder", + "lindera-core", + "lindera-ipadic", + "lindera-ipadic-builder", + "lindera-ipadic-neologd-builder", + "lindera-ko-dic", + "lindera-ko-dic-builder", + "lindera-unidic", + "lindera-unidic-builder", + "serde", +] + +[[package]] +name = "lindera-ipadic" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b21c060c9309d29ac7e5c8fb7309c85dbf31b60f2fc4236f9dcda17854131276" +dependencies = [ + "bincode", + "byteorder", + "encoding", + "flate2", + "lindera-core", + "lindera-decompress", + "lindera-ipadic-builder", + "once_cell", + "tar", +] + +[[package]] +name = "lindera-ipadic-builder" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27c708f08f14b0806f6c4cce5324b4bcba27209463026b78c31f399f8be9d30d" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + "encoding_rs", + "encoding_rs_io", + "env_logger", + "glob", + "lindera-compress", + "lindera-core", + "lindera-decompress", + "log", + "serde", + "yada", +] + +[[package]] +name = "lindera-ipadic-neologd-builder" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5e67eb91652203d202f7d27ead220d1d8c9099552709b8429eae9c70f2312fb" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + "encoding_rs", + "encoding_rs_io", + "env_logger", + "glob", + "lindera-core", + "lindera-decompress", + "log", + "serde", + "yada", +] + +[[package]] +name = "lindera-ko-dic" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d45da8d9a5888f4d4e78bb29fc82ff9ae519962efb0d2d92343b6cf8e373952f" +dependencies = [ + "bincode", + "byteorder", + "encoding", + "flate2", + "lindera-core", + "lindera-decompress", + "lindera-ko-dic-builder", + "once_cell", + "tar", +] + +[[package]] +name = "lindera-ko-dic-builder" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41c0933295dc945178bbc08f34111dc3ef22bfee38820f78453c8f8d4f3463d1" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + "encoding", + "env_logger", + "glob", + "lindera-compress", + "lindera-core", + "lindera-decompress", + "log", + "yada", +] + +[[package]] +name = "lindera-tantivy" +version = "0.27.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9adeeed94c1a9e22aabe7e44f54733492fd1cee9b73b38f80847e506354c6ebf" +dependencies = [ + "lindera-core", + "lindera-dictionary", + "lindera-tokenizer", + "tantivy-tokenizer-api", +] + +[[package]] +name = "lindera-tokenizer" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "348ce9bb3f2e5edc577420b98cca05b2177f3af50ef5ae278a1d8a1351d56197" +dependencies = [ + "bincode", + "byteorder", + "lindera-core", + "lindera-dictionary", + "once_cell", + "serde", + "serde_json", +] + +[[package]] +name = "lindera-unidic" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74022a57c395ed7e213a9cd5833207e3c583145078ee9a164aeaec68b30c9d8e" +dependencies = [ + "bincode", + "byteorder", + "encoding", + "lindera-core", + "lindera-decompress", + "lindera-unidic-builder", + "once_cell", + "ureq", + "zip", +] + +[[package]] +name = "lindera-unidic-builder" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a34e5564ee81af82603cd6a03c3abe6e17cc0ae598bfa5078809f06e59e96e08" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + "encoding", + "env_logger", + "glob", + "lindera-compress", + "lindera-core", + "lindera-decompress", + "log", + "yada", +] [[package]] name = "linux-raw-sys" -version = "0.4.10" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" [[package]] name = "lock_api" @@ -608,6 +1188,15 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" +[[package]] +name = "miniz_oxide" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" +dependencies = [ + "adler", +] + [[package]] name = "murmurhash32" version = "0.3.0" @@ -703,9 +1292,38 @@ dependencies = [ "libc", "redox_syscall", "smallvec", - "windows-targets", + "windows-targets 0.48.5", ] +[[package]] +name = "password-hash" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7676374caaee8a325c9e7a2ae557f216c5563a171d6997b0ef8a65af35147700" +dependencies = [ + "base64ct", + "rand_core", + "subtle", +] + +[[package]] +name = "pbkdf2" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83a0692ec44e4cf1ef28ca317f14f8f07da2d95ec3fa01f86e4467b725e60917" +dependencies = [ + "digest", + "hmac", + "password-hash", + "sha2", +] + +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + [[package]] name = "pin-project-lite" version = "0.2.13" @@ -820,6 +1438,12 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" + [[package]] name = "rayon" version = "1.8.0" @@ -893,6 +1517,20 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" +[[package]] +name = "ring" +version = "0.17.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "688c63d65483050968b2a8937f7995f443e27041a0f7700aa59b0822aedebb74" +dependencies = [ + "cc", + "getrandom", + "libc", + "spin", + "untrusted", + "windows-sys 0.48.0", +] + [[package]] name = "rust-stemmers" version = "1.2.0" @@ -911,15 +1549,37 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] name = "rustix" -version = "0.38.21" +version = "0.38.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b426b0506e5d50a7d8dafcf2e81471400deb602392c7dd110815afb4eaf02a3" +checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" dependencies = [ "bitflags 2.4.1", "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.52.0", +] + +[[package]] +name = "rustls" +version = "0.21.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d5a6813c0759e4609cd494e8e725babae6a2ca7b62a5536a13daaec6fcb7ba" +dependencies = [ + "log", + "ring", + "rustls-webpki", + "sct", +] + +[[package]] +name = "rustls-webpki" +version = "0.101.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" +dependencies = [ + "ring", + "untrusted", ] [[package]] @@ -946,6 +1606,16 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "sct" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" +dependencies = [ + "ring", + "untrusted", +] + [[package]] name = "serde" version = "1.0.190" @@ -977,6 +1647,28 @@ dependencies = [ "serde", ] +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha2" +version = "0.10.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "sharded-slab" version = "0.1.7" @@ -1010,12 +1702,24 @@ version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "942b4a808e05215192e39f4ab80813e599068285906cc91aa64f923db842bd5a" +[[package]] +name = "spin" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" + [[package]] name = "stable_deref_trait" version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +[[package]] +name = "subtle" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" + [[package]] name = "syn" version = "2.0.38" @@ -1035,6 +1739,9 @@ dependencies = [ "chrono", "futures", "itertools 0.12.0", + "lindera-core", + "lindera-dictionary", + "lindera-tantivy", "pyo3", "pyo3-build-config", "pythonize", @@ -1161,7 +1868,7 @@ checksum = "fc0c1bb43e5e8b8e05eb8009610344dbf285f06066c844032fbb3e546b3c71df" dependencies = [ "tantivy-common", "tantivy-fst", - "zstd", + "zstd 0.12.4", ] [[package]] @@ -1183,6 +1890,17 @@ dependencies = [ "serde", ] +[[package]] +name = "tar" +version = "0.4.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b16afcea1f22891c49a00c751c7b63b2233284064f11a200fc624137c51e2ddb" +dependencies = [ + "filetime", + "libc", + "xattr", +] + [[package]] name = "target-lexicon" version = "0.12.12" @@ -1199,7 +1917,16 @@ dependencies = [ "fastrand", "redox_syscall", "rustix", - "windows-sys", + "windows-sys 0.48.0", +] + +[[package]] +name = "termcolor" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755" +dependencies = [ + "winapi-util", ] [[package]] @@ -1261,6 +1988,21 @@ dependencies = [ "time-core", ] +[[package]] +name = "tinyvec" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + [[package]] name = "tracing" version = "0.1.40" @@ -1322,18 +2064,71 @@ dependencies = [ "tracing-log", ] +[[package]] +name = "typenum" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" + +[[package]] +name = "unicode-bidi" +version = "0.3.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" + [[package]] name = "unicode-ident" version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +[[package]] +name = "unicode-normalization" +version = "0.1.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" +dependencies = [ + "tinyvec", +] + [[package]] name = "unindent" version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "ureq" +version = "2.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8cdd25c339e200129fe4de81451814e5228c9b771d57378817d6117cc2b3f97" +dependencies = [ + "base64", + "log", + "once_cell", + "rustls", + "rustls-webpki", + "url", + "webpki-roots", +] + +[[package]] +name = "url" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] + [[package]] name = "utf8-ranges" version = "1.0.5" @@ -1432,6 +2227,12 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "webpki-roots" +version = "0.25.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f20c57d8d7db6d3b86154206ae5d8fba62dd39573114de97c2cb0578251f8e1" + [[package]] name = "winapi" version = "0.3.9" @@ -1448,6 +2249,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +[[package]] +name = "winapi-util" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" +dependencies = [ + "winapi", +] + [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" @@ -1460,7 +2270,7 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" dependencies = [ - "windows-targets", + "windows-targets 0.48.5", ] [[package]] @@ -1469,7 +2279,7 @@ version = "0.51.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1f8cf84f35d2db49a46868f947758c7a1138116f7fac3bc844f43ade1292e64" dependencies = [ - "windows-targets", + "windows-targets 0.48.5", ] [[package]] @@ -1478,7 +2288,16 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows-targets", + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.0", ] [[package]] @@ -1487,13 +2306,28 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" +dependencies = [ + "windows_aarch64_gnullvm 0.52.0", + "windows_aarch64_msvc 0.52.0", + "windows_i686_gnu 0.52.0", + "windows_i686_msvc 0.52.0", + "windows_x86_64_gnu 0.52.0", + "windows_x86_64_gnullvm 0.52.0", + "windows_x86_64_msvc 0.52.0", ] [[package]] @@ -1502,42 +2336,101 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" + [[package]] name = "windows_aarch64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" + [[package]] name = "windows_i686_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" +[[package]] +name = "windows_i686_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" + [[package]] name = "windows_i686_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" +[[package]] +name = "windows_i686_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" + [[package]] name = "windows_x86_64_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" + [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" + [[package]] name = "windows_x86_64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" + +[[package]] +name = "xattr" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8da84f1a25939b27f6820d92aed108f83ff920fdf11a7b19366c27c4cda81d4f" +dependencies = [ + "libc", + "linux-raw-sys", + "rustix", +] + +[[package]] +name = "yada" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6d12cb7a57bbf2ab670ed9545bae3648048547f9039279a89ce000208e585c1" + [[package]] name = "zerocopy" version = "0.7.31" @@ -1558,13 +2451,52 @@ dependencies = [ "syn", ] +[[package]] +name = "zip" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "760394e246e4c28189f19d488c058bf16f564016aefac5d32bb1f3b51d5e9261" +dependencies = [ + "aes", + "byteorder", + "bzip2", + "constant_time_eq", + "crc32fast", + "crossbeam-utils", + "flate2", + "hmac", + "pbkdf2", + "sha1", + "time", + "zstd 0.11.2+zstd.1.5.2", +] + +[[package]] +name = "zstd" +version = "0.11.2+zstd.1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4" +dependencies = [ + "zstd-safe 5.0.2+zstd.1.5.2", +] + [[package]] name = "zstd" version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c" dependencies = [ - "zstd-safe", + "zstd-safe 6.0.6", +] + +[[package]] +name = "zstd-safe" +version = "5.0.2+zstd.1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db" +dependencies = [ + "libc", + "zstd-sys", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index e8c7dd17..a7a0931e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,14 @@ futures = "0.3.26" pythonize = "0.20.0" serde = "1.0" serde_json = "1.0.91" +# Lindera +lindera-core = { version = "0.27.2", optional = true } +lindera-dictionary = { version = "0.27.2", optional = true } +lindera-tantivy = { version = "0.27.1", optional = true, features = ["unidic-compress", "unidic", "ko-dic-compress", "cc-cedict-compress", "ko-dic", "cc-cedict", "ipadic", "ipadic-compress"] } [dependencies.pyo3] version = "0.20.0" features = ["chrono", "extension-module"] + +[features] +lindera = ["lindera-core", "lindera-dictionary", "lindera-tantivy"] diff --git a/noxfile.py b/noxfile.py index 61652492..ed220aa2 100644 --- a/noxfile.py +++ b/noxfile.py @@ -5,4 +5,17 @@ def test(session): session.install("-rrequirements-dev.txt") session.install("-e", ".", "--no-build-isolation") + session.run("pytest", "-m", "not lindera", *session.posargs) + + +@nox.session(python=["3.8", "3.9", "3.10", "3.11", "3.12"]) +def test_lindera(session): + session.install("-rrequirements-dev.txt") + session.install( + "--no-build-isolation", + '--config-settings', + 'build-args="--features=lindera"', + "-e", + ".", + ) session.run("pytest", *session.posargs) diff --git a/pyproject.toml b/pyproject.toml index 9383f3e0..c8d7791e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,9 @@ dev = [ bindings = "pyo3" [tool.pytest.ini_options] +markers = [ + "lindera: mark a test as requiring lindera", +] # Set the durations option and doctest modules # See https://docs.pytest.org/en/latest/usage.html#durations addopts = "--doctest-modules --durations=10" diff --git a/src/index.rs b/src/index.rs index 55780dbe..5a0f7dde 100644 --- a/src/index.rs +++ b/src/index.rs @@ -163,20 +163,20 @@ impl IndexWriter { Value::Bytes(_) => { return Err(exceptions::PyValueError::new_err(format!( "Field `{field_name}` is bytes type not deletable." - ))) + ))); } Value::PreTokStr(_pretok) => { return Err(exceptions::PyValueError::new_err(format!( "Field `{field_name}` is pretokenized. This is not authorized for delete." - ))) + ))); } Value::JsonObject(_) => { return Err(exceptions::PyValueError::new_err(format!( "Field `{field_name}` is json object type not deletable." - ))) - }, + ))); + } Value::Bool(b) => Term::from_field_bool(field, b), - Value::IpAddr(i) => Term::from_field_ip_addr(field, i) + Value::IpAddr(i) => Term::from_field_ip_addr(field, i), }; Ok(self.inner()?.delete_term(term)) } @@ -246,6 +246,70 @@ impl Index { Ok(Index { index, reader }) } + /// Register the lindera tokenizer + /// + /// This will only be available if tantivy-py was built with the "lindera" + /// feature. Please see the documentation for how to do this. + /// Args: + /// tokenizer_name (str): The name of the tokenizer. Example: "lang_ja" + /// mode (Optional[LNormal, LDecompose]): The mode of the tokenizer. + /// If not provided, the mode will be `Normal`. These modes are + /// documented [in the lindera documentation](https://docs.rs/lindera-core/latest/lindera_core/mode/enum.Mode.html#). + /// To provide these from Python code, please provide an instance of + /// either the + /// `tantivy.tantivy.LNormal` or `tantivy.tantivy.LDecompose` classes. + /// Example: + /// ```python + /// from tantivy import Index + /// from tantivy.lindera import LNormal, LDecompose, LinderaDictionaryKind + /// ... + /// index = Index(schema) + /// index.register_lindera_tokenizer( + /// "lang_ja", + /// LNormal(), + /// LinderaDictionaryKind.IPADIC, + /// ) + /// ``` + /// dictionary_kind (LinderaDictionaryKind): The dictionary kind of the + /// tokenizer. This is an enum with the following possible values: + /// - `LinderaDictionaryKind.IPADIC` + /// - `LinderaDictionaryKind.IPADIC`, + /// - `LinderaDictionaryKind.IPADICNEologd`, + /// - `LinderaDictionaryKind.UniDic`, + /// - `LinderaDictionaryKind.KoDic`, + /// - `LinderaDictionaryKind.CcCedict`, + #[cfg(feature = "lindera")] + #[pyo3(signature = (tokenizer_name, mode, dictionary_kind))] + fn register_lindera_tokenizer( + &self, + tokenizer_name: String, + mode: Option<&PyAny>, + dictionary_kind: crate::lindera_tokenizer::LinderaDictionaryKind, + ) -> PyResult<()> { + use crate::lindera_tokenizer::{LDecompose, LNormal}; + + let mode = match mode { + None => lindera_core::mode::Mode::Normal, + Some(mode) => { + if let Ok(obj) = mode.extract::() { + obj.into() + } else if let Ok(obj) = mode.extract::() { + obj.into() + } else { + return Err(exceptions::PyTypeError::new_err( + "Invalid mode, valid choices are: 'normal' and 'decompose'" + )); + } + } + }; + let tokenizer = crate::lindera_tokenizer::create_tokenizer( + mode, + dictionary_kind.into(), + ); + self.index.tokenizers().register(&tokenizer_name, tokenizer); + Ok(()) + } + /// Create a `IndexWriter` for the index. /// /// The writer will be multithreaded and the provided heap size will be diff --git a/src/lib.rs b/src/lib.rs index 2bf9e3ec..8d861695 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,6 +5,8 @@ use pyo3::{exceptions, prelude::*, wrap_pymodule}; mod document; mod facet; mod index; +#[cfg(feature = "lindera")] +mod lindera_tokenizer; mod parser_error; mod query; mod schema; @@ -90,6 +92,19 @@ fn tantivy(_py: Python, m: &PyModule) -> PyResult<()> { m.add_wrapped(wrap_pymodule!(query_parser_error))?; + #[cfg(feature = "lindera")] + m.add_wrapped(wrap_pymodule!(lindera))?; + + Ok(()) +} + +#[cfg(feature = "lindera")] +#[pymodule] +fn lindera(_py: Python, m: &PyModule) -> PyResult<()> { + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; Ok(()) } diff --git a/src/lindera_tokenizer.rs b/src/lindera_tokenizer.rs new file mode 100644 index 00000000..4be50e25 --- /dev/null +++ b/src/lindera_tokenizer.rs @@ -0,0 +1,141 @@ +use lindera_core::mode::Mode; +use lindera_dictionary::{ + load_dictionary_from_config, DictionaryConfig, DictionaryKind, +}; +use lindera_tantivy::tokenizer::LinderaTokenizer; +use pyo3::{pyclass, pymethods, FromPyObject}; + +#[pyclass] +#[derive(Clone)] +pub enum LinderaDictionaryKind { + IPADIC, + IPADICNEologd, + UniDic, + KoDic, + CcCedict, +} + +impl From for DictionaryKind { + fn from(kind: LinderaDictionaryKind) -> Self { + match kind { + LinderaDictionaryKind::IPADIC => DictionaryKind::IPADIC, + LinderaDictionaryKind::IPADICNEologd => { + DictionaryKind::IPADICNEologd + } + LinderaDictionaryKind::UniDic => DictionaryKind::UniDic, + LinderaDictionaryKind::KoDic => DictionaryKind::KoDic, + LinderaDictionaryKind::CcCedict => DictionaryKind::CcCedict, + } + } +} + +#[pyclass(get_all, set_all)] +#[derive(Clone)] +pub struct LinderaModeDecomposePenalty { + kanji_penalty_length_threshold: usize, + kanji_penalty_length_penalty: i32, + other_penalty_length_threshold: usize, + other_penalty_length_penalty: i32, +} + +impl Default for LinderaModeDecomposePenalty { + fn default() -> Self { + LinderaModeDecomposePenalty { + kanji_penalty_length_threshold: 2, + kanji_penalty_length_penalty: 3000, + other_penalty_length_threshold: 7, + other_penalty_length_penalty: 1700, + } + } +} + +#[pymethods] +impl LinderaModeDecomposePenalty { + #[new] + #[pyo3(signature = ( + kanji_penalty_length_threshold = 2, + kanji_penalty_length_penalty = 3000, + other_penalty_length_threshold = 7, + other_penalty_length_penalty = 1700 + ))] + pub fn new( + kanji_penalty_length_threshold: usize, + kanji_penalty_length_penalty: i32, + other_penalty_length_threshold: usize, + other_penalty_length_penalty: i32, + ) -> Self { + LinderaModeDecomposePenalty { + kanji_penalty_length_threshold, + kanji_penalty_length_penalty, + other_penalty_length_threshold, + other_penalty_length_penalty, + } + } +} + +impl From for lindera_core::mode::Penalty { + fn from(penalty: LinderaModeDecomposePenalty) -> Self { + lindera_core::mode::Penalty { + kanji_penalty_length_threshold: penalty + .kanji_penalty_length_threshold, + kanji_penalty_length_penalty: penalty.kanji_penalty_length_penalty, + other_penalty_length_threshold: penalty + .other_penalty_length_threshold, + other_penalty_length_penalty: penalty.other_penalty_length_penalty, + } + } +} + +#[pyclass(frozen)] +#[derive(Clone)] +pub struct LNormal {} + +impl From for Mode { + fn from(_: LNormal) -> Self { + Mode::Normal + } +} + +#[pymethods] +impl LNormal { + #[new] + pub fn new() -> Self { + LNormal {} + } +} + +#[pyclass(frozen, get_all)] +#[derive(Clone)] +pub struct LDecompose { + penalty: LinderaModeDecomposePenalty, +} + +impl From for Mode { + fn from(decompose: LDecompose) -> Self { + Mode::Decompose(decompose.penalty.into()) + } +} + +#[pymethods] +impl LDecompose { + #[new] + pub fn new(penalty: Option) -> Self { + LDecompose { + penalty: penalty.unwrap_or_default(), + } + } +} + +pub fn create_tokenizer( + mode: Mode, + dictionary_kind: DictionaryKind, +) -> LinderaTokenizer { + let dictionary_config = DictionaryConfig { + kind: Some(dictionary_kind.into()), + path: None, + }; + let dictionary = load_dictionary_from_config(dictionary_config).unwrap(); + let tokenizer = LinderaTokenizer::new(dictionary, None, mode); + + tokenizer +} diff --git a/tests/test_lindera.py b/tests/test_lindera.py new file mode 100644 index 00000000..132b2bc4 --- /dev/null +++ b/tests/test_lindera.py @@ -0,0 +1,35 @@ +import pytest +pytestmark = pytest.mark.lindera + +from tantivy import SchemaBuilder, Index, Document + + +@pytest.mark.parametrize("mode", [ + "normal", + "decompose", +]) +def test_basic(mode): + # The import is here so that the non-lindera tests + # can run without lindera installed. + from tantivy import lindera + + if mode == "normal": + mode = lindera.LNormal() + else: + mode = lindera.LDecompose() + + sb = SchemaBuilder() + sb.add_text_field("title", stored=True, tokenizer_name="lang_ja") + schema = sb.build() + index = Index(schema) + index.register_lindera_tokenizer( + "lang_ja", + mode, + lindera.LinderaDictionaryKind.IPADIC, + ) + writer = index.writer(50_000_000) + doc = Document() + doc.add_text("title", "成田国際空港") + writer.add_document(doc) + writer.commit() + index.reload()