Compare commits

..

No commits in common. "master" and "parallel" have entirely different histories.

6 changed files with 600 additions and 717 deletions

3
.gitignore vendored
View File

@ -1,5 +1,4 @@
/target /target
**/*.rs.bk **/*.rs.bk
baza*.zip baza.zip
/json/ /json/
json.zip

738
Cargo.lock generated
View File

@ -3,31 +3,21 @@
version = 3 version = 3
[[package]] [[package]]
name = "async-compression" name = "adler"
version = "0.4.1" version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62b74f44609f0f91493e3082d3734d98497e094777144380ea4db9f9905dd5b6" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
dependencies = [
"futures-core",
"futures-io",
"memchr",
"pin-project-lite",
"zstd",
"zstd-safe",
]
[[package]] [[package]]
name = "async_zip" name = "aes"
version = "0.0.15" version = "0.7.5"
source = "git+https://github.com/Majored/rs-async-zip?rev=ff0d985#ff0d985ef54cf00d73c497dbca0beea7541e37dc" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e8b47f52ea9bae42228d07ec09eb676433d7c4ed1ebdf0f1d1c29ed446f1ab8"
dependencies = [ dependencies = [
"async-compression", "cfg-if",
"crc32fast", "cipher",
"futures-util", "cpufeatures",
"pin-project", "opaque-debug",
"thiserror",
"tokio",
"tokio-util",
] ]
[[package]] [[package]]
@ -37,10 +27,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]] [[package]]
name = "bitflags" name = "base64ct"
version = "1.3.2" version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" checksum = "8a32fd6af2b5827bce66c29053ba0e7c42b9dcab01835835058558c10851a46b"
[[package]]
name = "block-buffer"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0bf7fe51849ea569fd452f37822f606a5cabb684dc918707a0193fd4664ff324"
dependencies = [
"generic-array",
]
[[package]] [[package]]
name = "byteorder" name = "byteorder"
@ -49,26 +48,31 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
[[package]] [[package]]
name = "bytes" name = "bzip2"
version = "0.4.12" version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "206fdffcfa2df7cbe15601ef46c813fce0965eb3286db6b56c583b814b51c81c" checksum = "6afcd980b5f3a45017c57e57a2fcccbb351cc43a356ce117ef760ef8052b89b0"
dependencies = [ dependencies = [
"byteorder", "bzip2-sys",
"iovec", "libc",
] ]
[[package]] [[package]]
name = "bytes" name = "bzip2-sys"
version = "1.2.1" version = "0.1.11+1.0.8"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec8a7b6a70fde80372154c65702f00a0f56f3e1c36abbc6c440484be248856db" checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc"
dependencies = [
"cc",
"libc",
"pkg-config",
]
[[package]] [[package]]
name = "cc" name = "cc"
version = "1.0.79" version = "1.0.73"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11"
dependencies = [ dependencies = [
"jobserver", "jobserver",
] ]
@ -83,10 +87,35 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
name = "chgk_txt2json" name = "chgk_txt2json"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"async_zip", "encoding",
"futures-util",
"json", "json",
"tokio", "rayon",
"textstream",
"zip",
]
[[package]]
name = "cipher"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ee52072ec15386f770805afd189a01c8841be8696bed250fa2f13c4c0d6dfb7"
dependencies = [
"generic-array",
]
[[package]]
name = "constant_time_eq"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc"
[[package]]
name = "cpufeatures"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59a6001667ab124aebae2a495118e11d30984c3a653e99d86d58971708cf5e4b"
dependencies = [
"libc",
] ]
[[package]] [[package]]
@ -99,61 +128,159 @@ dependencies = [
] ]
[[package]] [[package]]
name = "futures" name = "crossbeam-channel"
version = "0.1.31" version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3a471a38ef8ed83cd6e40aa59c1ffe17db6855c18e3604d9c4ed8c08ebc28678" checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521"
[[package]]
name = "futures-core"
version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c"
[[package]]
name = "futures-io"
version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964"
[[package]]
name = "futures-macro"
version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72"
dependencies = [ dependencies = [
"proc-macro2", "cfg-if",
"quote", "crossbeam-utils",
"syn 2.0.28",
] ]
[[package]] [[package]]
name = "futures-sink" name = "crossbeam-deque"
version = "0.3.28" version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc"
[[package]]
name = "futures-task"
version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65"
[[package]]
name = "futures-util"
version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533"
dependencies = [ dependencies = [
"futures-core", "cfg-if",
"futures-io", "crossbeam-epoch",
"futures-macro", "crossbeam-utils",
"futures-task", ]
"memchr",
"pin-project-lite", [[package]]
"pin-utils", name = "crossbeam-epoch"
"slab", version = "0.9.10"
"tokio-io", source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "045ebe27666471bb549370b4b0b3e51b07f56325befa4284db65fc89c02511b1"
dependencies = [
"autocfg",
"cfg-if",
"crossbeam-utils",
"memoffset",
"once_cell",
"scopeguard",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51887d4adc7b564537b15adcfb307936f8075dfcd5f00dde9a9f1d29383682bc"
dependencies = [
"cfg-if",
"once_cell",
]
[[package]]
name = "crypto-common"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3"
dependencies = [
"generic-array",
"typenum",
]
[[package]]
name = "digest"
version = "0.10.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f2fb860ca6fafa5552fb6d0e816a69c8e49f0908bf524e30a90d97c85892d506"
dependencies = [
"block-buffer",
"crypto-common",
"subtle",
]
[[package]]
name = "either"
version = "1.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f107b87b6afc2a64fd13cac55fe06d6c8859f12d4b14cbcdd2c67d0976781be"
[[package]]
name = "encoding"
version = "0.2.33"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec"
dependencies = [
"encoding-index-japanese",
"encoding-index-korean",
"encoding-index-simpchinese",
"encoding-index-singlebyte",
"encoding-index-tradchinese",
]
[[package]]
name = "encoding-index-japanese"
version = "1.20141219.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91"
dependencies = [
"encoding_index_tests",
]
[[package]]
name = "encoding-index-korean"
version = "1.20141219.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81"
dependencies = [
"encoding_index_tests",
]
[[package]]
name = "encoding-index-simpchinese"
version = "1.20141219.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7"
dependencies = [
"encoding_index_tests",
]
[[package]]
name = "encoding-index-singlebyte"
version = "1.20141219.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a"
dependencies = [
"encoding_index_tests",
]
[[package]]
name = "encoding-index-tradchinese"
version = "1.20141219.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18"
dependencies = [
"encoding_index_tests",
]
[[package]]
name = "encoding_index_tests"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569"
[[package]]
name = "flate2"
version = "1.0.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f82b0f4c27ad9f8bfd1f3208d882da2b09c301bc1c828fd3a00d0216d2fbbff6"
dependencies = [
"crc32fast",
"miniz_oxide",
]
[[package]]
name = "generic-array"
version = "0.14.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bff49e947297f3312447abdca79f45f4738097cc82b06e72054d2223f601f1b9"
dependencies = [
"typenum",
"version_check",
] ]
[[package]] [[package]]
@ -166,19 +293,25 @@ dependencies = [
] ]
[[package]] [[package]]
name = "iovec" name = "hmac"
version = "0.1.4" version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e" checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e"
dependencies = [ dependencies = [
"libc", "digest",
] ]
[[package]] [[package]]
name = "jobserver" name = "itoa"
version = "0.1.26" version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2" checksum = "6c8af84674fe1f223a982c933a0ee1086ac4d4052aa0fb8060c12c6ad838e754"
[[package]]
name = "jobserver"
version = "0.1.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af25a77299a7f711a01975c35a6a424eb6862092cc2d6c72c4ed6cbc56dfc1fa"
dependencies = [ dependencies = [
"libc", "libc",
] ]
@ -191,45 +324,35 @@ checksum = "078e285eafdfb6c4b434e0d31e8cfcb5115b651496faca5749b88fafd4f23bfd"
[[package]] [[package]]
name = "libc" name = "libc"
version = "0.2.132" version = "0.2.131"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8371e4e5341c3a96db127eb2465ac681ced4c433e01dd0e938adbef26ba93ba5" checksum = "04c3b4822ccebfa39c02fc03d1534441b22ead323fa0f48bb7ddd8e6ba076a40"
[[package]]
name = "lock_api"
version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9f80bf5aacaf25cbfc8210d1cfb718f2bf3b11c4c54e5afe36c236853a8ec390"
dependencies = [
"autocfg",
"scopeguard",
]
[[package]]
name = "log"
version = "0.4.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e"
dependencies = [
"cfg-if",
]
[[package]] [[package]]
name = "memchr" name = "memchr"
version = "2.5.0" version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" checksum = "148fab2e51b4f1cfc66da2a7c32981d1d3c083a803978268bb11fe4b86925e7a"
[[package]]
name = "mio"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57ee1c23c7c63b0c9250c339ffdc69255f110b298b901b9f6c82547b7b87caaf"
dependencies = [ dependencies = [
"libc", "libc",
"log", ]
"wasi",
"windows-sys", [[package]]
name = "memoffset"
version = "0.6.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce"
dependencies = [
"autocfg",
]
[[package]]
name = "miniz_oxide"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f5c75688da582b8ffc1f1799e9db273f32133c49e048f614d22ec3256773ccc"
dependencies = [
"adler",
] ]
[[package]] [[package]]
@ -243,97 +366,83 @@ dependencies = [
] ]
[[package]] [[package]]
name = "once_cell" name = "num_threads"
version = "1.14.0" version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2f7254b99e31cad77da24b08ebf628882739a608578bb1bcdfc1f9c21260d7c0" checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44"
[[package]]
name = "parking_lot"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f"
dependencies = [ dependencies = [
"lock_api",
"parking_lot_core",
]
[[package]]
name = "parking_lot_core"
version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09a279cbf25cb0757810394fbc1e359949b59e348145c643a939a525692e6929"
dependencies = [
"cfg-if",
"libc", "libc",
"redox_syscall",
"smallvec",
"windows-sys",
] ]
[[package]] [[package]]
name = "pin-project" name = "once_cell"
version = "1.1.2" version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "030ad2bc4db10a8944cb0d837f158bdfec4d4a4873ab701a95046770d11f8842" checksum = "18a6dbe30758c9f83eb00cbea4ac95966305f5a7772f3f42ebfc7fc7eddbd8e1"
[[package]]
name = "opaque-debug"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5"
[[package]]
name = "password-hash"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d791538a6dcc1e7cb7fe6f6b58aca40e7f79403c45b2bc274008b5e647af1d8"
dependencies = [ dependencies = [
"pin-project-internal", "base64ct",
"rand_core",
"subtle",
] ]
[[package]] [[package]]
name = "pin-project-internal" name = "pbkdf2"
version = "1.1.2" version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec2e072ecce94ec471b13398d5402c188e76ac03cf74dd1a975161b23a3f6d9c" checksum = "271779f35b581956db91a3e55737327a03aa051e90b1c47aeb189508533adfd7"
dependencies = [ dependencies = [
"proc-macro2", "digest",
"quote", "hmac",
"syn 2.0.28", "password-hash",
"sha2",
] ]
[[package]]
name = "pin-project-lite"
version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116"
[[package]]
name = "pin-utils"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]] [[package]]
name = "pkg-config" name = "pkg-config"
version = "0.3.27" version = "0.3.25"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" checksum = "1df8c4ec4b0627e53bdf214615ad287367e482558cf84b109250b37464dc03ae"
[[package]] [[package]]
name = "proc-macro2" name = "rand_core"
version = "1.0.66" version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7"
[[package]]
name = "rayon"
version = "1.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bd99e5772ead8baa5215278c9b15bf92087709e9c1b2d1f97cdb5a183c933a7d"
dependencies = [ dependencies = [
"unicode-ident", "autocfg",
"crossbeam-deque",
"either",
"rayon-core",
] ]
[[package]] [[package]]
name = "quote" name = "rayon-core"
version = "1.0.32" version = "1.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965" checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f"
dependencies = [ dependencies = [
"proc-macro2", "crossbeam-channel",
] "crossbeam-deque",
"crossbeam-utils",
[[package]] "num_cpus",
name = "redox_syscall"
version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
dependencies = [
"bitflags",
] ]
[[package]] [[package]]
@ -343,229 +452,107 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
[[package]] [[package]]
name = "signal-hook-registry" name = "sha1"
version = "1.4.0" version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0" checksum = "c77f4e7f65455545c2153c1253d25056825e77ee2533f0e41deb65a93a34852f"
dependencies = [ dependencies = [
"libc", "cfg-if",
"cpufeatures",
"digest",
] ]
[[package]] [[package]]
name = "slab" name = "sha2"
version = "0.4.8" version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6528351c9bc8ab22353f9d776db39a20288e8d6c37ef8cfe3317cf875eecfc2d" checksum = "55deaec60f81eefe3cce0dc50bda92d6d8e88f2a27df7c5033b42afeb1ed2676"
dependencies = [ dependencies = [
"autocfg", "cfg-if",
"cpufeatures",
"digest",
] ]
[[package]] [[package]]
name = "smallvec" name = "subtle"
version = "1.9.0" version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2fd0db749597d91ff862fd1d55ea87f7855a744a8425a64695b6fca237d1dad1" checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601"
[[package]] [[package]]
name = "socket2" name = "textstream"
version = "0.4.7" version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "02e2d2db9033d13a1567121ddd7a095ee144db4e1ca1b1bda3419bc0da294ebd" checksum = "e7ed81b342f6566026755e7f4b7798810b1c159722e427d212ce72c2c58ffdaa"
dependencies = [ dependencies = [
"libc", "encoding",
"winapi",
]
[[package]]
name = "syn"
version = "1.0.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "58dbef6ec655055e20b86b15a8cc6d439cca19b667537ac6a1369572d151ab13"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "syn"
version = "2.0.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04361975b3f5e348b2189d8dc55bc942f278b2d482a6a0365de5bdd62d351567"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "thiserror"
version = "1.0.34"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8c1b05ca9d106ba7d2e31a9dab4a64e7be2cce415321966ea3132c49a656e252"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.34"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8f2591983642de85c921015f3f070c665a197ed69e417af436115e3a1407487"
dependencies = [
"proc-macro2",
"quote",
"syn 1.0.99",
]
[[package]]
name = "tokio"
version = "1.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89797afd69d206ccd11fb0ea560a44bbb87731d020670e79416d442919257d42"
dependencies = [
"autocfg",
"bytes 1.2.1",
"libc",
"memchr", "memchr",
"mio",
"num_cpus",
"once_cell",
"parking_lot",
"pin-project-lite",
"signal-hook-registry",
"socket2",
"tokio-macros",
"winapi",
] ]
[[package]] [[package]]
name = "tokio-io" name = "time"
version = "0.1.13" version = "0.3.13"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57fc868aae093479e3131e3d165c93b1c7474109d13c90ec0dda2a1bbfff0674" checksum = "db76ff9fa4b1458b3c7f077f3ff9887394058460d21e634355b273aaf11eea45"
dependencies = [ dependencies = [
"bytes 0.4.12", "itoa",
"futures", "libc",
"log", "num_threads",
"time-macros",
] ]
[[package]] [[package]]
name = "tokio-macros" name = "time-macros"
version = "1.8.0" version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9724f9a975fb987ef7a3cd9be0350edcbe130698af5b8f7a631e23d42d052484" checksum = "42657b1a6f4d817cda8e7a0ace261fe0cc946cf3a80314390b22cc61ae080792"
[[package]]
name = "typenum"
version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987"
[[package]]
name = "version_check"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
[[package]]
name = "zip"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf225bcf73bb52cbb496e70475c7bd7a3f769df699c0020f6c7bd9a96dcf0b8d"
dependencies = [ dependencies = [
"proc-macro2", "aes",
"quote", "byteorder",
"syn 1.0.99", "bzip2",
"constant_time_eq",
"crc32fast",
"crossbeam-utils",
"flate2",
"hmac",
"pbkdf2",
"sha1",
"time",
"zstd",
] ]
[[package]]
name = "tokio-util"
version = "0.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7e267c18a719545b481171952a79f8c25c80361463ba44bc7fa9eba7c742ef4f"
dependencies = [
"bytes 1.2.1",
"futures-core",
"futures-io",
"futures-sink",
"pin-project-lite",
"tokio",
]
[[package]]
name = "unicode-ident"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4f5b37a154999a8f3f98cc23a628d850e154479cd94decf3414696e12e31aaf"
[[package]]
name = "wasi"
version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
name = "winapi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
dependencies = [
"winapi-i686-pc-windows-gnu",
"winapi-x86_64-pc-windows-gnu",
]
[[package]]
name = "winapi-i686-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "windows-sys"
version = "0.36.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea04155a16a59f9eab786fe12a4a450e75cdb175f9e0d80da1e17db09f55b8d2"
dependencies = [
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_msvc"
version = "0.36.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9bb8c3fd39ade2d67e9874ac4f3db21f0d710bee00fe7cab16949ec184eeaa47"
[[package]]
name = "windows_i686_gnu"
version = "0.36.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "180e6ccf01daf4c426b846dfc66db1fc518f074baa793aa7d9b9aaeffad6a3b6"
[[package]]
name = "windows_i686_msvc"
version = "0.36.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2e7917148b2812d1eeafaeb22a97e4813dfa60a3f8f78ebe204bcc88f12f024"
[[package]]
name = "windows_x86_64_gnu"
version = "0.36.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4dcd171b8776c41b97521e5da127a2d86ad280114807d0b2ab1e462bc764d9e1"
[[package]]
name = "windows_x86_64_msvc"
version = "0.36.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c811ca4a8c853ef420abd8592ba53ddbbac90410fab6903b3e79972a631f7680"
[[package]] [[package]]
name = "zstd" name = "zstd"
version = "0.12.4" version = "0.10.2+zstd.1.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c" checksum = "5f4a6bd64f22b5e3e94b4e238669ff9f10815c27a5180108b849d24174a83847"
dependencies = [ dependencies = [
"zstd-safe", "zstd-safe",
] ]
[[package]] [[package]]
name = "zstd-safe" name = "zstd-safe"
version = "6.0.6" version = "4.1.6+zstd.1.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581" checksum = "94b61c51bb270702d6167b8ce67340d2754b088d0c091b06e593aa772c3ee9bb"
dependencies = [ dependencies = [
"libc", "libc",
"zstd-sys", "zstd-sys",
@ -573,11 +560,10 @@ dependencies = [
[[package]] [[package]]
name = "zstd-sys" name = "zstd-sys"
version = "2.0.8+zstd.1.5.5" version = "1.6.3+zstd.1.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c" checksum = "fc49afa5c8d634e75761feda8c592051e7eeb4683ba827211eb0d731d3402ea8"
dependencies = [ dependencies = [
"cc", "cc",
"libc", "libc",
"pkg-config",
] ]

View File

@ -7,14 +7,11 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies] [dependencies]
json = "0.12" zip = "0.6"
tokio = { version = "1.21.0", features = ["full"] } encoding = "0.2"
async_zip = { git = "https://github.com/Majored/rs-async-zip", rev = "ff0d985", features = [ textstream = "0.1"
"zstd", json="0.12"
"tokio", rayon="1.5"
"tokio-fs",
] }
futures-util = { version = "0.3.28", features = ["io", "tokio-io"] }
[profile.release] [profile.release]
opt-level = 3 opt-level = 3

21
LICENSE
View File

@ -1,21 +0,0 @@
MIT License
Copyright (c) 2022 Dmitry Belyaev
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -1,26 +0,0 @@
# chgk_txt2json
Конвертер **текстовых** файлов с вопросами в **JSON** формат.
Исходные файлы вопросов должны находиться в ZIP файле `baza_utf8.zip`, результат будет записан в файл `json.zip`.
## Особенности
### Кодировка исходных файлов
В оригинальных файлах базы используется кодировка `KOI8-R`.
Но из-за того, что [tokio::AsyncBufReadExt::lines](https://docs.rs/tokio/latest/tokio/io/trait.AsyncBufReadExt.html#method.lines) работает только с `UTF-8`, эта утилита работает с файлами, которые уже находятся в кодировке `UTF-8`.
### Алгоритм сжатия выходного архива
Для сжатия файлов в выходном архиве используется метод `Zstandard`, т.к. он достаточно быстр и по степени сжатия сопоставим с обычным `Deflate`.
Но для просмотра содержимого таких ZIP-файлов нужно использовать ПО с поддержкой `Zstandard`, например:
- [Modern7z](https://www.tc4shell.com/ru/7zip/modern7z)
- [7-Zip-zstd](https://github.com/mcmilk/7-Zip-zstd)
## Ссылки
- Исходная база вопросов: http://db.chgk.info
- Копия файлов базы вопросов: https://gitlab.com/b4tman/db_chgk
- Утилита конвертации кодировки текстовых файлов в ZIP архиве: https://gitea.b4tman.ru/temp/ztb_recode

View File

@ -1,20 +1,19 @@
extern crate async_zip; extern crate encoding;
extern crate json; extern crate json;
extern crate tokio; extern crate rayon;
extern crate textstream;
extern crate zip;
use async_zip::tokio::read::seek::ZipFileReader; use encoding::all::KOI8_R;
use async_zip::tokio::write::ZipFileWriter; use encoding::DecoderTrap;
use async_zip::{Compression, ZipEntryBuilder}; use rayon::prelude::*;
use futures_util::io::{AsyncBufReadExt, BufReader};
use futures_util::stream::StreamExt;
use std::path::PathBuf; use std::path::PathBuf;
use std::str::FromStr; use std::str::FromStr;
use tokio::fs; use std::{fs, io};
use tokio::sync::mpsc::{self, UnboundedReceiver, UnboundedSender}; use textstream::TextReader;
const INPUT_FILENAME: &str = "baza_utf8.zip"; const BASE_FILENAME: &str = "baza.zip";
const OUTPUT_FILENAME: &str = "json.zip"; const OUTPUT_PATH: &str = "json";
const OUTPUT_COMPRESSION: Compression = Compression::Zstd;
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy)]
enum KeywordType { enum KeywordType {
@ -26,24 +25,6 @@ enum KeywordType {
CurrentScope, CurrentScope,
} }
impl FromStr for KeywordType {
type Err = ();
fn from_str(pattern: &str) -> Result<Self, Self::Err> {
use KeywordType::*;
Ok(match pattern {
"Мета:" => Ignore,
"Чемпионат:" | "Пакет:" => Global,
"Тур:" => QuestionPre,
"Вопрос " | "Вопрос:" => QuestionStart,
"Ответ:" | "Зачет:" => QuestionContent,
_ => CurrentScope,
// "URL:" | "Ссылка:" | "Дата:" | "Обработан:" | "Автор:" | "Редактор:" | "Копирайт:" | "Инфо:" |
// "Тема:" | "Вид:" | "Тип:" | "Источник:" | "Рейтинг:" | "Комментарий:" | "Комментарии:"
})
}
}
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy)]
enum DataScope { enum DataScope {
Global, Global,
@ -51,7 +32,7 @@ enum DataScope {
QuestionContent, QuestionContent,
} }
struct QuestionsParser { struct Context {
// global output value // global output value
data: json::JsonValue, data: json::JsonValue,
// temp questions array // temp questions array
@ -75,9 +56,64 @@ struct QuestionsParser {
last_tag: String, last_tag: String,
} }
/// Text questions parser // check questions before push
impl QuestionsParser { trait PushIfValid {
const PATTERNS: &'static [&'static str] = &[ fn is_valid(&self) -> bool;
fn push_if_valid(&mut self, value: json::JsonValue);
}
impl PushIfValid for json::JsonValue {
    /// A question object is valid when it has both the "Вопрос" and the "Ответ" keys.
    fn is_valid(&self) -> bool {
        ["Вопрос", "Ответ"].iter().all(|&key| self.has_key(key))
    }
    /// Appends `value` to `self` only when `value` is a valid question.
    fn push_if_valid(&mut self, value: json::JsonValue) {
        if !value.is_valid() {
            return;
        }
        // a failed push (e.g. `self` is not an array) is deliberately ignored
        let _ = self.push(value);
    }
}
impl Context {
    /// Builds the initial parser state: empty JSON containers, no keyword seen yet
    /// and the global data scope selected.
    fn new() -> Context {
        Self {
            // output containers
            data: json::JsonValue::new_object(),
            questions: json::JsonValue::new_array(),
            // question being collected and the values copied into every new question
            cur_question: json::JsonValue::new_object(),
            cur_question_pre: json::JsonValue::new_object(),
            // state of the keyword currently being read
            cur_keyword_type: None,
            cur_tag: String::new(),
            cur_content: Vec::new(),
            cur_scope: DataScope::Global,
            have_new_question: false,
            // state of the previously read keyword
            last_keyword_type: None,
            last_tag: String::new(),
        }
    }
}
impl FromStr for KeywordType {
type Err = ();
fn from_str(pattern: &str) -> Result<Self, Self::Err> {
use KeywordType::*;
Ok(match pattern {
"Мета:" => Ignore,
"Чемпионат:" | "Пакет:" => Global,
"Тур:" => QuestionPre,
"Вопрос " | "Вопрос:" => QuestionStart,
"Ответ:" | "Зачет:" => QuestionContent,
_ => CurrentScope,
// "URL:" | "Ссылка:" | "Дата:" | "Обработан:" | "Автор:" | "Редактор:" | "Копирайт:" | "Инфо:" |
// "Тема:" | "Вид:" | "Тип:" | "Источник:" | "Рейтинг:" | "Комментарий:" | "Комментарии:"
})
}
}
fn parse_file(file: impl io::Read) -> Result<json::JsonValue, Box<dyn std::error::Error>> {
let buf = io::BufReader::new(file);
let reader = TextReader::new(buf, KOI8_R, DecoderTrap::Ignore);
let patterns = vec![
"Чемпионат:", "Чемпионат:",
"Пакет:", "Пакет:",
"URL:", "URL:",
@ -102,276 +138,188 @@ impl QuestionsParser {
"Комментарий:", "Комментарий:",
"Комментарии:", "Комментарии:",
]; ];
let mut context = Context::new();
let mut ctx = &mut context;
/// create new parser reader
pub fn new() -> QuestionsParser { .lines()
QuestionsParser { .map(|line| String::from(line.unwrap().trim()))
data: json::JsonValue::new_object(), .filter(|line| !line.is_empty()) // ignore empty lines
questions: json::JsonValue::new_array(), .for_each(|line| {
cur_keyword_type: None, match patterns
cur_question: json::JsonValue::new_object(), .iter() // find keyword
cur_question_pre: json::JsonValue::new_object(), .find(|&&pattern| line.starts_with(pattern) && line.ends_with(':'))
cur_tag: String::new(), {
cur_content: Vec::<String>::new(), Some(pattern) => {
cur_scope: DataScope::Global, use KeywordType::*;
have_new_question: false,
last_keyword_type: None,
last_tag: String::new(),
}
}
/// join current content lines
fn get_current_content(&self) -> String {
self.cur_content.join("\n")
}
    /// Drops all content lines accumulated so far.
    fn clear_current_content(&mut self) {
        self.cur_content.clear()
    }
    /// Appends one more line to the accumulated content.
    fn append_to_current_content(&mut self, line: String) {
        self.cur_content.push(line);
    }
/// check current question have required fields
fn is_current_question_valid(&self) -> bool {
self.cur_question.has_key("Вопрос") && self.cur_question.has_key("Ответ")
}
/// add current question to parsed array
fn add_cur_question(&mut self) {
if self.is_current_question_valid() {
let current = std::mem::replace(&mut self.cur_question, self.cur_question_pre.clone());
self.questions.push(current).unwrap()
}
}
/// set current content to last tag(keyword) to data scope
fn apply_content_to(&mut self, scope: DataScope) {
let content = self.get_current_content();
// match value to store data
let scope_data = match scope {
DataScope::Global => &mut self.data,
DataScope::QuestionPre => &mut self.cur_question_pre,
DataScope::QuestionContent => &mut self.cur_question,
};
scope_data[&self.last_tag] = content.into();
self.clear_current_content();
}
/// set current content to last tag(keyword) to current scope
fn apply_content_to_cur_scope(&mut self) {
self.apply_content_to(self.cur_scope);
}
    /// Selects `scope` as the current data scope.
    fn set_scope(&mut self, scope: DataScope) {
        self.cur_scope = scope;
    }
/// set current scope and set current content to last tag(keyword) to data scope
fn set_scope_and_apply(&mut self, scope: DataScope) {
self.set_scope(scope);
self.apply_content_to_cur_scope();
}
/// add last question (if have) and start collecting new one
fn start_new_question(&mut self) {
// store prev question before reading new
if self.have_new_question {
self.add_cur_question();
}
self.have_new_question = true;
}
/// check last tag(keyword) and set current content to corresponding data scope ctx.last_keyword_type = ctx.cur_keyword_type;
fn apply_content_for_last_keyword(&mut self) { ctx.last_tag = ctx.cur_tag.clone();
// apply accumulated content when new keyword found ctx.cur_keyword_type = Some(pattern.parse().unwrap());
match self.last_keyword_type { ctx.cur_tag = pattern.replace(' ', "").replace(':', "");
Some(KeywordType::Global) => {
self.set_scope_and_apply(DataScope::Global);
}
Some(KeywordType::QuestionPre) => {
self.set_scope_and_apply(DataScope::QuestionPre);
}
Some(KeywordType::QuestionStart) => {
self.start_new_question();
self.set_scope_and_apply(DataScope::QuestionContent);
}
Some(KeywordType::QuestionContent) => {
self.apply_content_to(DataScope::QuestionContent);
}
Some(KeywordType::CurrentScope) => {
self.apply_content_to_cur_scope();
}
_ => (), //None or Ignore
};
}
/// set current keyword(tag) and type as last, and set new as current
fn set_new_keyword(&mut self, keyword: &str) {
self.last_keyword_type =
std::mem::replace(&mut self.cur_keyword_type, Some(keyword.parse().unwrap()));
self.last_tag = std::mem::replace(
&mut self.cur_tag,
keyword.trim_end().trim_end_matches(':').to_string(),
);
}
/// if line matched keyword
fn on_keyword_match(&mut self, line: &str, keyword: &str) {
self.set_new_keyword(keyword);
// remember question id // remember question id
if let Some(KeywordType::QuestionStart) = self.cur_keyword_type { if let Some(QuestionStart) = ctx.cur_keyword_type {
self.cur_question_pre["id"] = line.trim_end().trim_end_matches(':').into(); ctx.cur_question_pre["id"] = line.replace(':', "").as_str().into();
}; };
self.apply_content_for_last_keyword(); // apply accumulated content when new keyword found
} match ctx.last_keyword_type {
Some(Global) => {
/// parse next line ctx.cur_scope = DataScope::Global;
pub fn parse_line(&mut self, line: &str) { ctx.data[&ctx.last_tag] = ctx.cur_content.join("\n").into()
match QuestionsParser::PATTERNS }
.iter() // find keyword Some(QuestionPre) => {
.find(|&&pattern| line.starts_with(pattern) && line.ends_with(':')) ctx.cur_scope = DataScope::QuestionPre;
{ ctx.cur_question_pre[&ctx.last_tag] = ctx.cur_content.join("\n").into();
Some(pattern) => { }
self.on_keyword_match(line, pattern); Some(QuestionStart) => {
} ctx.cur_scope = DataScope::QuestionContent;
None => { // store prev question before reading new
self.append_to_current_content(line.to_string()); if ctx.have_new_question {
} ctx.questions.push_if_valid(ctx.cur_question.clone());
} }
} // prepare to read new question data with cur_question_pre values
ctx.cur_question = ctx.cur_question_pre.clone();
/// finish parsing ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
pub fn finish(&mut self) { ctx.have_new_question = true;
if self.have_new_question && !self.cur_content.is_empty() { }
self.cur_question[&self.cur_tag] = self.get_current_content().into(); Some(QuestionContent) => {
self.add_cur_question(); ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
self.clear_current_content(); }
self.have_new_question = false; Some(CurrentScope) => {
} // match value to store data
self.data["Вопросы"] = std::mem::replace(&mut self.questions, json::JsonValue::new_array()); let scope_data = match ctx.cur_scope {
} DataScope::Global => &mut ctx.data,
DataScope::QuestionPre => &mut ctx.cur_question_pre,
/// get parsed data DataScope::QuestionContent => &mut ctx.cur_question,
pub fn get_parsed(self) -> json::JsonValue { };
self.data scope_data[&ctx.last_tag] = ctx.cur_content.join("\n").into();
} }
} _ => (), //None or Ignore
};
#[derive(Debug)] // clear content
struct FileText { ctx.cur_content.clear();
name: String, }
text: String, None => {
} // accumulate content if line is not a keyword
ctx.cur_content.push(line);
/// Message sent by the zip text reader through its channel.
#[derive(Debug)]
enum TextReaderMessage {
    /// One line of text read from the current archive entry.
    NextLine(String),
    /// The current archive entry has been read; carries the entry's file name.
    EndOfFile(String),
}
/// Reads every `.txt` entry of the input zip archive line by line and sends the
/// lines through `tx`.
///
/// For each entry a sequence of `NextLine` messages is sent, followed by one
/// `EndOfFile` message with the entry name. A read error inside an entry is reported
/// on stderr and ends that entry early (the `EndOfFile` message is still sent), so a
/// truncated file does not go unnoticed.
///
/// Panics when the archive cannot be opened, an entry cannot be read, or the
/// receiving side of the channel is closed.
async fn zip_text_reader(tx: UnboundedSender<TextReaderMessage>) {
    // open archive just to list files
    let mut file = fs::File::open(INPUT_FILENAME).await.expect("open zip");
    let archive = ZipFileReader::with_tokio(&mut file)
        .await
        .expect("open zip file reader");
    let source_files: Vec<(usize, String)> = archive
        .file()
        .entries()
        .iter()
        .enumerate()
        .filter(|(_, entry)| !entry.dir().unwrap_or(true))
        .map(|(index, entry)| (index, entry.filename().as_str().unwrap().to_string()))
        .filter(|(_, filename)| filename.ends_with(".txt"))
        .collect();
    // reopen the archive for the actual reading
    drop(archive);
    let mut file = fs::File::open(INPUT_FILENAME).await.expect("open zip 2");
    let mut archive = ZipFileReader::with_tokio(&mut file)
        .await
        .expect("open zip file reader 2");
    for (index, name) in source_files {
        let entry_reader = archive.reader_with_entry(index).await.expect("read entry");
        let buf_reader = BufReader::new(entry_reader);
        let lines = buf_reader.lines();
        tokio::pin!(lines);
        while let Some(next) = lines.next().await {
            match next {
                Ok(line) => tx
                    .send(TextReaderMessage::NextLine(line))
                    .expect("send line"),
                Err(err) => {
                    // do not drop the rest of the file silently
                    eprintln!("read error in \"{}\": {}", name, err);
                    break;
                }
            }
        }
        tx.send(TextReaderMessage::EndOfFile(name))
            .expect("send end");
    }
    println!("read done ✅");
}
/// convert text questions to json format
async fn questions_converter(
mut rx: UnboundedReceiver<TextReaderMessage>,
tx: UnboundedSender<FileText>,
) {
let mut parser = QuestionsParser::new();
while let Some(msg) = rx.recv().await {
match msg {
TextReaderMessage::NextLine(line) => {
let line = line.trim();
if line.is_empty() {
continue;
} }
parser.parse_line(line);
}
TextReaderMessage::EndOfFile(name) => {
parser.finish();
let data_json = parser.get_parsed();
let text = data_json.pretty(2);
tx.send(FileText { name, text }).expect("send json");
parser = QuestionsParser::new();
} }
});
// finish reading last question
if ctx.have_new_question && !ctx.cur_content.is_empty() {
ctx.cur_question[&ctx.cur_tag] = ctx.cur_content.join("\n").into();
ctx.questions.push_if_valid(ctx.cur_question.clone());
ctx.have_new_question = false;
}
ctx.data["Вопросы"] = ctx.questions.clone();
Ok(ctx.data.clone())
}
/// Split a vector into a vector of `num` slices.
trait SplitTo<T> {
    /// Returns slices borrowed from `self`; `num` is the requested number of parts.
    fn split_to(&self, num: usize) -> Vec<&[T]>;
}
impl<T> SplitTo<T> for Vec<T> {
fn split_to(&self, num: usize) -> Vec<&[T]> {
let part_len = self.len() / num;
let add_len = self.len() % num;
let mut result = Vec::<&[T]>::with_capacity(num);
if 0 == part_len {
result.push(self);
return result;
} }
for i in 0..num {
let size = if (num - 1) == i {
part_len + add_len
} else {
part_len
};
let start = part_len * i;
result.push(&self[start..(start + size)]);
}
result
} }
println!("convert done ✅");
} }
/// write json data to zip files fn process_files(files: &&[PathBuf]) {
async fn zip_json_writer(mut rx: UnboundedReceiver<FileText>) { if files.is_empty() {
let mut file = fs::File::create(OUTPUT_FILENAME) return;
.await }
.expect("create file");
let mut writer = ZipFileWriter::with_tokio(&mut file); let start_file = files[0].to_str().unwrap();
println!("-> start from \"{}\" ({} files)", start_file, files.len());
let zip_file = fs::File::open(BASE_FILENAME).unwrap();
let zip_reader = io::BufReader::new(zip_file);
let mut archive = zip::ZipArchive::new(zip_reader).unwrap();
files.iter().for_each(|name| {
let name_str = name.to_str().unwrap();
// parse txt file
let file = archive.by_name(name_str).unwrap();
let data = parse_file(file).unwrap();
while let Some(FileText { name, text: data }) = rx.recv().await {
// make output filename // make output filename
let mut outfilename = PathBuf::from(name); let mut outfilename = PathBuf::from(OUTPUT_PATH);
outfilename.push(name);
outfilename.set_extension("json"); outfilename.set_extension("json");
let outfilename = outfilename.to_str().unwrap().to_string();
let opts = ZipEntryBuilder::new(outfilename.into(), OUTPUT_COMPRESSION);
// write new zip entry // save json to file
writer let mut outfile = fs::File::create(outfilename).unwrap();
.write_entry_whole(opts, data.as_bytes()) data.write_pretty(&mut outfile, 1).unwrap();
.await });
.expect("write entry");
}
writer.close().await.expect("close writer");
println!("write done ✅"); println!("<- done {} files (from \"{}\")", files.len(), start_file);
} }
#[tokio::main] fn main() -> Result<(), Box<dyn std::error::Error>> {
async fn main() -> Result<(), Box<dyn std::error::Error>> { // open archive just to list files
// check output filename let zip_file = fs::File::open(BASE_FILENAME)?;
match fs::metadata(OUTPUT_FILENAME).await { let zip_reader = io::BufReader::new(zip_file);
Ok(x) if x.is_dir() => return Err("output file is a directory!".into()), let mut archive = zip::ZipArchive::new(zip_reader)?;
_ => (),
let source_files: Vec<PathBuf> = (0..archive.len())
.map(|i| archive.by_index(i).unwrap().mangled_name())
.filter(|name| {
// skip files without "txt" extension
match name.extension() {
Some(ext) => match ext.to_str() {
Some(ext_str) => ext_str.eq_ignore_ascii_case("txt"),
_ => false, // extension is not valid unicode or not txt
},
_ => false, // no extension in filename
}
})
.collect();
drop(archive);
// check output directory
let out_dir: PathBuf = OUTPUT_PATH.into();
if out_dir.is_file() {
return Err("output directory is file!".into());
} else if !out_dir.exists() {
fs::create_dir_all(out_dir)?;
}; };
let (reader_tx, reader_rx) = mpsc::unbounded_channel::<TextReaderMessage>(); println!(
let (json_tx, json_rx) = mpsc::unbounded_channel::<FileText>(); "processing {} files with {} threads...",
source_files.len(),
rayon::current_num_threads()
);
tokio::try_join!( // split vector and process its parts in parallel
tokio::spawn(zip_text_reader(reader_tx)), source_files
tokio::spawn(questions_converter(reader_rx, json_tx)), .split_to(rayon::current_num_threads())
tokio::spawn(zip_json_writer(json_rx)) .par_iter()
)?; .for_each(process_files);
println!("all done ✅"); println!("done");
Ok(()) Ok(())
} }