Compare commits

...

22 Commits

Author SHA1 Message Date
65c64f17d7
simplify filename filter 2023-08-01 17:12:23 +03:00
130e5ee2aa
upd async_zip 2023-08-01 16:49:15 +03:00
a53bc4643b
add LICENSE 2022-10-01 15:42:39 +03:00
38e41f2eab
add README 2022-10-01 15:40:17 +03:00
01d05fab01
remove recode.ps1 2022-10-01 15:39:53 +03:00
2a06eabd15
split reader/converter 2022-10-01 12:42:32 +03:00
24f675ca16
use mem::replace in add_cur_question 2022-09-23 23:20:17 +03:00
aed59f4663
use BufWriter 2022-09-23 23:01:57 +03:00
73f485eb59
std::mem::replace instead of clones 2022-09-23 22:50:42 +03:00
9d1885b245
trim instead of replace 2022-09-23 22:37:17 +03:00
fbca249149
use unbounded_channel instead of VecDeque 2022-09-20 17:17:59 +03:00
bc7bdee83a
out compression: zstd 2022-09-20 15:34:39 +03:00
61587132a3
reader/writer rename 2022-09-20 14:35:47 +03:00
d77e164d07
parser refactor 2022-09-20 14:31:37 +03:00
7eaa394d7d
cleanup 2022-09-20 11:39:53 +03:00
a0a3ea0fbb
fmt 2022-09-20 00:16:04 +03:00
5999e2bb92
separate read/write with queue 2022-09-20 00:12:49 +03:00
56a20dc6b1
zip to zip 2022-09-12 22:34:17 +03:00
bb126256c7
cleanup deps 2022-09-12 16:49:53 +03:00
c29a247284
spawn_blocking on pretty 2022-09-12 16:38:23 +03:00
ed87fd17ea
no spawn 2022-09-12 16:30:13 +03:00
9cb95c435a
async v1 2022-09-12 16:25:15 +03:00
6 changed files with 735 additions and 618 deletions

3
.gitignore vendored
View File

@ -1,4 +1,5 @@
/target /target
**/*.rs.bk **/*.rs.bk
baza.zip baza*.zip
/json/ /json/
json.zip

776
Cargo.lock generated
View File

@ -3,21 +3,31 @@
version = 3 version = 3
[[package]] [[package]]
name = "adler" name = "async-compression"
version = "1.0.2" version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" checksum = "62b74f44609f0f91493e3082d3734d98497e094777144380ea4db9f9905dd5b6"
dependencies = [
"futures-core",
"futures-io",
"memchr",
"pin-project-lite",
"zstd",
"zstd-safe",
]
[[package]] [[package]]
name = "aes" name = "async_zip"
version = "0.7.5" version = "0.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "git+https://github.com/Majored/rs-async-zip?rev=ff0d985#ff0d985ef54cf00d73c497dbca0beea7541e37dc"
checksum = "9e8b47f52ea9bae42228d07ec09eb676433d7c4ed1ebdf0f1d1c29ed446f1ab8"
dependencies = [ dependencies = [
"cfg-if", "async-compression",
"cipher", "crc32fast",
"cpufeatures", "futures-util",
"opaque-debug", "pin-project",
"thiserror",
"tokio",
"tokio-util",
] ]
[[package]] [[package]]
@ -27,19 +37,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]] [[package]]
name = "base64ct" name = "bitflags"
version = "1.0.1" version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a32fd6af2b5827bce66c29053ba0e7c42b9dcab01835835058558c10851a46b" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "block-buffer"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0bf7fe51849ea569fd452f37822f606a5cabb684dc918707a0193fd4664ff324"
dependencies = [
"generic-array",
]
[[package]] [[package]]
name = "byteorder" name = "byteorder"
@ -48,31 +49,26 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
[[package]] [[package]]
name = "bzip2" name = "bytes"
version = "0.4.3" version = "0.4.12"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6afcd980b5f3a45017c57e57a2fcccbb351cc43a356ce117ef760ef8052b89b0" checksum = "206fdffcfa2df7cbe15601ef46c813fce0965eb3286db6b56c583b814b51c81c"
dependencies = [ dependencies = [
"bzip2-sys", "byteorder",
"libc", "iovec",
] ]
[[package]] [[package]]
name = "bzip2-sys" name = "bytes"
version = "0.1.11+1.0.8" version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" checksum = "ec8a7b6a70fde80372154c65702f00a0f56f3e1c36abbc6c440484be248856db"
dependencies = [
"cc",
"libc",
"pkg-config",
]
[[package]] [[package]]
name = "cc" name = "cc"
version = "1.0.73" version = "1.0.79"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
dependencies = [ dependencies = [
"jobserver", "jobserver",
] ]
@ -87,35 +83,10 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
name = "chgk_txt2json" name = "chgk_txt2json"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"encoding", "async_zip",
"futures-util",
"json", "json",
"rayon", "tokio",
"textstream",
"zip",
]
[[package]]
name = "cipher"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ee52072ec15386f770805afd189a01c8841be8696bed250fa2f13c4c0d6dfb7"
dependencies = [
"generic-array",
]
[[package]]
name = "constant_time_eq"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc"
[[package]]
name = "cpufeatures"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59a6001667ab124aebae2a495118e11d30984c3a653e99d86d58971708cf5e4b"
dependencies = [
"libc",
] ]
[[package]] [[package]]
@ -128,159 +99,61 @@ dependencies = [
] ]
[[package]] [[package]]
name = "crossbeam-channel" name = "futures"
version = "0.5.6" version = "0.1.31"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521" checksum = "3a471a38ef8ed83cd6e40aa59c1ffe17db6855c18e3604d9c4ed8c08ebc28678"
[[package]]
name = "futures-core"
version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c"
[[package]]
name = "futures-io"
version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964"
[[package]]
name = "futures-macro"
version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72"
dependencies = [ dependencies = [
"cfg-if", "proc-macro2",
"crossbeam-utils", "quote",
"syn 2.0.28",
] ]
[[package]] [[package]]
name = "crossbeam-deque" name = "futures-sink"
version = "0.8.2" version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc" checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e"
[[package]]
name = "futures-task"
version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65"
[[package]]
name = "futures-util"
version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533"
dependencies = [ dependencies = [
"cfg-if", "futures-core",
"crossbeam-epoch", "futures-io",
"crossbeam-utils", "futures-macro",
] "futures-task",
"memchr",
[[package]] "pin-project-lite",
name = "crossbeam-epoch" "pin-utils",
version = "0.9.10" "slab",
source = "registry+https://github.com/rust-lang/crates.io-index" "tokio-io",
checksum = "045ebe27666471bb549370b4b0b3e51b07f56325befa4284db65fc89c02511b1"
dependencies = [
"autocfg",
"cfg-if",
"crossbeam-utils",
"memoffset",
"once_cell",
"scopeguard",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51887d4adc7b564537b15adcfb307936f8075dfcd5f00dde9a9f1d29383682bc"
dependencies = [
"cfg-if",
"once_cell",
]
[[package]]
name = "crypto-common"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3"
dependencies = [
"generic-array",
"typenum",
]
[[package]]
name = "digest"
version = "0.10.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f2fb860ca6fafa5552fb6d0e816a69c8e49f0908bf524e30a90d97c85892d506"
dependencies = [
"block-buffer",
"crypto-common",
"subtle",
]
[[package]]
name = "either"
version = "1.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f107b87b6afc2a64fd13cac55fe06d6c8859f12d4b14cbcdd2c67d0976781be"
[[package]]
name = "encoding"
version = "0.2.33"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec"
dependencies = [
"encoding-index-japanese",
"encoding-index-korean",
"encoding-index-simpchinese",
"encoding-index-singlebyte",
"encoding-index-tradchinese",
]
[[package]]
name = "encoding-index-japanese"
version = "1.20141219.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91"
dependencies = [
"encoding_index_tests",
]
[[package]]
name = "encoding-index-korean"
version = "1.20141219.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81"
dependencies = [
"encoding_index_tests",
]
[[package]]
name = "encoding-index-simpchinese"
version = "1.20141219.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7"
dependencies = [
"encoding_index_tests",
]
[[package]]
name = "encoding-index-singlebyte"
version = "1.20141219.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a"
dependencies = [
"encoding_index_tests",
]
[[package]]
name = "encoding-index-tradchinese"
version = "1.20141219.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18"
dependencies = [
"encoding_index_tests",
]
[[package]]
name = "encoding_index_tests"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569"
[[package]]
name = "flate2"
version = "1.0.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f82b0f4c27ad9f8bfd1f3208d882da2b09c301bc1c828fd3a00d0216d2fbbff6"
dependencies = [
"crc32fast",
"miniz_oxide",
]
[[package]]
name = "generic-array"
version = "0.14.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bff49e947297f3312447abdca79f45f4738097cc82b06e72054d2223f601f1b9"
dependencies = [
"typenum",
"version_check",
] ]
[[package]] [[package]]
@ -293,25 +166,19 @@ dependencies = [
] ]
[[package]] [[package]]
name = "hmac" name = "iovec"
version = "0.12.1" version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" checksum = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e"
dependencies = [ dependencies = [
"digest", "libc",
] ]
[[package]]
name = "itoa"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c8af84674fe1f223a982c933a0ee1086ac4d4052aa0fb8060c12c6ad838e754"
[[package]] [[package]]
name = "jobserver" name = "jobserver"
version = "0.1.24" version = "0.1.26"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af25a77299a7f711a01975c35a6a424eb6862092cc2d6c72c4ed6cbc56dfc1fa" checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2"
dependencies = [ dependencies = [
"libc", "libc",
] ]
@ -324,35 +191,45 @@ checksum = "078e285eafdfb6c4b434e0d31e8cfcb5115b651496faca5749b88fafd4f23bfd"
[[package]] [[package]]
name = "libc" name = "libc"
version = "0.2.131" version = "0.2.132"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04c3b4822ccebfa39c02fc03d1534441b22ead323fa0f48bb7ddd8e6ba076a40" checksum = "8371e4e5341c3a96db127eb2465ac681ced4c433e01dd0e938adbef26ba93ba5"
[[package]]
name = "lock_api"
version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9f80bf5aacaf25cbfc8210d1cfb718f2bf3b11c4c54e5afe36c236853a8ec390"
dependencies = [
"autocfg",
"scopeguard",
]
[[package]]
name = "log"
version = "0.4.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e"
dependencies = [
"cfg-if",
]
[[package]] [[package]]
name = "memchr" name = "memchr"
version = "1.0.2" version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "148fab2e51b4f1cfc66da2a7c32981d1d3c083a803978268bb11fe4b86925e7a" checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
[[package]]
name = "mio"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57ee1c23c7c63b0c9250c339ffdc69255f110b298b901b9f6c82547b7b87caaf"
dependencies = [ dependencies = [
"libc", "libc",
] "log",
"wasi",
[[package]] "windows-sys",
name = "memoffset"
version = "0.6.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce"
dependencies = [
"autocfg",
]
[[package]]
name = "miniz_oxide"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f5c75688da582b8ffc1f1799e9db273f32133c49e048f614d22ec3256773ccc"
dependencies = [
"adler",
] ]
[[package]] [[package]]
@ -365,84 +242,98 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "num_threads"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44"
dependencies = [
"libc",
]
[[package]] [[package]]
name = "once_cell" name = "once_cell"
version = "1.13.0" version = "1.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "18a6dbe30758c9f83eb00cbea4ac95966305f5a7772f3f42ebfc7fc7eddbd8e1" checksum = "2f7254b99e31cad77da24b08ebf628882739a608578bb1bcdfc1f9c21260d7c0"
[[package]] [[package]]
name = "opaque-debug" name = "parking_lot"
version = "0.3.0" version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f"
[[package]]
name = "password-hash"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d791538a6dcc1e7cb7fe6f6b58aca40e7f79403c45b2bc274008b5e647af1d8"
dependencies = [ dependencies = [
"base64ct", "lock_api",
"rand_core", "parking_lot_core",
"subtle",
] ]
[[package]] [[package]]
name = "pbkdf2" name = "parking_lot_core"
version = "0.10.1" version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "271779f35b581956db91a3e55737327a03aa051e90b1c47aeb189508533adfd7" checksum = "09a279cbf25cb0757810394fbc1e359949b59e348145c643a939a525692e6929"
dependencies = [ dependencies = [
"digest", "cfg-if",
"hmac", "libc",
"password-hash", "redox_syscall",
"sha2", "smallvec",
"windows-sys",
] ]
[[package]]
name = "pin-project"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "030ad2bc4db10a8944cb0d837f158bdfec4d4a4873ab701a95046770d11f8842"
dependencies = [
"pin-project-internal",
]
[[package]]
name = "pin-project-internal"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec2e072ecce94ec471b13398d5402c188e76ac03cf74dd1a975161b23a3f6d9c"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.28",
]
[[package]]
name = "pin-project-lite"
version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116"
[[package]]
name = "pin-utils"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]] [[package]]
name = "pkg-config" name = "pkg-config"
version = "0.3.25" version = "0.3.27"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1df8c4ec4b0627e53bdf214615ad287367e482558cf84b109250b37464dc03ae" checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964"
[[package]] [[package]]
name = "rand_core" name = "proc-macro2"
version = "0.6.3" version = "1.0.66"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9"
[[package]]
name = "rayon"
version = "1.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bd99e5772ead8baa5215278c9b15bf92087709e9c1b2d1f97cdb5a183c933a7d"
dependencies = [ dependencies = [
"autocfg", "unicode-ident",
"crossbeam-deque",
"either",
"rayon-core",
] ]
[[package]] [[package]]
name = "rayon-core" name = "quote"
version = "1.9.3" version = "1.0.32"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f" checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965"
dependencies = [ dependencies = [
"crossbeam-channel", "proc-macro2",
"crossbeam-deque", ]
"crossbeam-utils",
"num_cpus", [[package]]
name = "redox_syscall"
version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
dependencies = [
"bitflags",
] ]
[[package]] [[package]]
@ -452,107 +343,229 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
[[package]] [[package]]
name = "sha1" name = "signal-hook-registry"
version = "0.10.1" version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c77f4e7f65455545c2153c1253d25056825e77ee2533f0e41deb65a93a34852f" checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0"
dependencies = [ dependencies = [
"cfg-if",
"cpufeatures",
"digest",
]
[[package]]
name = "sha2"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55deaec60f81eefe3cce0dc50bda92d6d8e88f2a27df7c5033b42afeb1ed2676"
dependencies = [
"cfg-if",
"cpufeatures",
"digest",
]
[[package]]
name = "subtle"
version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601"
[[package]]
name = "textstream"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7ed81b342f6566026755e7f4b7798810b1c159722e427d212ce72c2c58ffdaa"
dependencies = [
"encoding",
"memchr",
]
[[package]]
name = "time"
version = "0.3.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db76ff9fa4b1458b3c7f077f3ff9887394058460d21e634355b273aaf11eea45"
dependencies = [
"itoa",
"libc", "libc",
"num_threads",
"time-macros",
] ]
[[package]] [[package]]
name = "time-macros" name = "slab"
version = "0.2.4" version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42657b1a6f4d817cda8e7a0ace261fe0cc946cf3a80314390b22cc61ae080792" checksum = "6528351c9bc8ab22353f9d776db39a20288e8d6c37ef8cfe3317cf875eecfc2d"
[[package]]
name = "typenum"
version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987"
[[package]]
name = "version_check"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
[[package]]
name = "zip"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf225bcf73bb52cbb496e70475c7bd7a3f769df699c0020f6c7bd9a96dcf0b8d"
dependencies = [ dependencies = [
"aes", "autocfg",
"byteorder",
"bzip2",
"constant_time_eq",
"crc32fast",
"crossbeam-utils",
"flate2",
"hmac",
"pbkdf2",
"sha1",
"time",
"zstd",
] ]
[[package]]
name = "smallvec"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2fd0db749597d91ff862fd1d55ea87f7855a744a8425a64695b6fca237d1dad1"
[[package]]
name = "socket2"
version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "02e2d2db9033d13a1567121ddd7a095ee144db4e1ca1b1bda3419bc0da294ebd"
dependencies = [
"libc",
"winapi",
]
[[package]]
name = "syn"
version = "1.0.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "58dbef6ec655055e20b86b15a8cc6d439cca19b667537ac6a1369572d151ab13"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "syn"
version = "2.0.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04361975b3f5e348b2189d8dc55bc942f278b2d482a6a0365de5bdd62d351567"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "thiserror"
version = "1.0.34"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8c1b05ca9d106ba7d2e31a9dab4a64e7be2cce415321966ea3132c49a656e252"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.34"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8f2591983642de85c921015f3f070c665a197ed69e417af436115e3a1407487"
dependencies = [
"proc-macro2",
"quote",
"syn 1.0.99",
]
[[package]]
name = "tokio"
version = "1.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89797afd69d206ccd11fb0ea560a44bbb87731d020670e79416d442919257d42"
dependencies = [
"autocfg",
"bytes 1.2.1",
"libc",
"memchr",
"mio",
"num_cpus",
"once_cell",
"parking_lot",
"pin-project-lite",
"signal-hook-registry",
"socket2",
"tokio-macros",
"winapi",
]
[[package]]
name = "tokio-io"
version = "0.1.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57fc868aae093479e3131e3d165c93b1c7474109d13c90ec0dda2a1bbfff0674"
dependencies = [
"bytes 0.4.12",
"futures",
"log",
]
[[package]]
name = "tokio-macros"
version = "1.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9724f9a975fb987ef7a3cd9be0350edcbe130698af5b8f7a631e23d42d052484"
dependencies = [
"proc-macro2",
"quote",
"syn 1.0.99",
]
[[package]]
name = "tokio-util"
version = "0.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7e267c18a719545b481171952a79f8c25c80361463ba44bc7fa9eba7c742ef4f"
dependencies = [
"bytes 1.2.1",
"futures-core",
"futures-io",
"futures-sink",
"pin-project-lite",
"tokio",
]
[[package]]
name = "unicode-ident"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4f5b37a154999a8f3f98cc23a628d850e154479cd94decf3414696e12e31aaf"
[[package]]
name = "wasi"
version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
name = "winapi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
dependencies = [
"winapi-i686-pc-windows-gnu",
"winapi-x86_64-pc-windows-gnu",
]
[[package]]
name = "winapi-i686-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "windows-sys"
version = "0.36.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea04155a16a59f9eab786fe12a4a450e75cdb175f9e0d80da1e17db09f55b8d2"
dependencies = [
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_msvc"
version = "0.36.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9bb8c3fd39ade2d67e9874ac4f3db21f0d710bee00fe7cab16949ec184eeaa47"
[[package]]
name = "windows_i686_gnu"
version = "0.36.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "180e6ccf01daf4c426b846dfc66db1fc518f074baa793aa7d9b9aaeffad6a3b6"
[[package]]
name = "windows_i686_msvc"
version = "0.36.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2e7917148b2812d1eeafaeb22a97e4813dfa60a3f8f78ebe204bcc88f12f024"
[[package]]
name = "windows_x86_64_gnu"
version = "0.36.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4dcd171b8776c41b97521e5da127a2d86ad280114807d0b2ab1e462bc764d9e1"
[[package]]
name = "windows_x86_64_msvc"
version = "0.36.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c811ca4a8c853ef420abd8592ba53ddbbac90410fab6903b3e79972a631f7680"
[[package]] [[package]]
name = "zstd" name = "zstd"
version = "0.10.2+zstd.1.5.2" version = "0.12.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f4a6bd64f22b5e3e94b4e238669ff9f10815c27a5180108b849d24174a83847" checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c"
dependencies = [ dependencies = [
"zstd-safe", "zstd-safe",
] ]
[[package]] [[package]]
name = "zstd-safe" name = "zstd-safe"
version = "4.1.6+zstd.1.5.2" version = "6.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94b61c51bb270702d6167b8ce67340d2754b088d0c091b06e593aa772c3ee9bb" checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581"
dependencies = [ dependencies = [
"libc", "libc",
"zstd-sys", "zstd-sys",
@ -560,10 +573,11 @@ dependencies = [
[[package]] [[package]]
name = "zstd-sys" name = "zstd-sys"
version = "1.6.3+zstd.1.5.2" version = "2.0.8+zstd.1.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc49afa5c8d634e75761feda8c592051e7eeb4683ba827211eb0d731d3402ea8" checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c"
dependencies = [ dependencies = [
"cc", "cc",
"libc", "libc",
"pkg-config",
] ]

View File

@ -7,11 +7,14 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies] [dependencies]
zip = "0.6"
encoding = "0.2"
textstream = "0.1"
json = "0.12" json = "0.12"
rayon="1.5" tokio = { version = "1.21.0", features = ["full"] }
async_zip = { git = "https://github.com/Majored/rs-async-zip", rev = "ff0d985", features = [
"zstd",
"tokio",
"tokio-fs",
] }
futures-util = { version = "0.3.28", features = ["io", "tokio-io"] }
[profile.release] [profile.release]
opt-level = 3 opt-level = 3

21
LICENSE Normal file
View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2022 Dmitry Belyaev
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

26
README.md Normal file
View File

@ -0,0 +1,26 @@
# chgk_txt2json
Конвертер **текстовых** файлов с вопросами в **JSON** формат.
Исходные файлы вопросов должны находиться в ZIP файле `baza_utf8.zip`, результат будет записан в файл `json.zip`.
## Особенности
### Кодировка исходных файлов
В оригинальных файлах базы используется кодировка `KOI8-R`.
Но из-за того, что [tokio::AsyncBufReadExt::lines](https://docs.rs/tokio/latest/tokio/io/trait.AsyncBufReadExt.html#method.lines) работает только с `UTF-8`, эта утилита работает с файлами, которые уже находятся в кодировке `UTF-8`.
### Алгоритм сжатия выходного архива
Для сжатия файлов в выходном архиве используется метод `Zstandard`, т.к. он достаточно быстр и по степени сжатия сопоставим с обычным `Deflate`.
Но для просмотра содержимого таких ZIP-файлов нужно использовать ПО с поддержкой `Zstandard`, например:
- [Modern7z](https://www.tc4shell.com/ru/7zip/modern7z)
- [7-Zip-zstd](https://github.com/mcmilk/7-Zip-zstd)
## Ссылки
- Исходная база вопросов: http://db.chgk.info
- Копия файлов базы вопросов: https://gitlab.com/b4tman/db_chgk
- Утилита конвертации кодировки текстовых файлов в ZIP архиве: https://gitea.b4tman.ru/temp/ztb_recode

View File

@ -1,19 +1,20 @@
extern crate encoding; extern crate async_zip;
extern crate json; extern crate json;
extern crate rayon; extern crate tokio;
extern crate textstream;
extern crate zip;
use encoding::all::KOI8_R; use async_zip::tokio::read::seek::ZipFileReader;
use encoding::DecoderTrap; use async_zip::tokio::write::ZipFileWriter;
use rayon::prelude::*; use async_zip::{Compression, ZipEntryBuilder};
use futures_util::io::{AsyncBufReadExt, BufReader};
use futures_util::stream::StreamExt;
use std::path::PathBuf; use std::path::PathBuf;
use std::str::FromStr; use std::str::FromStr;
use std::{fs, io}; use tokio::fs;
use textstream::TextReader; use tokio::sync::mpsc::{self, UnboundedReceiver, UnboundedSender};
const BASE_FILENAME: &str = "baza.zip"; const INPUT_FILENAME: &str = "baza_utf8.zip";
const OUTPUT_PATH: &str = "json"; const OUTPUT_FILENAME: &str = "json.zip";
const OUTPUT_COMPRESSION: Compression = Compression::Zstd;
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy)]
enum KeywordType { enum KeywordType {
@ -25,6 +26,24 @@ enum KeywordType {
CurrentScope, CurrentScope,
} }
impl FromStr for KeywordType {
type Err = ();
fn from_str(pattern: &str) -> Result<Self, Self::Err> {
use KeywordType::*;
Ok(match pattern {
"Мета:" => Ignore,
"Чемпионат:" | "Пакет:" => Global,
"Тур:" => QuestionPre,
"Вопрос " | "Вопрос:" => QuestionStart,
"Ответ:" | "Зачет:" => QuestionContent,
_ => CurrentScope,
// "URL:" | "Ссылка:" | "Дата:" | "Обработан:" | "Автор:" | "Редактор:" | "Копирайт:" | "Инфо:" |
// "Тема:" | "Вид:" | "Тип:" | "Источник:" | "Рейтинг:" | "Комментарий:" | "Комментарии:"
})
}
}
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy)]
enum DataScope { enum DataScope {
Global, Global,
@ -32,7 +51,7 @@ enum DataScope {
QuestionContent, QuestionContent,
} }
struct Context { struct QuestionsParser {
// global output value // global output value
data: json::JsonValue, data: json::JsonValue,
// temp questions array // temp questions array
@ -56,64 +75,9 @@ struct Context {
last_tag: String, last_tag: String,
} }
// check questions before push /// Text questions parser
trait PushIfValid { impl QuestionsParser {
fn is_valid(&self) -> bool; const PATTERNS: &'static [&'static str] = &[
fn push_if_valid(&mut self, value: json::JsonValue);
}
/// Validation-aware pushing for JSON values holding parsed questions.
impl PushIfValid for json::JsonValue {
    /// A value is valid when it has both the "Вопрос" and the "Ответ" keys.
    fn is_valid(&self) -> bool {
        ["Вопрос", "Ответ"].iter().all(|key| self.has_key(key))
    }

    /// Appends `value` only when it is valid; the outcome of `push` is discarded.
    fn push_if_valid(&mut self, value: json::JsonValue) {
        if !value.is_valid() {
            return;
        }
        let _ = self.push(value);
    }
}
impl Context {
    /// Builds an empty parsing context: empty JSON containers, no keyword seen
    /// yet, and the global data scope selected.
    fn new() -> Context {
        Self {
            data: json::JsonValue::new_object(),
            questions: json::JsonValue::new_array(),
            cur_keyword_type: None,
            cur_question: json::JsonValue::new_object(),
            cur_question_pre: json::JsonValue::new_object(),
            cur_tag: String::new(),
            cur_content: Vec::new(),
            cur_scope: DataScope::Global,
            have_new_question: false,
            last_keyword_type: None,
            last_tag: String::new(),
        }
    }
}
impl FromStr for KeywordType {
type Err = ();
fn from_str(pattern: &str) -> Result<Self, Self::Err> {
use KeywordType::*;
Ok(match pattern {
"Мета:" => Ignore,
"Чемпионат:" | "Пакет:" => Global,
"Тур:" => QuestionPre,
"Вопрос " | "Вопрос:" => QuestionStart,
"Ответ:" | "Зачет:" => QuestionContent,
_ => CurrentScope,
// "URL:" | "Ссылка:" | "Дата:" | "Обработан:" | "Автор:" | "Редактор:" | "Копирайт:" | "Инфо:" |
// "Тема:" | "Вид:" | "Тип:" | "Источник:" | "Рейтинг:" | "Комментарий:" | "Комментарии:"
})
}
}
fn parse_file(file: impl io::Read) -> Result<json::JsonValue, Box<dyn std::error::Error>> {
let buf = io::BufReader::new(file);
let reader = TextReader::new(buf, KOI8_R, DecoderTrap::Ignore);
let patterns = vec![
"Чемпионат:", "Чемпионат:",
"Пакет:", "Пакет:",
"URL:", "URL:",
@ -138,188 +102,276 @@ fn parse_file(file: impl io::Read) -> Result<json::JsonValue, Box<dyn std::error
"Комментарий:", "Комментарий:",
"Комментарии:", "Комментарии:",
]; ];
let mut context = Context::new();
let mut ctx = &mut context;
reader /// create new parser
.lines() pub fn new() -> QuestionsParser {
.map(|line| String::from(line.unwrap().trim())) QuestionsParser {
.filter(|line| !line.is_empty()) // ignore empty lines data: json::JsonValue::new_object(),
.for_each(|line| { questions: json::JsonValue::new_array(),
match patterns cur_keyword_type: None,
cur_question: json::JsonValue::new_object(),
cur_question_pre: json::JsonValue::new_object(),
cur_tag: String::new(),
cur_content: Vec::<String>::new(),
cur_scope: DataScope::Global,
have_new_question: false,
last_keyword_type: None,
last_tag: String::new(),
}
}
/// Join the accumulated content lines into one string, separated by "\n".
fn get_current_content(&self) -> String {
self.cur_content.join("\n")
}
/// Drop all accumulated content lines.
fn clear_current_content(&mut self) {
self.cur_content.clear()
}
/// Append one more line to the accumulated content.
fn append_to_current_content(&mut self, line: String) {
self.cur_content.push(line);
}
/// Tell whether the current question has both required keys,
/// "Вопрос" and "Ответ".
fn is_current_question_valid(&self) -> bool {
    ["Вопрос", "Ответ"]
        .iter()
        .all(|key| self.cur_question.has_key(key))
}
/// Move the current question into the parsed array when it is valid.
///
/// The current question is then replaced by a copy of `cur_question_pre`.
/// An invalid question is left untouched and nothing is stored.
fn add_cur_question(&mut self) {
    if !self.is_current_question_valid() {
        return;
    }
    let next_question = self.cur_question_pre.clone();
    let finished = std::mem::replace(&mut self.cur_question, next_question);
    self.questions.push(finished).unwrap()
}
/// Store the accumulated content under the last tag (keyword) in the JSON
/// object that belongs to `scope`, then reset the accumulated content.
fn apply_content_to(&mut self, scope: DataScope) {
    let joined = self.cur_content.join("\n");
    // pick the JSON object that receives the value
    let target = match scope {
        DataScope::Global => &mut self.data,
        DataScope::QuestionPre => &mut self.cur_question_pre,
        DataScope::QuestionContent => &mut self.cur_question,
    };
    target[&self.last_tag] = joined.into();
    self.cur_content.clear();
}
/// Store the accumulated content under the last tag (keyword) in the scope
/// that is currently selected (`cur_scope`).
fn apply_content_to_cur_scope(&mut self) {
self.apply_content_to(self.cur_scope);
}
/// Select `scope` as the current data scope.
fn set_scope(&mut self, scope: DataScope) {
self.cur_scope = scope;
}
/// Select `scope` as the current data scope and store the accumulated
/// content under the last tag (keyword) in that scope.
fn set_scope_and_apply(&mut self, scope: DataScope) {
    self.cur_scope = scope;
    self.apply_content_to(scope);
}
/// Start collecting a new question.
///
/// The question collected so far is stored in the parsed array when one has
/// been started and it has both required keys ("Вопрос" and "Ответ").
/// The current question is then always re-seeded from `cur_question_pre`, so
/// the very first question also receives the pre-question values (such as
/// "id"), and fields of a rejected question do not leak into the next one.
fn start_new_question(&mut self) {
    // decide before the swap: validity is checked on the current question
    let keep_previous = self.have_new_question && self.is_current_question_valid();
    let previous = std::mem::replace(&mut self.cur_question, self.cur_question_pre.clone());
    if keep_previous {
        self.questions
            .push(previous)
            .expect("questions is always a JSON array");
    }
    self.have_new_question = true;
}
/// Flush the accumulated content according to the type of the last keyword.
///
/// Called when a new keyword has been found: the content gathered so far
/// belongs to the previous keyword and goes to the scope that keyword implies.
fn apply_content_for_last_keyword(&mut self) {
    if let Some(kind) = self.last_keyword_type {
        match kind {
            KeywordType::Global => self.set_scope_and_apply(DataScope::Global),
            KeywordType::QuestionPre => self.set_scope_and_apply(DataScope::QuestionPre),
            KeywordType::QuestionStart => {
                self.start_new_question();
                self.set_scope_and_apply(DataScope::QuestionContent);
            }
            KeywordType::QuestionContent => self.apply_content_to(DataScope::QuestionContent),
            KeywordType::CurrentScope => self.apply_content_to_cur_scope(),
            // ignored keywords: nothing is stored
            _ => {}
        }
    }
}
/// set current keyword(tag) and type as last, and set new as current
fn set_new_keyword(&mut self, keyword: &str) {
self.last_keyword_type =
std::mem::replace(&mut self.cur_keyword_type, Some(keyword.parse().unwrap()));
self.last_tag = std::mem::replace(
&mut self.cur_tag,
keyword.trim_end().trim_end_matches(':').to_string(),
);
}
/// Handle a line that matched `keyword`.
///
/// Registers the keyword, remembers the question id when a question starts
/// (the line without trailing whitespace and colons), and flushes the content
/// accumulated for the previous keyword.
fn on_keyword_match(&mut self, line: &str, keyword: &str) {
    self.set_new_keyword(keyword);

    if matches!(self.cur_keyword_type, Some(KeywordType::QuestionStart)) {
        let question_id = line.trim_end().trim_end_matches(':');
        self.cur_question_pre["id"] = question_id.into();
    }

    self.apply_content_for_last_keyword();
}
/// parse next line
pub fn parse_line(&mut self, line: &str) {
match QuestionsParser::PATTERNS
.iter() // find keyword .iter() // find keyword
.find(|&&pattern| line.starts_with(pattern) && line.ends_with(':')) .find(|&&pattern| line.starts_with(pattern) && line.ends_with(':'))
{ {
Some(pattern) => { Some(pattern) => {
use KeywordType::*; self.on_keyword_match(line, pattern);
ctx.last_keyword_type = ctx.cur_keyword_type;
ctx.last_tag = ctx.cur_tag.clone();
ctx.cur_keyword_type = Some(pattern.parse().unwrap());
ctx.cur_tag = pattern.replace(' ', "").replace(':', "");
// remember question id
if let Some(QuestionStart) = ctx.cur_keyword_type {
ctx.cur_question_pre["id"] = line.replace(':', "").as_str().into();
};
// apply accumulated content when new keyword found
match ctx.last_keyword_type {
Some(Global) => {
ctx.cur_scope = DataScope::Global;
ctx.data[&ctx.last_tag] = ctx.cur_content.join("\n").into()
}
Some(QuestionPre) => {
ctx.cur_scope = DataScope::QuestionPre;
ctx.cur_question_pre[&ctx.last_tag] = ctx.cur_content.join("\n").into();
}
Some(QuestionStart) => {
ctx.cur_scope = DataScope::QuestionContent;
// store prev question before reading new
if ctx.have_new_question {
ctx.questions.push_if_valid(ctx.cur_question.clone());
}
// prepare to read new question data with cur_question_pre values
ctx.cur_question = ctx.cur_question_pre.clone();
ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
ctx.have_new_question = true;
}
Some(QuestionContent) => {
ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
}
Some(CurrentScope) => {
// match value to store data
let scope_data = match ctx.cur_scope {
DataScope::Global => &mut ctx.data,
DataScope::QuestionPre => &mut ctx.cur_question_pre,
DataScope::QuestionContent => &mut ctx.cur_question,
};
scope_data[&ctx.last_tag] = ctx.cur_content.join("\n").into();
}
_ => (), //None or Ignore
};
// clear content
ctx.cur_content.clear();
} }
None => { None => {
// accumulate content if line is not a keyword self.append_to_current_content(line.to_string());
ctx.cur_content.push(line);
} }
} }
});
// finish reading last question
if ctx.have_new_question && !ctx.cur_content.is_empty() {
ctx.cur_question[&ctx.cur_tag] = ctx.cur_content.join("\n").into();
ctx.questions.push_if_valid(ctx.cur_question.clone());
ctx.have_new_question = false;
}
ctx.data["Вопросы"] = ctx.questions.clone();
Ok(ctx.data.clone())
}
// Split a vector into a vector of slices; `num` is the requested number of parts.
trait SplitTo<T> {
// Returns sub-slices borrowed from `self`.
fn split_to(&self, num: usize) -> Vec<&[T]>;
}
impl<T> SplitTo<T> for Vec<T> {
// Split into `num` consecutive slices of `len / num` items each; the last
// slice also takes the remainder. With fewer items than `num`, a single
// slice covering the whole vector is returned.
fn split_to(&self, num: usize) -> Vec<&[T]> {
    let chunk = self.len() / num;
    if chunk == 0 {
        return vec![self.as_slice()];
    }

    let tail_extra = self.len() % num;
    (0..num)
        .map(|idx| {
            let begin = chunk * idx;
            let len = if idx + 1 == num {
                chunk + tail_extra
            } else {
                chunk
            };
            &self[begin..begin + len]
        })
        .collect()
}
} }
fn process_files(files: &&[PathBuf]) { /// finish parsing
if files.is_empty() { pub fn finish(&mut self) {
return; if self.have_new_question && !self.cur_content.is_empty() {
self.cur_question[&self.cur_tag] = self.get_current_content().into();
self.add_cur_question();
self.clear_current_content();
self.have_new_question = false;
}
self.data["Вопросы"] = std::mem::replace(&mut self.questions, json::JsonValue::new_array());
} }
let start_file = files[0].to_str().unwrap(); /// get parsed data
println!("-> start from \"{}\" ({} files)", start_file, files.len()); pub fn get_parsed(self) -> json::JsonValue {
self.data
let zip_file = fs::File::open(BASE_FILENAME).unwrap(); }
let zip_reader = io::BufReader::new(zip_file);
let mut archive = zip::ZipArchive::new(zip_reader).unwrap();
files.iter().for_each(|name| {
let name_str = name.to_str().unwrap();
// parse txt file
let file = archive.by_name(name_str).unwrap();
let data = parse_file(file).unwrap();
// make output filename
let mut outfilename = PathBuf::from(OUTPUT_PATH);
outfilename.push(name);
outfilename.set_extension("json");
// save json to file
let mut outfile = fs::File::create(outfilename).unwrap();
data.write_pretty(&mut outfile, 1).unwrap();
});
println!("<- done {} files (from \"{}\")", files.len(), start_file);
} }
fn main() -> Result<(), Box<dyn std::error::Error>> { #[derive(Debug)]
/// Converted file passed from the converter task to the writer task.
struct FileText {
// name of the source file, as reported with the end-of-file message
name: String,
// pretty-printed JSON text produced from that file
text: String,
}
/// Messages sent by the ZIP text reader to the converter.
#[derive(Debug)]
enum TextReaderMessage {
// one line of text read from the current file
NextLine(String),
// the current file has been read completely; carries the file name
EndOfFile(String),
}
/// read txt files from zip and convert to json
async fn zip_text_reader(tx: UnboundedSender<TextReaderMessage>) {
// open archive just to list files // open archive just to list files
let zip_file = fs::File::open(BASE_FILENAME)?; let mut file = fs::File::open(INPUT_FILENAME).await.expect("open zip");
let zip_reader = io::BufReader::new(zip_file); let archive = ZipFileReader::with_tokio(&mut file)
let mut archive = zip::ZipArchive::new(zip_reader)?; .await
.expect("open zip file reader");
let source_files: Vec<PathBuf> = (0..archive.len()) let source_files: Vec<(usize, String)> = archive
.map(|i| archive.by_index(i).unwrap().mangled_name()) .file()
.filter(|name| { .entries()
// skip files without "txt" extension .iter()
match name.extension() { .enumerate()
Some(ext) => match ext.to_str() { .filter(|(_, entry)| !entry.dir().unwrap_or(true))
Some(ext_str) => ext_str.eq_ignore_ascii_case("txt"), .map(|(index, entry)| (index, entry.filename().as_str().unwrap().to_string()))
_ => false, // extension is not valid unicode or not txt .filter(|(_, filename)| filename.ends_with(".txt"))
},
_ => false, // no extension in filename
}
})
.collect(); .collect();
//
drop(archive); drop(archive);
let mut file = fs::File::open(INPUT_FILENAME).await.expect("open zip 2");
let mut archive = ZipFileReader::with_tokio(&mut file)
.await
.expect("open zip file reader 2");
for (index, name) in source_files {
let entry_reader = archive.reader_with_entry(index).await.expect("read entry");
let buf_reader = BufReader::new(entry_reader);
let lines = buf_reader.lines();
tokio::pin!(lines);
while let Some(Ok(line)) = lines.next().await {
tx.send(TextReaderMessage::NextLine(line))
.expect("send line");
}
tx.send(TextReaderMessage::EndOfFile(name))
.expect("send end");
}
// check output directory println!("read done ✅");
let out_dir: PathBuf = OUTPUT_PATH.into(); }
if out_dir.is_file() {
return Err("output directory is file!".into()); /// convert text questions to json format
} else if !out_dir.exists() { async fn questions_converter(
fs::create_dir_all(out_dir)?; mut rx: UnboundedReceiver<TextReaderMessage>,
tx: UnboundedSender<FileText>,
) {
let mut parser = QuestionsParser::new();
while let Some(msg) = rx.recv().await {
match msg {
TextReaderMessage::NextLine(line) => {
let line = line.trim();
if line.is_empty() {
continue;
}
parser.parse_line(line);
}
TextReaderMessage::EndOfFile(name) => {
parser.finish();
let data_json = parser.get_parsed();
let text = data_json.pretty(2);
tx.send(FileText { name, text }).expect("send json");
parser = QuestionsParser::new();
}
}
}
println!("convert done ✅");
}
/// Receive converted files and store each one as an entry of the output ZIP.
///
/// Every received `FileText` is written under its `name` with the extension
/// replaced by `json`, using `OUTPUT_COMPRESSION`. The archive is closed after
/// the receive loop ends. Panics if creating, writing or closing fails.
async fn zip_json_writer(mut rx: UnboundedReceiver<FileText>) {
    let mut out_file = fs::File::create(OUTPUT_FILENAME)
        .await
        .expect("create file");
    let mut zip = ZipFileWriter::with_tokio(&mut out_file);

    while let Some(converted) = rx.recv().await {
        let FileText { name, text } = converted;

        // same name as the source entry, but with a "json" extension
        let entry_path = PathBuf::from(name).with_extension("json");
        let entry_name = entry_path.to_str().unwrap().to_string();
        let entry = ZipEntryBuilder::new(entry_name.into(), OUTPUT_COMPRESSION);

        // write new zip entry
        zip.write_entry_whole(entry, text.as_bytes())
            .await
            .expect("write entry");
    }

    zip.close().await.expect("close writer");
    println!("write done ✅");
}
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
// check output filename
match fs::metadata(OUTPUT_FILENAME).await {
Ok(x) if x.is_dir() => return Err("output file is a directory!".into()),
_ => (),
}; };
println!( let (reader_tx, reader_rx) = mpsc::unbounded_channel::<TextReaderMessage>();
"processing {} files with {} threads...", let (json_tx, json_rx) = mpsc::unbounded_channel::<FileText>();
source_files.len(),
rayon::current_num_threads()
);
// split vector and process its parts in parallel tokio::try_join!(
source_files tokio::spawn(zip_text_reader(reader_tx)),
.split_to(rayon::current_num_threads()) tokio::spawn(questions_converter(reader_rx, json_tx)),
.par_iter() tokio::spawn(zip_json_writer(json_rx))
.for_each(process_files); )?;
println!("done"); println!("all done");
Ok(()) Ok(())
} }