From d44ccdffa3d85af01e758f08b07e437c6d6f83f4 Mon Sep 17 00:00:00 2001 From: Dmitry Date: Mon, 12 Sep 2022 20:50:31 +0300 Subject: [PATCH] initial commit --- .gitignore | 3 + Cargo.lock | 456 ++++++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 16 ++ src/main.rs | 129 +++++++++++++++ 4 files changed, 604 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 src/main.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..48a0746 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +/target +baza*.zip + diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..a372df3 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,456 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "aes" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e8b47f52ea9bae42228d07ec09eb676433d7c4ed1ebdf0f1d1c29ed446f1ab8" +dependencies = [ + "cfg-if", + "cipher", + "cpufeatures", + "opaque-debug", +] + +[[package]] +name = "base64ct" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a32fd6af2b5827bce66c29053ba0e7c42b9dcab01835835058558c10851a46b" + +[[package]] +name = "block-buffer" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cce20737498f97b993470a6e536b8523f0af7892a4f928cceb1ac5e52ebe7e" +dependencies = [ + "generic-array", +] + +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" + +[[package]] +name = "bzip2" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6afcd980b5f3a45017c57e57a2fcccbb351cc43a356ce117ef760ef8052b89b0" +dependencies = [ + "bzip2-sys", + "libc", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.11+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + +[[package]] +name = "cc" +version = "1.0.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" +dependencies = [ + "jobserver", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "cipher" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ee52072ec15386f770805afd189a01c8841be8696bed250fa2f13c4c0d6dfb7" +dependencies = [ + "generic-array", +] + +[[package]] +name = "constant_time_eq" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" + +[[package]] +name = "cpufeatures" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d997bd5e24a5928dd43e46dc529867e207907fe0b239c3477d924f7f2ca320" +dependencies = [ + "libc", +] + +[[package]] +name = "crc32fast" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51887d4adc7b564537b15adcfb307936f8075dfcd5f00dde9a9f1d29383682bc" +dependencies = [ + "cfg-if", + "once_cell", +] + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "digest" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2fb860ca6fafa5552fb6d0e816a69c8e49f0908bf524e30a90d97c85892d506" +dependencies = [ + "block-buffer", + "crypto-common", + "subtle", +] + +[[package]] +name = "encoding" +version = "0.2.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" +dependencies = [ + "encoding-index-japanese", + "encoding-index-korean", + "encoding-index-simpchinese", + "encoding-index-singlebyte", + "encoding-index-tradchinese", +] + +[[package]] +name = "encoding-index-japanese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-korean" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-simpchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-singlebyte" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-tradchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding_index_tests" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" + +[[package]] +name = "flate2" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f82b0f4c27ad9f8bfd1f3208d882da2b09c301bc1c828fd3a00d0216d2fbbff6" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "generic-array" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bff49e947297f3312447abdca79f45f4738097cc82b06e72054d2223f601f1b9" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + +[[package]] +name = "itoa" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c8af84674fe1f223a982c933a0ee1086ac4d4052aa0fb8060c12c6ad838e754" + +[[package]] +name = "jobserver" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af25a77299a7f711a01975c35a6a424eb6862092cc2d6c72c4ed6cbc56dfc1fa" +dependencies = [ + "libc", +] + +[[package]] +name = "libc" +version = "0.2.132" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8371e4e5341c3a96db127eb2465ac681ced4c433e01dd0e938adbef26ba93ba5" + +[[package]] +name = "memchr" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "148fab2e51b4f1cfc66da2a7c32981d1d3c083a803978268bb11fe4b86925e7a" +dependencies = [ + "libc", +] + +[[package]] +name = "miniz_oxide" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96590ba8f175222643a85693f33d26e9c8a015f599c216509b1a6894af675d34" +dependencies = [ + "adler", +] + +[[package]] +name = "num_threads" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44" +dependencies = [ + "libc", +] + +[[package]] +name = "once_cell" +version = "1.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f7254b99e31cad77da24b08ebf628882739a608578bb1bcdfc1f9c21260d7c0" + +[[package]] +name = "opaque-debug" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" + +[[package]] +name = "password-hash" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d791538a6dcc1e7cb7fe6f6b58aca40e7f79403c45b2bc274008b5e647af1d8" +dependencies = [ + "base64ct", + "rand_core", + "subtle", +] + +[[package]] +name = "pbkdf2" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271779f35b581956db91a3e55737327a03aa051e90b1c47aeb189508533adfd7" +dependencies = [ + "digest", + "hmac", + "password-hash", + "sha2", +] + +[[package]] +name = "pkg-config" +version = "0.3.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1df8c4ec4b0627e53bdf214615ad287367e482558cf84b109250b37464dc03ae" + +[[package]] +name = "rand_core" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" + +[[package]] +name = "sha1" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "006769ba83e921b3085caa8334186b00cf92b4cb1a6cf4632fbccc8eff5c7549" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha2" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf9db03534dff993187064c4e0c05a5708d2a9728ace9a8959b77bedf415dac5" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "subtle" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" + +[[package]] +name = "textstream" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7ed81b342f6566026755e7f4b7798810b1c159722e427d212ce72c2c58ffdaa" +dependencies = [ + "encoding", + "memchr", +] + +[[package]] +name = "time" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c3f9a28b618c3a6b9251b6908e9c99e04b9e5c02e6581ccbb67d59c34ef7f9b" +dependencies = [ + "itoa", + "libc", + "num_threads", + "time-macros", +] + +[[package]] +name = "time-macros" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42657b1a6f4d817cda8e7a0ace261fe0cc946cf3a80314390b22cc61ae080792" + +[[package]] +name = "typenum" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "zip" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf225bcf73bb52cbb496e70475c7bd7a3f769df699c0020f6c7bd9a96dcf0b8d" +dependencies = [ + "aes", + "byteorder", + "bzip2", + "constant_time_eq", + "crc32fast", + "crossbeam-utils", + "flate2", + "hmac", + "pbkdf2", + "sha1", + "time", + "zstd", +] + +[[package]] +name = "zstd" +version = "0.10.2+zstd.1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f4a6bd64f22b5e3e94b4e238669ff9f10815c27a5180108b849d24174a83847" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "4.1.6+zstd.1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94b61c51bb270702d6167b8ce67340d2754b088d0c091b06e593aa772c3ee9bb" +dependencies = [ + "libc", + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "1.6.3+zstd.1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc49afa5c8d634e75761feda8c592051e7eeb4683ba827211eb0d731d3402ea8" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "ztb_recode" +version = "0.1.0" +dependencies = [ + "encoding", + "textstream", + "zip", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..d511ae2 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "ztb_recode" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +encoding = "0.2.33" +textstream = "0.1.1" +zip = "0.6.2" + +[profile.release] +opt-level = 3 +debug = false +lto = true diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..8207136 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,129 @@ +extern crate encoding; +extern crate textstream; +extern crate zip; + +use encoding::all::{KOI8_R, UTF_8}; +use encoding::Encoding; +use encoding::{DecoderTrap, EncoderTrap}; +use std::io::{Read, Seek, Write}; +use std::path::PathBuf; +use std::{fs, io}; +use textstream::TextReader; +use zip::write::FileOptions; +use zip::ZipWriter; + +const INPUT_FILENAME: &str = "baza.zip"; +const OUTPUT_FILENAME: &str = "baza_utf8.zip"; + +struct OutFileDescr<'a> { + name: String, + options: &'a FileOptions, + encoding: &'a dyn Encoding, + trap: EncoderTrap, +} + +impl<'a> OutFileDescr<'a> { + fn new( + name: String, + options: &'a FileOptions, + enc: &'a dyn Encoding, + trap: EncoderTrap, + ) -> OutFileDescr<'a> { + OutFileDescr { + name, + options, + encoding: enc, + trap, + } + } +} + +fn read_file(file: impl Read, enc: &dyn Encoding) -> Result> { + let buf = io::BufReader::new(file); + let mut reader = TextReader::new(buf, enc, DecoderTrap::Ignore); + let mut result = String::new(); + reader.read_to_end(&mut result).or(Err("decode error"))?; + + Ok(result) +} + +fn write_file( + arc: &mut ZipWriter, + f: OutFileDescr, + data: String, +) -> Result<(), Box> { + arc.start_file(f.name, *f.options)?; + let data = f.encoding.encode(data.as_str(), f.trap)?; + arc.write_all(&data)?; + Ok(()) +} + +fn process_files(files: &[PathBuf]) { + if files.is_empty() { + return; + } + + let enc_input = KOI8_R; + let enc_output = UTF_8; + let options = zip::write::FileOptions::default() + .compression_method(zip::CompressionMethod::Deflated) + .compression_level(Some(9)); + + let zip_file = fs::File::open(INPUT_FILENAME).unwrap(); + let zip_reader = io::BufReader::new(zip_file); + let mut archive = zip::ZipArchive::new(zip_reader).unwrap(); + + let mut outfile = fs::File::create(OUTPUT_FILENAME).unwrap(); + let mut zip_writer = ZipWriter::new(&mut outfile); + + files.iter().for_each(|name| { + let name_str = name.to_str().unwrap(); + + // read string from file in input zip + let file = archive.by_name(name_str).unwrap(); + let data = read_file(file, enc_input).unwrap(); + + // write string to file in output zip + let out_file = OutFileDescr::new( + name_str.to_string(), + &options, + enc_output, + EncoderTrap::Ignore, + ); + write_file(&mut zip_writer, out_file, data).unwrap(); + }); + + zip_writer.finish().unwrap(); +} + +fn main() -> Result<(), Box> { + // open archive just to list files + let zip_file = fs::File::open(INPUT_FILENAME)?; + let zip_reader = io::BufReader::new(zip_file); + let mut archive = zip::ZipArchive::new(zip_reader)?; + + let mut source_files: Vec = (0..archive.len()) + .map(|i| archive.by_index(i).unwrap().mangled_name()) + .filter(|name| { + // skip files without "txt" extension + match name.extension() { + Some(ext) => match ext.to_str() { + Some(ext_str) => ext_str.eq_ignore_ascii_case("txt"), + _ => false, // extension is not valid unicode or not txt + }, + _ => false, // no extension in filename + } + }) + .collect(); + drop(archive); + + println!("processing {} files...", source_files.len()); + + source_files.sort(); + let source_files = source_files; + + process_files(&source_files); + + println!("done"); + Ok(()) +}