diff --git a/Cargo.lock b/Cargo.lock index 11960e3..f0a4b71 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5,6 +5,14 @@ name = "adler32" version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "arrayvec" +version = "0.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "byteorder" version = "1.3.2" @@ -44,6 +52,7 @@ version = "0.1.0" dependencies = [ "encoding 0.2.33 (registry+https://github.com/rust-lang/crates.io-index)", "json 0.11.14 (registry+https://github.com/rust-lang/crates.io-index)", + "rayon 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "textstream 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", "zip 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -56,6 +65,50 @@ dependencies = [ "cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "crossbeam-deque" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "crossbeam-epoch 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "arrayvec 0.4.11 (registry+https://github.com/rust-lang/crates.io-index)", + "cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", + "memoffset 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)", + "scopeguard 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "crossbeam-queue" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "crossbeam-utils 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "crossbeam-utils" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "either" +version = "1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "encoding" version = "0.2.33" @@ -118,6 +171,11 @@ name = "json" version = "0.11.14" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "lazy_static" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "libc" version = "0.2.60" @@ -143,11 +201,54 @@ dependencies = [ "libc 0.2.60 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "memoffset" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "nodrop" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "num_cpus" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.60 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "podio" version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "rayon" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "crossbeam-deque 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", + "either 1.5.2 (registry+https://github.com/rust-lang/crates.io-index)", + "rayon-core 1.5.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rayon-core" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "crossbeam-deque 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-queue 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", + "num_cpus 1.10.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "redox_syscall" version = "0.1.56" @@ -158,6 +259,32 @@ name = "rle-decode-fast" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "rustc_version" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "scopeguard" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "semver" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "semver-parser" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "take_mut" version = "0.2.2" @@ -215,12 +342,18 @@ dependencies = [ [metadata] "checksum adler32 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "7e522997b529f05601e05166c07ed17789691f562762c7f3b987263d2dedee5c" +"checksum arrayvec 0.4.11 (registry+https://github.com/rust-lang/crates.io-index)" = "b8d73f9beda665eaa98ab9e4f7442bd4e7de6652587de55b2525e52e29c1b0ba" "checksum byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a7c3dd8985a7111efc5c80b44e23ecdd8c007de8ade3b96595387e812b957cf5" "checksum bzip2 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "42b7c3cbf0fa9c1b82308d57191728ca0256cb821220f4e2fd410a72ade26e3b" "checksum bzip2-sys 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "6584aa36f5ad4c9247f5323b0a42f37802b37a836f0ad87084d7a33961abe25f" "checksum cc 1.0.38 (registry+https://github.com/rust-lang/crates.io-index)" = "ce400c638d48ee0e9ab75aef7997609ec57367ccfe1463f21bf53c3eca67bf46" "checksum cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "b486ce3ccf7ffd79fdeb678eac06a9e6c09fc88d33836340becb8fffe87c5e33" "checksum crc32fast 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ba125de2af0df55319f41944744ad91c71113bf74a4646efff39afe1f6842db1" +"checksum crossbeam-deque 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)" = "05e44b8cf3e1a625844d1750e1f7820da46044ff6d28f4d43e455ba3e5bb2c13" +"checksum crossbeam-epoch 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)" = "fedcd6772e37f3da2a9af9bf12ebe046c0dfe657992377b4df982a2b54cd37a9" +"checksum crossbeam-queue 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7c979cd6cfe72335896575c6b5688da489e420d36a27a0b9eb0c73db574b4a4b" +"checksum crossbeam-utils 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)" = "04973fa96e96579258a5091af6003abde64af786b860f18622b82e026cca60e6" +"checksum either 1.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "5527cfe0d098f36e3f8839852688e63c8fff1c90b2b405aef730615f9a7bcf7b" "checksum encoding 0.2.33 (registry+https://github.com/rust-lang/crates.io-index)" = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" "checksum encoding-index-japanese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" "checksum encoding-index-korean 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" @@ -229,12 +362,22 @@ dependencies = [ "checksum encoding-index-tradchinese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" "checksum encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" "checksum json 0.11.14 (registry+https://github.com/rust-lang/crates.io-index)" = "01d7903059b22f1f09ced2fb9562507e3556a953caa2f835c64ab022bb6148c2" +"checksum lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bc5729f27f159ddd61f4df6228e827e86643d4d3e7c32183cb30a1c08f604a14" "checksum libc 0.2.60 (registry+https://github.com/rust-lang/crates.io-index)" = "d44e80633f007889c7eff624b709ab43c92d708caad982295768a7b13ca3b5eb" "checksum libflate 0.1.25 (registry+https://github.com/rust-lang/crates.io-index)" = "90c6f86f4b0caa347206f916f8b687b51d77c6ef8ff18d52dd007491fd580529" "checksum memchr 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "148fab2e51b4f1cfc66da2a7c32981d1d3c083a803978268bb11fe4b86925e7a" +"checksum memoffset 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "ce6075db033bbbb7ee5a0bbd3a3186bbae616f57fb001c485c7ff77955f8177f" +"checksum nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "2f9667ddcc6cc8a43afc9b7917599d7216aa09c463919ea32c59ed6cac8bc945" +"checksum num_cpus 1.10.1 (registry+https://github.com/rust-lang/crates.io-index)" = "bcef43580c035376c0705c42792c294b66974abbfd2789b511784023f71f3273" "checksum podio 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "780fb4b6698bbf9cf2444ea5d22411cef2953f0824b98f33cf454ec5615645bd" +"checksum rayon 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a4b0186e22767d5b9738a05eab7c6ac90b15db17e5b5f9bd87976dd7d89a10a4" +"checksum rayon-core 1.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ebbe0df8435ac0c397d467b6cad6d25543d06e8a019ef3f6af3c384597515bd2" "checksum redox_syscall 0.1.56 (registry+https://github.com/rust-lang/crates.io-index)" = "2439c63f3f6139d1b57529d16bc3b8bb855230c8efcc5d3a896c8bea7c3b1e84" "checksum rle-decode-fast 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cabe4fa914dec5870285fa7f71f602645da47c486e68486d2b4ceb4a343e90ac" +"checksum rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" +"checksum scopeguard 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b42e15e59b18a828bbf5c58ea01debb36b9b096346de35d941dcb89009f24a0d" +"checksum semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" +"checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" "checksum take_mut 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f764005d11ee5f36500a149ace24e00e3da98b0158b3e2d53a7495660d3f4d60" "checksum textstream 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "e7ed81b342f6566026755e7f4b7798810b1c159722e427d212ce72c2c58ffdaa" "checksum time 0.1.42 (registry+https://github.com/rust-lang/crates.io-index)" = "db8dcfca086c1143c9270ac42a2bbd8a7ee477b78ac8e45b19abfb0cbede4b6f" diff --git a/Cargo.toml b/Cargo.toml index 2e42127..5b8221b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,3 +11,4 @@ zip = "0.5" encoding = "0.2" textstream = "0.1" json="0.11" +rayon="1.1" diff --git a/src/main.rs b/src/main.rs index 50d60d7..0034cbd 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,15 +1,19 @@ extern crate encoding; extern crate json; +extern crate rayon; extern crate textstream; extern crate zip; use encoding::all::KOI8_R; use encoding::DecoderTrap; +use rayon::prelude::*; use std::fs; use std::io; use std::path::PathBuf; use textstream::TextReader; +const BASE_FILENAME: &str = "./baza.zip"; + #[derive(Debug, Clone, Copy)] enum KeywordType { Ignore, @@ -192,33 +196,82 @@ fn parse_file(file: impl io::Read) -> Result Result<(), Box> { - let fname = std::path::Path::new("./baza.zip"); - let zip_file = fs::File::open(&fname)?; +// split vector to a vector of slices +#[allow(clippy::ptr_arg)] +fn split_vec<'a, T>(src: &'a Vec, num: usize) -> Vec<&'a [T]> { + let all_len = src.len(); + let part_len = all_len / num; + let add_len = all_len % num; + let mut result = Vec::<&'a [T]>::new(); + + if 1 > part_len { + result.push(src.as_slice()); + return result; + } + for i in 0..num { + let size = if (num - 1) == i { + part_len + add_len + } else { + part_len + }; + let start = part_len * i; + result.push(&src[start..(start + size)]); + } + result +} + +fn process_files(files: Vec) { + let zip_file = fs::File::open(BASE_FILENAME).unwrap(); let zip_reader = io::BufReader::new(zip_file); + let mut archive = zip::ZipArchive::new(zip_reader).unwrap(); - let mut archive = zip::ZipArchive::new(zip_reader)?; + files.iter().for_each(|name| { + let name_str = name.to_str().unwrap(); + println!("{:}", name_str); - for i in 0..archive.len() { - let file = archive.by_index(i)?; - let name = file.sanitized_name(); - // skip files without "txt" extension - match name.extension() { - Some(ext) => match ext.to_str() { - Some(ext_str) if ext_str.eq_ignore_ascii_case("txt") => (), - _ => continue, // extension is not valid unicode or not txt - }, - _ => continue, // no extension in filename - } - println!("{}", name.as_path().display()); - let data: json::JsonValue = parse_file(file)?; + // parse txt file + let file = archive.by_name(name_str).unwrap(); + let data = parse_file(file).unwrap(); + + // make output filename let mut outfilename = PathBuf::from("./json"); outfilename.push(name); outfilename.set_extension("json"); - let mut outfile = fs::File::create(outfilename)?; + // save json to file + let mut outfile = fs::File::create(outfilename).unwrap(); + data.write_pretty(&mut outfile, 1).unwrap(); + }); +} + +fn main() -> Result<(), Box> { + // open archive just to list files + let zip_file = fs::File::open(BASE_FILENAME)?; + let zip_reader = io::BufReader::new(zip_file); + let mut archive = zip::ZipArchive::new(zip_reader)?; + + let source_files: Vec = (0..archive.len()) + .map(|i| archive.by_index(i).unwrap().sanitized_name()) + .filter(|name| { + // skip files without "txt" extension + match name.extension() { + Some(ext) => match ext.to_str() { + Some(ext_str) if ext_str.eq_ignore_ascii_case("txt") => true, + _ => false, // extension is not valid unicode or not txt + }, + _ => false, // no extension in filename + } + }) + .collect(); + drop(archive); + + //split vector and iterate on it parts in parallel + split_vec(&source_files, rayon::current_num_threads()) + .par_iter() + .for_each(|slice| { + let source_files_part = slice.to_vec(); + process_files(source_files_part); + }); - data.write_pretty(&mut outfile, 1)?; - } Ok(()) }