parallel processing using rayon

This commit is contained in:
Dmitry Belyaev 2019-07-26 23:04:41 +03:00
parent 390097b5b8
commit 0240bf2fdf
Signed by: b4tman
GPG Key ID: 41A00BF15EA7E5F3
3 changed files with 217 additions and 20 deletions

143
Cargo.lock generated
View File

@ -5,6 +5,14 @@ name = "adler32"
version = "1.0.3" version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "arrayvec"
version = "0.4.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]] [[package]]
name = "byteorder" name = "byteorder"
version = "1.3.2" version = "1.3.2"
@ -44,6 +52,7 @@ version = "0.1.0"
dependencies = [ dependencies = [
"encoding 0.2.33 (registry+https://github.com/rust-lang/crates.io-index)", "encoding 0.2.33 (registry+https://github.com/rust-lang/crates.io-index)",
"json 0.11.14 (registry+https://github.com/rust-lang/crates.io-index)", "json 0.11.14 (registry+https://github.com/rust-lang/crates.io-index)",
"rayon 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
"textstream 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", "textstream 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
"zip 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)", "zip 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
@ -56,6 +65,50 @@ dependencies = [
"cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)", "cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]]
name = "crossbeam-deque"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"crossbeam-epoch 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)",
"crossbeam-utils 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "crossbeam-epoch"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"arrayvec 0.4.11 (registry+https://github.com/rust-lang/crates.io-index)",
"cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)",
"crossbeam-utils 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"memoffset 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
"scopeguard 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "crossbeam-queue"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"crossbeam-utils 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "crossbeam-utils"
version = "0.6.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "either"
version = "1.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
name = "encoding" name = "encoding"
version = "0.2.33" version = "0.2.33"
@ -118,6 +171,11 @@ name = "json"
version = "0.11.14" version = "0.11.14"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "lazy_static"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
name = "libc" name = "libc"
version = "0.2.60" version = "0.2.60"
@ -143,11 +201,54 @@ dependencies = [
"libc 0.2.60 (registry+https://github.com/rust-lang/crates.io-index)", "libc 0.2.60 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]]
name = "memoffset"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "nodrop"
version = "0.1.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "num_cpus"
version = "1.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"libc 0.2.60 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]] [[package]]
name = "podio" name = "podio"
version = "0.1.6" version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "rayon"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"crossbeam-deque 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)",
"either 1.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
"rayon-core 1.5.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "rayon-core"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"crossbeam-deque 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)",
"crossbeam-queue 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
"crossbeam-utils 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"num_cpus 1.10.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]] [[package]]
name = "redox_syscall" name = "redox_syscall"
version = "0.1.56" version = "0.1.56"
@ -158,6 +259,32 @@ name = "rle-decode-fast"
version = "1.0.1" version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "rustc_version"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "scopeguard"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "semver"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "semver-parser"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
name = "take_mut" name = "take_mut"
version = "0.2.2" version = "0.2.2"
@ -215,12 +342,18 @@ dependencies = [
[metadata] [metadata]
"checksum adler32 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "7e522997b529f05601e05166c07ed17789691f562762c7f3b987263d2dedee5c" "checksum adler32 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "7e522997b529f05601e05166c07ed17789691f562762c7f3b987263d2dedee5c"
"checksum arrayvec 0.4.11 (registry+https://github.com/rust-lang/crates.io-index)" = "b8d73f9beda665eaa98ab9e4f7442bd4e7de6652587de55b2525e52e29c1b0ba"
"checksum byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a7c3dd8985a7111efc5c80b44e23ecdd8c007de8ade3b96595387e812b957cf5" "checksum byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a7c3dd8985a7111efc5c80b44e23ecdd8c007de8ade3b96595387e812b957cf5"
"checksum bzip2 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "42b7c3cbf0fa9c1b82308d57191728ca0256cb821220f4e2fd410a72ade26e3b" "checksum bzip2 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "42b7c3cbf0fa9c1b82308d57191728ca0256cb821220f4e2fd410a72ade26e3b"
"checksum bzip2-sys 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "6584aa36f5ad4c9247f5323b0a42f37802b37a836f0ad87084d7a33961abe25f" "checksum bzip2-sys 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "6584aa36f5ad4c9247f5323b0a42f37802b37a836f0ad87084d7a33961abe25f"
"checksum cc 1.0.38 (registry+https://github.com/rust-lang/crates.io-index)" = "ce400c638d48ee0e9ab75aef7997609ec57367ccfe1463f21bf53c3eca67bf46" "checksum cc 1.0.38 (registry+https://github.com/rust-lang/crates.io-index)" = "ce400c638d48ee0e9ab75aef7997609ec57367ccfe1463f21bf53c3eca67bf46"
"checksum cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "b486ce3ccf7ffd79fdeb678eac06a9e6c09fc88d33836340becb8fffe87c5e33" "checksum cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "b486ce3ccf7ffd79fdeb678eac06a9e6c09fc88d33836340becb8fffe87c5e33"
"checksum crc32fast 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ba125de2af0df55319f41944744ad91c71113bf74a4646efff39afe1f6842db1" "checksum crc32fast 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ba125de2af0df55319f41944744ad91c71113bf74a4646efff39afe1f6842db1"
"checksum crossbeam-deque 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)" = "05e44b8cf3e1a625844d1750e1f7820da46044ff6d28f4d43e455ba3e5bb2c13"
"checksum crossbeam-epoch 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)" = "fedcd6772e37f3da2a9af9bf12ebe046c0dfe657992377b4df982a2b54cd37a9"
"checksum crossbeam-queue 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7c979cd6cfe72335896575c6b5688da489e420d36a27a0b9eb0c73db574b4a4b"
"checksum crossbeam-utils 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)" = "04973fa96e96579258a5091af6003abde64af786b860f18622b82e026cca60e6"
"checksum either 1.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "5527cfe0d098f36e3f8839852688e63c8fff1c90b2b405aef730615f9a7bcf7b"
"checksum encoding 0.2.33 (registry+https://github.com/rust-lang/crates.io-index)" = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" "checksum encoding 0.2.33 (registry+https://github.com/rust-lang/crates.io-index)" = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec"
"checksum encoding-index-japanese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" "checksum encoding-index-japanese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91"
"checksum encoding-index-korean 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" "checksum encoding-index-korean 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81"
@ -229,12 +362,22 @@ dependencies = [
"checksum encoding-index-tradchinese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" "checksum encoding-index-tradchinese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18"
"checksum encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" "checksum encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569"
"checksum json 0.11.14 (registry+https://github.com/rust-lang/crates.io-index)" = "01d7903059b22f1f09ced2fb9562507e3556a953caa2f835c64ab022bb6148c2" "checksum json 0.11.14 (registry+https://github.com/rust-lang/crates.io-index)" = "01d7903059b22f1f09ced2fb9562507e3556a953caa2f835c64ab022bb6148c2"
"checksum lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bc5729f27f159ddd61f4df6228e827e86643d4d3e7c32183cb30a1c08f604a14"
"checksum libc 0.2.60 (registry+https://github.com/rust-lang/crates.io-index)" = "d44e80633f007889c7eff624b709ab43c92d708caad982295768a7b13ca3b5eb" "checksum libc 0.2.60 (registry+https://github.com/rust-lang/crates.io-index)" = "d44e80633f007889c7eff624b709ab43c92d708caad982295768a7b13ca3b5eb"
"checksum libflate 0.1.25 (registry+https://github.com/rust-lang/crates.io-index)" = "90c6f86f4b0caa347206f916f8b687b51d77c6ef8ff18d52dd007491fd580529" "checksum libflate 0.1.25 (registry+https://github.com/rust-lang/crates.io-index)" = "90c6f86f4b0caa347206f916f8b687b51d77c6ef8ff18d52dd007491fd580529"
"checksum memchr 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "148fab2e51b4f1cfc66da2a7c32981d1d3c083a803978268bb11fe4b86925e7a" "checksum memchr 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "148fab2e51b4f1cfc66da2a7c32981d1d3c083a803978268bb11fe4b86925e7a"
"checksum memoffset 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "ce6075db033bbbb7ee5a0bbd3a3186bbae616f57fb001c485c7ff77955f8177f"
"checksum nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "2f9667ddcc6cc8a43afc9b7917599d7216aa09c463919ea32c59ed6cac8bc945"
"checksum num_cpus 1.10.1 (registry+https://github.com/rust-lang/crates.io-index)" = "bcef43580c035376c0705c42792c294b66974abbfd2789b511784023f71f3273"
"checksum podio 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "780fb4b6698bbf9cf2444ea5d22411cef2953f0824b98f33cf454ec5615645bd" "checksum podio 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "780fb4b6698bbf9cf2444ea5d22411cef2953f0824b98f33cf454ec5615645bd"
"checksum rayon 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a4b0186e22767d5b9738a05eab7c6ac90b15db17e5b5f9bd87976dd7d89a10a4"
"checksum rayon-core 1.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ebbe0df8435ac0c397d467b6cad6d25543d06e8a019ef3f6af3c384597515bd2"
"checksum redox_syscall 0.1.56 (registry+https://github.com/rust-lang/crates.io-index)" = "2439c63f3f6139d1b57529d16bc3b8bb855230c8efcc5d3a896c8bea7c3b1e84" "checksum redox_syscall 0.1.56 (registry+https://github.com/rust-lang/crates.io-index)" = "2439c63f3f6139d1b57529d16bc3b8bb855230c8efcc5d3a896c8bea7c3b1e84"
"checksum rle-decode-fast 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cabe4fa914dec5870285fa7f71f602645da47c486e68486d2b4ceb4a343e90ac" "checksum rle-decode-fast 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cabe4fa914dec5870285fa7f71f602645da47c486e68486d2b4ceb4a343e90ac"
"checksum rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a"
"checksum scopeguard 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b42e15e59b18a828bbf5c58ea01debb36b9b096346de35d941dcb89009f24a0d"
"checksum semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403"
"checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
"checksum take_mut 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f764005d11ee5f36500a149ace24e00e3da98b0158b3e2d53a7495660d3f4d60" "checksum take_mut 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f764005d11ee5f36500a149ace24e00e3da98b0158b3e2d53a7495660d3f4d60"
"checksum textstream 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "e7ed81b342f6566026755e7f4b7798810b1c159722e427d212ce72c2c58ffdaa" "checksum textstream 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "e7ed81b342f6566026755e7f4b7798810b1c159722e427d212ce72c2c58ffdaa"
"checksum time 0.1.42 (registry+https://github.com/rust-lang/crates.io-index)" = "db8dcfca086c1143c9270ac42a2bbd8a7ee477b78ac8e45b19abfb0cbede4b6f" "checksum time 0.1.42 (registry+https://github.com/rust-lang/crates.io-index)" = "db8dcfca086c1143c9270ac42a2bbd8a7ee477b78ac8e45b19abfb0cbede4b6f"

View File

@ -11,3 +11,4 @@ zip = "0.5"
encoding = "0.2" encoding = "0.2"
textstream = "0.1" textstream = "0.1"
json="0.11" json="0.11"
rayon="1.1"

View File

@ -1,15 +1,19 @@
extern crate encoding; extern crate encoding;
extern crate json; extern crate json;
extern crate rayon;
extern crate textstream; extern crate textstream;
extern crate zip; extern crate zip;
use encoding::all::KOI8_R; use encoding::all::KOI8_R;
use encoding::DecoderTrap; use encoding::DecoderTrap;
use rayon::prelude::*;
use std::fs; use std::fs;
use std::io; use std::io;
use std::path::PathBuf; use std::path::PathBuf;
use textstream::TextReader; use textstream::TextReader;
const BASE_FILENAME: &str = "./baza.zip";
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy)]
enum KeywordType { enum KeywordType {
Ignore, Ignore,
@ -192,33 +196,82 @@ fn parse_file(file: impl io::Read) -> Result<json::JsonValue, Box<std::error::Er
Ok(ctx.data.clone()) Ok(ctx.data.clone())
} }
fn main() -> Result<(), Box<std::error::Error>> { // split vector to a vector of slices
let fname = std::path::Path::new("./baza.zip"); #[allow(clippy::ptr_arg)]
let zip_file = fs::File::open(&fname)?; fn split_vec<'a, T>(src: &'a Vec<T>, num: usize) -> Vec<&'a [T]> {
let all_len = src.len();
let part_len = all_len / num;
let add_len = all_len % num;
let mut result = Vec::<&'a [T]>::new();
if 1 > part_len {
result.push(src.as_slice());
return result;
}
for i in 0..num {
let size = if (num - 1) == i {
part_len + add_len
} else {
part_len
};
let start = part_len * i;
result.push(&src[start..(start + size)]);
}
result
}
fn process_files(files: Vec<PathBuf>) {
let zip_file = fs::File::open(BASE_FILENAME).unwrap();
let zip_reader = io::BufReader::new(zip_file); let zip_reader = io::BufReader::new(zip_file);
let mut archive = zip::ZipArchive::new(zip_reader).unwrap();
let mut archive = zip::ZipArchive::new(zip_reader)?; files.iter().for_each(|name| {
let name_str = name.to_str().unwrap();
println!("{:}", name_str);
for i in 0..archive.len() { // parse txt file
let file = archive.by_index(i)?; let file = archive.by_name(name_str).unwrap();
let name = file.sanitized_name(); let data = parse_file(file).unwrap();
// skip files without "txt" extension
match name.extension() { // make output filename
Some(ext) => match ext.to_str() {
Some(ext_str) if ext_str.eq_ignore_ascii_case("txt") => (),
_ => continue, // extension is not valid unicode or not txt
},
_ => continue, // no extension in filename
}
println!("{}", name.as_path().display());
let data: json::JsonValue = parse_file(file)?;
let mut outfilename = PathBuf::from("./json"); let mut outfilename = PathBuf::from("./json");
outfilename.push(name); outfilename.push(name);
outfilename.set_extension("json"); outfilename.set_extension("json");
let mut outfile = fs::File::create(outfilename)?; // save json to file
let mut outfile = fs::File::create(outfilename).unwrap();
data.write_pretty(&mut outfile, 1).unwrap();
});
}
fn main() -> Result<(), Box<std::error::Error>> {
// open archive just to list files
let zip_file = fs::File::open(BASE_FILENAME)?;
let zip_reader = io::BufReader::new(zip_file);
let mut archive = zip::ZipArchive::new(zip_reader)?;
let source_files: Vec<PathBuf> = (0..archive.len())
.map(|i| archive.by_index(i).unwrap().sanitized_name())
.filter(|name| {
// skip files without "txt" extension
match name.extension() {
Some(ext) => match ext.to_str() {
Some(ext_str) if ext_str.eq_ignore_ascii_case("txt") => true,
_ => false, // extension is not valid unicode or not txt
},
_ => false, // no extension in filename
}
})
.collect();
drop(archive);
//split vector and iterate on it parts in parallel
split_vec(&source_files, rayon::current_num_threads())
.par_iter()
.for_each(|slice| {
let source_files_part = slice.to_vec();
process_files(source_files_part);
});
data.write_pretty(&mut outfile, 1)?;
}
Ok(()) Ok(())
} }