parallel processing using rayon
This commit is contained in:
parent
390097b5b8
commit
0240bf2fdf
143
Cargo.lock
generated
143
Cargo.lock
generated
@ -5,6 +5,14 @@ name = "adler32"
|
||||
version = "1.0.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "arrayvec"
|
||||
version = "0.4.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
version = "1.3.2"
|
||||
@ -44,6 +52,7 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"encoding 0.2.33 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"json 0.11.14 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"rayon 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"textstream 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"zip 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
@ -56,6 +65,50 @@ dependencies = [
|
||||
"cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-deque"
|
||||
version = "0.6.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"crossbeam-epoch 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"crossbeam-utils 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-epoch"
|
||||
version = "0.7.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"arrayvec 0.4.11 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"crossbeam-utils 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"memoffset 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"scopeguard 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-queue"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"crossbeam-utils 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-utils"
|
||||
version = "0.6.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "either"
|
||||
version = "1.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "encoding"
|
||||
version = "0.2.33"
|
||||
@ -118,6 +171,11 @@ name = "json"
|
||||
version = "0.11.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "lazy_static"
|
||||
version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.60"
|
||||
@ -143,11 +201,54 @@ dependencies = [
|
||||
"libc 0.2.60 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memoffset"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nodrop"
|
||||
version = "0.1.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "num_cpus"
|
||||
version = "1.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"libc 0.2.60 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "podio"
|
||||
version = "0.1.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "rayon"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"crossbeam-deque 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"either 1.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"rayon-core 1.5.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon-core"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"crossbeam-deque 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"crossbeam-queue 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"crossbeam-utils 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"num_cpus 1.10.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redox_syscall"
|
||||
version = "0.1.56"
|
||||
@ -158,6 +259,32 @@ name = "rle-decode-fast"
|
||||
version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "rustc_version"
|
||||
version = "0.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "scopeguard"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "semver"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "semver-parser"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "take_mut"
|
||||
version = "0.2.2"
|
||||
@ -215,12 +342,18 @@ dependencies = [
|
||||
|
||||
[metadata]
|
||||
"checksum adler32 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "7e522997b529f05601e05166c07ed17789691f562762c7f3b987263d2dedee5c"
|
||||
"checksum arrayvec 0.4.11 (registry+https://github.com/rust-lang/crates.io-index)" = "b8d73f9beda665eaa98ab9e4f7442bd4e7de6652587de55b2525e52e29c1b0ba"
|
||||
"checksum byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a7c3dd8985a7111efc5c80b44e23ecdd8c007de8ade3b96595387e812b957cf5"
|
||||
"checksum bzip2 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "42b7c3cbf0fa9c1b82308d57191728ca0256cb821220f4e2fd410a72ade26e3b"
|
||||
"checksum bzip2-sys 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "6584aa36f5ad4c9247f5323b0a42f37802b37a836f0ad87084d7a33961abe25f"
|
||||
"checksum cc 1.0.38 (registry+https://github.com/rust-lang/crates.io-index)" = "ce400c638d48ee0e9ab75aef7997609ec57367ccfe1463f21bf53c3eca67bf46"
|
||||
"checksum cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "b486ce3ccf7ffd79fdeb678eac06a9e6c09fc88d33836340becb8fffe87c5e33"
|
||||
"checksum crc32fast 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ba125de2af0df55319f41944744ad91c71113bf74a4646efff39afe1f6842db1"
|
||||
"checksum crossbeam-deque 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)" = "05e44b8cf3e1a625844d1750e1f7820da46044ff6d28f4d43e455ba3e5bb2c13"
|
||||
"checksum crossbeam-epoch 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)" = "fedcd6772e37f3da2a9af9bf12ebe046c0dfe657992377b4df982a2b54cd37a9"
|
||||
"checksum crossbeam-queue 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7c979cd6cfe72335896575c6b5688da489e420d36a27a0b9eb0c73db574b4a4b"
|
||||
"checksum crossbeam-utils 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)" = "04973fa96e96579258a5091af6003abde64af786b860f18622b82e026cca60e6"
|
||||
"checksum either 1.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "5527cfe0d098f36e3f8839852688e63c8fff1c90b2b405aef730615f9a7bcf7b"
|
||||
"checksum encoding 0.2.33 (registry+https://github.com/rust-lang/crates.io-index)" = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec"
|
||||
"checksum encoding-index-japanese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91"
|
||||
"checksum encoding-index-korean 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81"
|
||||
@ -229,12 +362,22 @@ dependencies = [
|
||||
"checksum encoding-index-tradchinese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18"
|
||||
"checksum encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569"
|
||||
"checksum json 0.11.14 (registry+https://github.com/rust-lang/crates.io-index)" = "01d7903059b22f1f09ced2fb9562507e3556a953caa2f835c64ab022bb6148c2"
|
||||
"checksum lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bc5729f27f159ddd61f4df6228e827e86643d4d3e7c32183cb30a1c08f604a14"
|
||||
"checksum libc 0.2.60 (registry+https://github.com/rust-lang/crates.io-index)" = "d44e80633f007889c7eff624b709ab43c92d708caad982295768a7b13ca3b5eb"
|
||||
"checksum libflate 0.1.25 (registry+https://github.com/rust-lang/crates.io-index)" = "90c6f86f4b0caa347206f916f8b687b51d77c6ef8ff18d52dd007491fd580529"
|
||||
"checksum memchr 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "148fab2e51b4f1cfc66da2a7c32981d1d3c083a803978268bb11fe4b86925e7a"
|
||||
"checksum memoffset 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "ce6075db033bbbb7ee5a0bbd3a3186bbae616f57fb001c485c7ff77955f8177f"
|
||||
"checksum nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "2f9667ddcc6cc8a43afc9b7917599d7216aa09c463919ea32c59ed6cac8bc945"
|
||||
"checksum num_cpus 1.10.1 (registry+https://github.com/rust-lang/crates.io-index)" = "bcef43580c035376c0705c42792c294b66974abbfd2789b511784023f71f3273"
|
||||
"checksum podio 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "780fb4b6698bbf9cf2444ea5d22411cef2953f0824b98f33cf454ec5615645bd"
|
||||
"checksum rayon 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a4b0186e22767d5b9738a05eab7c6ac90b15db17e5b5f9bd87976dd7d89a10a4"
|
||||
"checksum rayon-core 1.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ebbe0df8435ac0c397d467b6cad6d25543d06e8a019ef3f6af3c384597515bd2"
|
||||
"checksum redox_syscall 0.1.56 (registry+https://github.com/rust-lang/crates.io-index)" = "2439c63f3f6139d1b57529d16bc3b8bb855230c8efcc5d3a896c8bea7c3b1e84"
|
||||
"checksum rle-decode-fast 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cabe4fa914dec5870285fa7f71f602645da47c486e68486d2b4ceb4a343e90ac"
|
||||
"checksum rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a"
|
||||
"checksum scopeguard 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b42e15e59b18a828bbf5c58ea01debb36b9b096346de35d941dcb89009f24a0d"
|
||||
"checksum semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403"
|
||||
"checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
|
||||
"checksum take_mut 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f764005d11ee5f36500a149ace24e00e3da98b0158b3e2d53a7495660d3f4d60"
|
||||
"checksum textstream 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "e7ed81b342f6566026755e7f4b7798810b1c159722e427d212ce72c2c58ffdaa"
|
||||
"checksum time 0.1.42 (registry+https://github.com/rust-lang/crates.io-index)" = "db8dcfca086c1143c9270ac42a2bbd8a7ee477b78ac8e45b19abfb0cbede4b6f"
|
||||
|
@ -11,3 +11,4 @@ zip = "0.5"
|
||||
encoding = "0.2"
|
||||
textstream = "0.1"
|
||||
json="0.11"
|
||||
rayon="1.1"
|
||||
|
93
src/main.rs
93
src/main.rs
@ -1,15 +1,19 @@
|
||||
extern crate encoding;
|
||||
extern crate json;
|
||||
extern crate rayon;
|
||||
extern crate textstream;
|
||||
extern crate zip;
|
||||
|
||||
use encoding::all::KOI8_R;
|
||||
use encoding::DecoderTrap;
|
||||
use rayon::prelude::*;
|
||||
use std::fs;
|
||||
use std::io;
|
||||
use std::path::PathBuf;
|
||||
use textstream::TextReader;
|
||||
|
||||
const BASE_FILENAME: &str = "./baza.zip";
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
enum KeywordType {
|
||||
Ignore,
|
||||
@ -192,33 +196,82 @@ fn parse_file(file: impl io::Read) -> Result<json::JsonValue, Box<std::error::Er
|
||||
Ok(ctx.data.clone())
|
||||
}
|
||||
|
||||
fn main() -> Result<(), Box<std::error::Error>> {
|
||||
let fname = std::path::Path::new("./baza.zip");
|
||||
let zip_file = fs::File::open(&fname)?;
|
||||
let zip_reader = io::BufReader::new(zip_file);
|
||||
// split vector to a vector of slices
|
||||
#[allow(clippy::ptr_arg)]
|
||||
fn split_vec<'a, T>(src: &'a Vec<T>, num: usize) -> Vec<&'a [T]> {
|
||||
let all_len = src.len();
|
||||
let part_len = all_len / num;
|
||||
let add_len = all_len % num;
|
||||
let mut result = Vec::<&'a [T]>::new();
|
||||
|
||||
let mut archive = zip::ZipArchive::new(zip_reader)?;
|
||||
|
||||
for i in 0..archive.len() {
|
||||
let file = archive.by_index(i)?;
|
||||
let name = file.sanitized_name();
|
||||
// skip files without "txt" extension
|
||||
match name.extension() {
|
||||
Some(ext) => match ext.to_str() {
|
||||
Some(ext_str) if ext_str.eq_ignore_ascii_case("txt") => (),
|
||||
_ => continue, // extension is not valid unicode or not txt
|
||||
},
|
||||
_ => continue, // no extension in filename
|
||||
if 1 > part_len {
|
||||
result.push(src.as_slice());
|
||||
return result;
|
||||
}
|
||||
println!("{}", name.as_path().display());
|
||||
let data: json::JsonValue = parse_file(file)?;
|
||||
for i in 0..num {
|
||||
let size = if (num - 1) == i {
|
||||
part_len + add_len
|
||||
} else {
|
||||
part_len
|
||||
};
|
||||
let start = part_len * i;
|
||||
result.push(&src[start..(start + size)]);
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
fn process_files(files: Vec<PathBuf>) {
|
||||
let zip_file = fs::File::open(BASE_FILENAME).unwrap();
|
||||
let zip_reader = io::BufReader::new(zip_file);
|
||||
let mut archive = zip::ZipArchive::new(zip_reader).unwrap();
|
||||
|
||||
files.iter().for_each(|name| {
|
||||
let name_str = name.to_str().unwrap();
|
||||
println!("{:}", name_str);
|
||||
|
||||
// parse txt file
|
||||
let file = archive.by_name(name_str).unwrap();
|
||||
let data = parse_file(file).unwrap();
|
||||
|
||||
// make output filename
|
||||
let mut outfilename = PathBuf::from("./json");
|
||||
outfilename.push(name);
|
||||
outfilename.set_extension("json");
|
||||
|
||||
let mut outfile = fs::File::create(outfilename)?;
|
||||
// save json to file
|
||||
let mut outfile = fs::File::create(outfilename).unwrap();
|
||||
data.write_pretty(&mut outfile, 1).unwrap();
|
||||
});
|
||||
}
|
||||
|
||||
data.write_pretty(&mut outfile, 1)?;
|
||||
fn main() -> Result<(), Box<std::error::Error>> {
|
||||
// open archive just to list files
|
||||
let zip_file = fs::File::open(BASE_FILENAME)?;
|
||||
let zip_reader = io::BufReader::new(zip_file);
|
||||
let mut archive = zip::ZipArchive::new(zip_reader)?;
|
||||
|
||||
let source_files: Vec<PathBuf> = (0..archive.len())
|
||||
.map(|i| archive.by_index(i).unwrap().sanitized_name())
|
||||
.filter(|name| {
|
||||
// skip files without "txt" extension
|
||||
match name.extension() {
|
||||
Some(ext) => match ext.to_str() {
|
||||
Some(ext_str) if ext_str.eq_ignore_ascii_case("txt") => true,
|
||||
_ => false, // extension is not valid unicode or not txt
|
||||
},
|
||||
_ => false, // no extension in filename
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
drop(archive);
|
||||
|
||||
//split vector and iterate on it parts in parallel
|
||||
split_vec(&source_files, rayon::current_num_threads())
|
||||
.par_iter()
|
||||
.for_each(|slice| {
|
||||
let source_files_part = slice.to_vec();
|
||||
process_files(source_files_part);
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user