diff --git a/.gitignore b/.gitignore index 6bcfb78..ec4a484 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,5 @@ test?.zip json.zip /exp -/.vscode \ No newline at end of file +/.vscode +test*.bin diff --git a/Cargo.lock b/Cargo.lock index 1e7851b..706a271 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -58,6 +58,25 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a32fd6af2b5827bce66c29053ba0e7c42b9dcab01835835058558c10851a46b" +[[package]] +name = "bincode" +version = "2.0.0-rc.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7bb50c5a2ef4b9b1e7ae73e3a73b52ea24b20312d629f9c4df28260b7ad2c3c4" +dependencies = [ + "bincode_derive", + "serde", +] + +[[package]] +name = "bincode_derive" +version = "2.0.0-rc.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a45a23389446d2dd25dc8e73a7a3b3c43522b630cac068927f0649d43d719d2" +dependencies = [ + "virtue", +] + [[package]] name = "bitflags" version = "0.9.1" @@ -131,6 +150,7 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" name = "chgk_ledb" version = "0.1.0" dependencies = [ + "bincode", "clap", "ledb", "ledb-derive", @@ -140,6 +160,7 @@ dependencies = [ "serde_derive", "serde_json", "zip", + "zstd", ] [[package]] @@ -825,6 +846,12 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +[[package]] +name = "virtue" +version = "0.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b60dcd6a64dd45abf9bd426970c9843726da7fc08f44cd6fcebf68c21220a63" + [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" diff --git a/Cargo.toml b/Cargo.toml index 26567ba..1fcfe89 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,8 @@ ledb-types="0.4" zip="0.6" rand="0.8" clap = { version = "3.2.22", features = ["derive"] } +bincode = "^2.0.0-rc.2" 
+zstd = "^0.10" [profile.release] opt-level = 3 diff --git a/bench.txt b/bench.txt new file mode 100644 index 0000000..db49d0b --- /dev/null +++ b/bench.txt @@ -0,0 +1,20 @@ +hyperfine -n print -n print2 -n zip-print -w 100 -m 100 ".\target\release\chgk_ledb.exe print 444" ".\target\release\chgk_ledb.exe print2 444" ".\target\release\chgk_ledb.exe zip-print 444 4" + +Benchmark 1: print + Time (mean ± σ): 19.9 ms ± 1.8 ms [User: 5.3 ms, System: 14.3 ms] + Range (min … max): 17.4 ms … 26.2 ms 100 runs + +Benchmark 2: print2 + Time (mean ± σ): 79.1 ms ± 5.5 ms [User: 10.4 ms, System: 61.4 ms] + Range (min … max): 71.6 ms … 122.1 ms 100 runs + + Warning: Statistical outliers were detected. Consider re-running this benchmark on a quiet PC without any interferences from other programs. It might help to use the '--warmup' or '--prepare' options. + +Benchmark 3: zip-print + Time (mean ± σ): 38.0 ms ± 2.2 ms [User: 10.0 ms, System: 23.4 ms] + Range (min … max): 34.3 ms … 43.1 ms 100 runs + +Summary + 'print' ran + 1.91 ± 0.20 times faster than 'zip-print' + 3.98 ± 0.45 times faster than 'print2' diff --git a/src/main.rs b/src/main.rs index 0f7f521..81feaf8 100644 --- a/src/main.rs +++ b/src/main.rs @@ -9,6 +9,7 @@ extern crate zip; use clap::{Parser, Subcommand}; use rand::seq::IteratorRandom; +use std::io::{Read, Write}; use std::path::PathBuf; use std::time::Instant; use std::{fs, io, sync::mpsc, thread}; @@ -38,6 +39,11 @@ enum Command { #[clap(value_parser, default_value = "0")] num: usize, }, + Write2, + Print2 { + #[clap(value_parser, default_value = "0")] + id: u32, + }, } #[derive(Parser, Debug)] @@ -210,6 +216,11 @@ fn main() { let get_question = Box::new(|| read_from_zip(*file_num, *num)); Box::new(|| print_question_from(get_question)) } + Command::Write2 => Box::new(write_db2), + Command::Print2 { id } => { + let get_question = Box::new(|| read_from_db2(*id)); + Box::new(|| print_question_from(get_question)) + } }; if args.measure { @@ -218,3 +229,101 @@ fn 
main() {
 
     action();
 }
+
+/// Index entry locating one compressed question inside `test.bin`.
+#[derive(bincode::Decode, bincode::Encode)]
+struct VEntry {
+    /// Byte offset of the record, counted from the end of the index table.
+    pos: u32,
+    /// Length in bytes of the zstd-compressed record.
+    len: u16,
+}
+
+/// Reads question `id` from `test.bin`; returns `None` when `id` is not in the index.
+///
+/// File layout: an 8-byte little-endian table size, the zstd-compressed bincode
+/// `Vec<VEntry>` table, then the zstd-compressed bincode records.
+/// Panics on I/O or decoding errors.
+fn read_from_db2(id: u32) -> Option<Question> {
+    const LEN_SIZE: usize = std::mem::size_of::<u64>();
+    let cfg = bincode::config::standard().with_fixed_int_encoding();
+
+    // Default-sized buffer: a 100 MiB buffer makes the very first read fetch up to 100 MiB.
+    let input = fs::File::open("test.bin").expect("open input");
+    let mut input = std::io::BufReader::new(input);
+
+    let mut len_data = [0u8; LEN_SIZE];
+    input.read_exact(&mut len_data).expect("read len");
+    let len: usize = u64::from_le_bytes(len_data).try_into().expect("table len fits usize");
+
+    let mut zdata = vec![0u8; len];
+    input.read_exact(&mut zdata).expect("read ztab");
+    let tab_data = zstd::decode_all(zdata.as_slice()).expect("zstd decode table");
+    let (tab, _): (Vec<VEntry>, usize) =
+        bincode::decode_from_slice(&tab_data, cfg).expect("bincode decode tab");
+
+    // An unknown id is a lookup miss, not a bug, so report it through the `Option`.
+    let chosen = tab.get(id as usize)?;
+    // `pos` is relative to the first record, which is where the reader stands now.
+    input.seek_relative(i64::from(chosen.pos)).expect("q seek");
+    let reader = input.take(u64::from(chosen.len));
+    let data = zstd::decode_all(reader).expect("zstd decode data");
+
+    let (question, _): (Question, usize) =
+        bincode::decode_from_slice(&data, cfg).expect("bincode decode q");
+
+    Some(question)
+}
+
+/// Runs `reader_task` and `db_writer2_task` on two threads connected by a channel.
+/// Panics if either thread panicked.
+fn write_db2() {
+    let (tx, rx) = mpsc::channel::<Question>();
+    [
+        thread::spawn(move || reader_task(tx)),
+        thread::spawn(move || db_writer2_task(rx)),
+    ]
+    .into_iter()
+    .for_each(|handle| handle.join().expect("thread panic"));
+    println!("all done");
+}
+
+/// Compresses every question received on `rx`, then writes index and records to `test.bin`.
+///
+/// Panics on I/O or encoding errors, when a compressed record is longer than `u16::MAX`
+/// bytes, or when the record offsets no longer fit in `u32`.
+fn db_writer2_task(rx: mpsc::Receiver<Question>) {
+    let cfg = bincode::config::standard().with_fixed_int_encoding();
+    let mut table: Vec<VEntry> = vec![];
+    // Records are staged in memory because the table precedes them in the file.
+    let mut records: Vec<u8> = Vec::with_capacity(500 * 1024 * 1024);
+    let mut pos: u32 = 0;
+
+    for q in rx {
+        let data = bincode::encode_to_vec(q, cfg).expect("bincode q encode");
+        let data = zstd::encode_all(data.as_slice(), 3).expect("zstd q encode");
+        let len: u16 = data.len().try_into().expect("len try_into len");
+        records.extend_from_slice(&data);
+        table.push(VEntry { pos, len });
+        // Checked add: by default a plain `+=` wraps in release builds and corrupts the index.
+        pos = pos.checked_add(u32::from(len)).expect("records exceed u32 offsets");
+    }
+
+    println!("zbuf done");
+
+    let tab_data = bincode::encode_to_vec(&table, cfg).expect("encode table");
+    let zdata = zstd::encode_all(tab_data.as_slice(), 3).expect("zstd enc table");
+    let zlen = zdata.len() as u64;
+
+    println!("z tab done");
+
+    // `write_all` retries short writes; a bare `write` may accept only part of the slice.
+    let out = fs::File::create("test.bin").expect("out create");
+    let mut out = std::io::BufWriter::new(out);
+    out.write_all(&zlen.to_le_bytes()).expect("write zlen");
+    out.write_all(&zdata).expect("write tab zdata");
+    out.write_all(&records).expect("copy z buf");
+    out.flush().expect("out flush");
+
+    println!("write done");
+}
diff --git a/src/questions.rs b/src/questions.rs
index 7571928..b0853fc 100644
--- a/src/questions.rs
+++ b/src/questions.rs
@@ -12,7 +12,9 @@ macro_rules! make {
         ),+ ,..$Target::default()}}
 }
 
-#[derive(Debug, Default, Clone, Serialize, Deserialize, Document)]
+#[derive(
+    Debug, Default, Clone, Serialize, Deserialize, Document, bincode::Decode, bincode::Encode,
+)]
 pub struct BatchInfo {
     #[document(primary)]
     #[serde(default)]
@@ -43,7 +45,9 @@ pub struct BatchInfo {
     pub rating: String,
 }
 
-#[derive(Debug, Default, Clone, Serialize, Deserialize, Document)]
+#[derive(
+    Debug, Default, Clone, Serialize, Deserialize, Document, bincode::Decode, bincode::Encode,
+)]
 pub struct Question {
     #[document(primary)]
     #[serde(default)]