From 16784a3319a56178ea2250f3773586c5d6b317f8 Mon Sep 17 00:00:00 2001 From: Dmitry Date: Fri, 7 Oct 2022 15:43:22 +0300 Subject: [PATCH] min tab 2 --- bench.txt | 25 ++++++++------ src/main.rs | 98 ++++++++++++++++++++++++++++++++--------------------- 2 files changed, 73 insertions(+), 50 deletions(-) diff --git a/bench.txt b/bench.txt index db49d0b..7937a2a 100644 --- a/bench.txt +++ b/bench.txt @@ -1,20 +1,23 @@ -hyperfine -n print -n print2 -n zip-print -w 100 -m 100 ".\target\release\chgk_ledb.exe print 444" ".\target\release\chgk_ledb.exe print2 444" ".\target\release\chgk_ledb.exe zip-print 444 4" +866 MB db/data.mdb +233 MB test.bin + 95 MB json.zip +--- + +hyperfine -n print -n print2 -n zip-print -w 100 -m 100 ".\target\release\chgk_ledb.exe print 444" ".\target\release\chgk_ledb.exe print2 444" ".\target\release\chgk_ledb.exe zip-print 4 84" Benchmark 1: print - Time (mean ± σ): 19.9 ms ± 1.8 ms [User: 5.3 ms, System: 14.3 ms] - Range (min … max): 17.4 ms … 26.2 ms 100 runs + Time (mean ± σ): 17.7 ms ± 2.0 ms [User: 4.3 ms, System: 13.3 ms] + Range (min … max): 15.7 ms … 32.0 ms 100 runs Benchmark 2: print2 - Time (mean ± σ): 79.1 ms ± 5.5 ms [User: 10.4 ms, System: 61.4 ms] - Range (min … max): 71.6 ms … 122.1 ms 100 runs - - Warning: Statistical outliers were detected. Consider re-running this benchmark on a quiet PC without any interferences from other programs. It might help to use the '--warmup' or '--prepare' options. + Time (mean ± σ): 62.2 ms ± 4.3 ms [User: 5.6 ms, System: 53.1 ms] + Range (min … max): 58.1 ms … 94.5 ms 100 runs Benchmark 3: zip-print - Time (mean ± σ): 38.0 ms ± 2.2 ms [User: 10.0 ms, System: 23.4 ms] - Range (min … max): 34.3 ms … 43.1 ms 100 runs + Time (mean ± σ): 37.0 ms ± 6.1 ms [User: 12.0 ms, System: 20.8 ms] + Range (min … max): 32.3 ms … 94.3 ms 100 runs Summary 'print' ran - 1.91 ± 0.20 times faster than 'zip-print' - 3.98 ± 0.45 times faster than 'print2' + 2.08 ± 0.41 times faster than 'zip-print' + 3.50 ± 0.46 times faster than 'print2' diff --git a/src/main.rs b/src/main.rs index 07e0900..53f7542 100644 --- a/src/main.rs +++ b/src/main.rs @@ -60,10 +60,13 @@ fn reader_task(tx: mpsc::Sender) { let zip_file = fs::File::open(ZIP_FILENAME).unwrap(); let zip_reader = io::BufReader::new(zip_file); let archive = zip::ZipArchive::new(zip_reader).unwrap(); - archive - .source_questions() - .convert() - .for_each(|question| tx.send(question).expect("send question")); + let mut source_questions = archive.source_questions(); + for question in source_questions.convert() { + let res = tx.send(question); + if res.is_err() { + break; + } + } println!("read done"); } fn db_writer_task(rx: mpsc::Receiver) { @@ -231,54 +234,67 @@ fn main() { } fn read_from_db2(id: u32) -> Option { - use std::io::Cursor; - const LEN_SIZE: usize = std::mem::size_of::(); - let cfg = bincode::config::standard().with_fixed_int_encoding(); + const LEN_SIZE: usize = std::mem::size_of::(); + let cfg = bincode::config::standard(); let input = fs::File::open("test.bin").expect("open input"); let mut input = std::io::BufReader::with_capacity(100 * 1024 * 1024, input); let mut len_data: [u8; LEN_SIZE] = [0; LEN_SIZE]; input.read_exact(&mut len_data).expect("read len"); - let len = u64::from_le_bytes(len_data) as usize; + let tab_len = u32::from_le_bytes(len_data) as usize; + let records_count = tab_len - 1; - //println!("read len done"); + // println!( + // "read tab_len done: {}, pos {}", + // tab_len, + // input.stream_position().unwrap() + // ); - let mut zdata = vec![0u8; len]; - input.read_exact(&mut zdata[..len]).expect("read ztab"); - let tab_data = zstd::decode_all(Cursor::new(zdata)).expect("zstd decode table"); - let tab: (Vec, usize) = - bincode::decode_from_slice(&tab_data, cfg).expect("bincode decode tab"); - let tab = tab.0; - - //println!("read tab done, len {}", tab.len()); let index = match id { 0 => { let mut rng = rand::thread_rng(); - (1..tab.len()).into_iter().choose(&mut rng).unwrap() + (1..records_count).into_iter().choose(&mut rng).unwrap() } _ => (id - 1) as usize, }; - //println!("index {}", index); + // println!("index {}", index); + assert!(index < records_count); - let pos = *tab.get(index).expect("get pos"); - let pos_next = *tab.get((index + 1) as usize).expect("get pos next"); - let len = pos_next - pos; + let tab_tail_len = (tab_len - (index + 2)) * LEN_SIZE; - //println!("pos {} | next {} | len {}", pos, pos_next, len); + let mut pos_curr_data: [u8; LEN_SIZE] = [0; LEN_SIZE]; + let mut pos_next_data: [u8; LEN_SIZE] = [0; LEN_SIZE]; + input + .seek_relative((index * LEN_SIZE).try_into().expect("index to i64")) + .expect("seek to tab pos"); + input + .read_exact(&mut pos_curr_data) + .expect("read current pos"); + input.read_exact(&mut pos_next_data).expect("read next pos"); + + let data_pos = u32::from_le_bytes(pos_curr_data); + let data_pos_next = u32::from_le_bytes(pos_next_data); + let data_len = data_pos_next - data_pos; - input.seek_relative(pos as i64).expect("q seek"); - let reader = input.take(len as u64); + // println!( + // "pos {} | next {} | len {} | tab_tail_len {}", + // data_pos, data_pos_next, data_len, tab_tail_len + // ); + let data_pos = data_pos + tab_tail_len as u32; + + input.seek_relative(data_pos as i64).expect("q seek"); + let reader = input.take(data_len as u64); let data = zstd::decode_all(reader).expect("zstd decode data"); - //println!("zstd decoded len {}", data.len()); + // println!("zstd decoded len {}", data.len()); let question: (Question, usize) = bincode::decode_from_slice(&data, cfg).expect("bincode decode q"); let question = question.0; - //println!("read done"); + // println!("read done"); Some(question) } @@ -293,11 +309,10 @@ fn write_db2() { println!("all done"); } fn db_writer2_task(rx: mpsc::Receiver) { - const LEN_SIZE: usize = std::mem::size_of::(); + const LEN_SIZE: usize = std::mem::size_of::(); const COMP_DATA_LEVEL: i32 = 2; - const COMP_HDR_LEVEL: i32 = 2; - let cfg = bincode::config::standard().with_fixed_int_encoding(); + let cfg = bincode::config::standard(); let mut table: Vec = vec![]; let buf_data: Vec = Vec::with_capacity(500 * 1024 * 1024); @@ -312,10 +327,13 @@ fn db_writer2_task(rx: mpsc::Receiver) { let len = buf.write(&data).expect("write question"); table.push(pos); + //println!("write [{}]: {}", num, pos); + pos += len as u32; num += 1; }); table.push(pos); + println!("write [{}]: {}", num, pos); println!( "zbuf done, tab len {}, buf size {}", @@ -325,19 +343,21 @@ fn db_writer2_task(rx: mpsc::Receiver) { buf.set_position(0); - let tab_data = bincode::encode_to_vec(&table, cfg).expect("encode table"); - let zdata = zstd::encode_all(Cursor::new(tab_data), COMP_HDR_LEVEL).expect("zstd enc table"); - let zlen = zdata.len() as u64; + let tab_data = vec![0u8; (table.len() + 1) * LEN_SIZE]; + let mut tab_cursor = Cursor::new(tab_data); + let len_data = (table.len() as u32).to_le_bytes(); + tab_cursor.write_all(&len_data).expect("write len"); + for pos in table { + let pos_data = pos.to_le_bytes(); + tab_cursor.write_all(&pos_data).expect("write pos"); + } - println!("z tab done, tab data_len = {}", zlen); + println!("tab buf done, len = {}", tab_cursor.position()); + tab_cursor.set_position(0); let out = fs::File::create("test.bin").expect("out create"); let mut out = std::io::BufWriter::with_capacity(500 * 1024 * 1024, out); - let len_writed = out.write(&zlen.to_le_bytes()).expect("write zlen"); - assert_eq!(len_writed, LEN_SIZE); - let ztab_writed = out.write(&zdata).expect("write tab zdata"); - assert_eq!(ztab_writed, zdata.len()); - drop(zdata); + std::io::copy(&mut tab_cursor, &mut out).expect("write tab"); println!("header write done, pos: {}", out.stream_position().unwrap());