chgk_ledb/src/db.rs

365 lines
9.8 KiB
Rust
Raw Normal View History

use std::{
fs,
2022-10-09 20:54:58 +00:00
io::{self, Cursor, Read, Seek, Write},
marker::PhantomData,
path::Path,
};
/// Integer type of every offset kept in the writer's table and stored in the file.
type LSize = u32;
/// Number of bytes occupied by one serialized `LSize` offset.
const LEN_SIZE: usize = std::mem::size_of::<LSize>();
/// Bincode configuration shared by the encoding and the decoding paths.
const BINCODE_CFG: bincode::config::Configuration = bincode::config::standard();
/// Conversion of a fallible value into one whose error is a plain `String`.
trait ErrorToString {
    /// Success type, carried through the conversion unchanged.
    type Output;

    /// Returns the success value as is, or the error rendered as a `String`.
    fn str_err(self) -> Result<Self::Output, String>;
}
impl<T, E> ErrorToString for std::result::Result<T, E>
where
E: std::error::Error,
{
type Output = T;
fn str_err(self) -> std::result::Result<Self::Output, String> {
self.map_err(|e| e.to_string())
}
}
/// Tuning knobs for [`Writer`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct WriterOpts {
    /// Compression level handed to the zstd encoder for every record.
    pub compress_lvl: i32,
    /// Initial capacity, in bytes, of the in-memory buffer that collects all compressed records.
    pub data_buf_size: usize,
    /// Capacity, in bytes, of the buffered writer wrapping the output file.
    pub out_buf_size: usize,
    /// Initial capacity, in bytes, of the scratch buffer used to compress a single record.
    pub current_buf_size: usize,
}
impl Default for WriterOpts {
fn default() -> Self {
Self {
compress_lvl: 1,
data_buf_size: 500 * 1024 * 1024,
out_buf_size: 200 * 1024 * 1024,
2022-10-09 20:54:58 +00:00
current_buf_size: 20 * 1024 * 1024,
}
}
}
2022-10-09 20:54:58 +00:00
pub struct Writer<T>
where
T: bincode::Encode,
{
out: io::BufWriter<fs::File>,
data_buf: Cursor<Vec<u8>>,
table: Vec<LSize>,
2022-10-09 20:54:58 +00:00
compress_lvl: i32,
current_buf_size: usize,
_t: PhantomData<*const T>,
}
2022-10-09 20:54:58 +00:00
impl<T> Writer<T>
where
2022-10-09 22:19:44 +00:00
T: bincode::Encode,
{
pub fn new<P: AsRef<Path>>(path: P, opts: WriterOpts) -> Result<Self, String> {
let out = fs::File::create(path).str_err()?;
let out = io::BufWriter::with_capacity(opts.out_buf_size, out);
let data_buf: Vec<u8> = Vec::with_capacity(opts.data_buf_size);
let data_buf = Cursor::new(data_buf);
2022-10-09 20:54:58 +00:00
let compress_lvl = opts.compress_lvl;
let current_buf_size = opts.current_buf_size;
let table: Vec<LSize> = vec![];
Ok(Self {
out,
data_buf,
table,
2022-10-09 20:54:58 +00:00
compress_lvl,
current_buf_size,
_t: PhantomData,
})
}
2022-10-09 15:26:00 +00:00
pub fn push(&mut self, item: T) -> Result<(), String> {
let pos: LSize = self.data_buf.position() as LSize;
let item_data = bincode::encode_to_vec(item, BINCODE_CFG).str_err()?;
2022-10-09 20:54:58 +00:00
let cur_buf_raw: Vec<u8> = Vec::with_capacity(self.current_buf_size);
let cur_buf_raw = Cursor::new(cur_buf_raw);
let mut zencoder = zstd::stream::raw::Encoder::new(self.compress_lvl).str_err()?;
zencoder
.set_pledged_src_size(item_data.len() as u64)
.str_err()?;
2022-10-09 20:54:58 +00:00
let mut cur_buf_z = zstd::stream::zio::Writer::new(cur_buf_raw, zencoder);
cur_buf_z.write_all(&item_data).str_err()?;
cur_buf_z.finish().str_err()?;
cur_buf_z.flush().str_err()?;
self.table.push(pos);
2022-10-09 20:54:58 +00:00
let (mut cur_buf_raw, _) = cur_buf_z.into_inner();
let size = cur_buf_raw.position();
2022-10-09 20:54:58 +00:00
cur_buf_raw.set_position(0);
2022-10-09 20:54:58 +00:00
let mut chunk_reader = cur_buf_raw.take(size);
io::copy(&mut chunk_reader, &mut self.data_buf).str_err()?;
Ok(())
}
2022-10-09 15:26:00 +00:00
pub fn load<I>(&mut self, iter: &mut I) -> Result<(), String>
where
I: Iterator<Item = T>,
{
for item in iter {
self.push(item)?;
}
Ok(())
}
pub fn finish(mut self) -> Result<(), String> {
// finish tab
let pos: LSize = self.data_buf.position() as LSize;
self.table.push(pos);
// write tab
let tab_size = (self.table.len() * LEN_SIZE) as LSize;
for pos in self.table {
let pos_data = (pos + tab_size).to_le_bytes();
self.out.write_all(&pos_data).str_err()?;
}
// copy data
let data_size = self.data_buf.position();
self.data_buf.set_position(0);
let mut data = self.data_buf.take(data_size);
io::copy(&mut data, &mut self.out).str_err()?;
self.out.flush().str_err()?;
Ok(())
}
}
/// Random-access reader for a database file holding records of type `T`.
pub struct Reader<T>
where
    T: bincode::Decode,
{
    /// Buffered handle of the opened database file.
    input: io::BufReader<fs::File>,
    /// Number of records, derived from the first offset when the file is opened.
    count: usize,
    /// First offset stored in the file, i.e. where the data of record 0 starts.
    first_pos: LSize,
    /// Ties the reader to its record type without storing a value.
    _t: PhantomData<*const T>,
}
impl<T> Reader<T>
where
T: bincode::Decode,
{
pub fn new<P: AsRef<Path>>(path: P, buf_size: usize) -> Result<Self, String> {
let input = fs::File::open(path).str_err()?;
let mut input = io::BufReader::with_capacity(buf_size, input);
// read first pos and records count
let mut first_data: [u8; LEN_SIZE] = [0; LEN_SIZE];
input.read_exact(&mut first_data).str_err()?;
let first_pos = LSize::from_le_bytes(first_data);
let tab_len = (first_pos as usize) / LEN_SIZE;
let count = tab_len - 1;
Ok(Self {
input,
count,
first_pos,
_t: PhantomData,
})
}
pub fn len(&self) -> usize {
self.count
}
pub fn get(&mut self, index: usize) -> Result<T, String> {
if index >= self.len() {
return Err("index out of range".into());
}
// read item data pos
let data_pos = if 0 == index {
self.first_pos
} else {
let tab_pos: u64 = (index * LEN_SIZE).try_into().str_err()?;
let mut pos_curr_data: [u8; LEN_SIZE] = [0; LEN_SIZE];
2022-10-09 20:54:58 +00:00
let cur_pos = self.input.stream_position().str_err()? as i64;
self.input.seek_relative((tab_pos as i64) - cur_pos).str_err()?;
self.input.read_exact(&mut pos_curr_data).str_err()?;
LSize::from_le_bytes(pos_curr_data)
};
// read next item pos
let mut pos_next_data: [u8; LEN_SIZE] = [0; LEN_SIZE];
self.input.read_exact(&mut pos_next_data).str_err()?;
let data_pos_next = LSize::from_le_bytes(pos_next_data);
// calc item data length
let data_len = data_pos_next - data_pos;
// read & unpack item data
2022-10-09 20:54:58 +00:00
let cur_pos = self.input.stream_position().str_err()? as i64;
self.input
2022-10-09 20:54:58 +00:00
.seek_relative((data_pos as i64) - cur_pos)
.str_err()?;
let reader = self.input.by_ref().take(data_len as u64);
let data = zstd::decode_all(reader).str_err()?;
// decode item
let item: (T, usize) = bincode::decode_from_slice(&data, BINCODE_CFG).str_err()?;
Ok(item.0)
}
2022-10-09 21:08:57 +00:00
pub fn iter(&mut self) -> ReaderIter<'_, T> {
ReaderIter::new(self)
}
}
/// Iterator over the records of a [`Reader`], borrowing the reader mutably while it lives.
pub struct ReaderIter<'a, T>
where
    T: bincode::Decode,
{
    /// Reader the records are fetched from.
    reader: &'a mut Reader<T>,
    /// Iteration cursor; `None` on a freshly created iterator.
    index: Option<usize>,
}
impl<'a, T> ReaderIter<'a, T>
where
    T: bincode::Decode,
{
    /// Wraps `reader` in an iterator whose cursor has not been set yet.
    fn new(reader: &'a mut Reader<T>) -> Self {
        Self {
            index: None,
            reader,
        }
    }
}
impl<'a, T> Iterator for ReaderIter<'a, T>
where
    T: bincode::Decode,
{
    type Item = T;

    /// Yields the record under the cursor and advances by one.
    ///
    /// A record that fails to load ends the iteration step with `None`, because the item type
    /// has no room for an error.
    fn next(&mut self) -> Option<Self::Item> {
        self.nth(0)
    }

    /// Skips `n` records from the current cursor and yields the following one.
    ///
    /// `n` is relative to the cursor, as the `Iterator::nth` contract requires; it used to be
    /// treated as an absolute index, which broke adaptors such as `skip` and `step_by` on a
    /// partially consumed iterator. On a fresh iterator both readings coincide.
    fn nth(&mut self, n: usize) -> Option<Self::Item> {
        let len = self.reader.len();
        // `index` stores the position of the next record to yield; `None` means "not started".
        let cursor = self.index.unwrap_or(0);
        match cursor.checked_add(n) {
            Some(target) if target < len => {
                self.index = Some(target + 1);
                self.reader.get(target).ok()
            }
            _ => {
                // Stepping past the end exhausts the iterator.
                self.index = Some(len);
                None
            }
        }
    }

    /// Exact number of records not yet visited.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = self
            .reader
            .len()
            .saturating_sub(self.index.unwrap_or(0));
        (remaining, Some(remaining))
    }

    /// Number of records not yet visited, computed without reading them.
    fn count(self) -> usize
    where
        Self: Sized,
    {
        self.size_hint().0
    }
}
impl<'a, T> ExactSizeIterator for ReaderIter<'a, T>
where
    T: bincode::Decode,
{
    /// Number of records still to be yielded.
    ///
    /// `index` holds the position of the next record (`None` before the first call), so the
    /// remaining length is the reader's length minus that position; the total record count
    /// was returned before, regardless of how many records had been consumed.
    fn len(&self) -> usize {
        self.reader
            .len()
            .saturating_sub(self.index.unwrap_or(0))
    }
}
2022-10-09 20:54:58 +00:00
#[cfg(test)]
mod test {
    use super::*;
    use tempfile::tempdir;

    /// Minimal record type used by the round-trip tests.
    #[derive(bincode::Encode, bincode::Decode, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
    struct TestData {
        num: u64,
    }

    /// Produces `count` records numbered from zero.
    fn gen_data(count: usize) -> impl Iterator<Item = TestData> {
        (0..count).map(|i| TestData { num: i as u64 })
    }

    /// Writer options with buffers small enough for unit tests.
    fn small_opts() -> WriterOpts {
        WriterOpts {
            compress_lvl: 1,
            data_buf_size: 10 * 1024 * 1024,
            out_buf_size: 10 * 1024 * 1024,
            current_buf_size: 4096,
        }
    }

    #[test]
    fn test_write_read() {
        let dir = tempdir().expect("tempdir");
        let tmpfile = dir.path().join("test.tmp");

        let mut writer: Writer<TestData> = Writer::new(&tmpfile, small_opts()).expect("new writer");
        let items: Vec<TestData> = gen_data(5).collect();
        writer.load(&mut items.clone().into_iter()).expect("load");
        writer.finish().expect("finish write");

        let mut reader: Reader<TestData> = Reader::new(&tmpfile, 2048).expect("new reader");
        assert_eq!(items.len(), reader.len());
        for (idx, item) in items.iter().enumerate() {
            let ritem = reader.get(idx).expect("get");
            assert_eq!(*item, ritem);
        }
    }

    #[test]
    fn test_write_read_iter() {
        let dir = tempdir().expect("tempdir");
        let tmpfile = dir.path().join("test.tmp");

        let mut writer: Writer<TestData> = Writer::new(&tmpfile, small_opts()).expect("new writer");
        let items: Vec<TestData> = gen_data(10).collect();
        writer.load(&mut items.clone().into_iter()).expect("load");
        writer.finish().expect("finish write");

        let mut reader: Reader<TestData> = Reader::new(&tmpfile, 2048).expect("new reader");
        assert_eq!(items.len(), reader.len());
        // Compare the whole sequence: a `zip` would silently accept an iterator that stops early.
        let read_back: Vec<TestData> = reader.iter().collect();
        assert_eq!(items, read_back);
    }
}