ztb_recode/src/main.rs

204 lines
6.2 KiB
Rust
Raw Normal View History

2022-09-22 12:34:51 +00:00
extern crate async_zip;
2022-09-12 17:50:31 +00:00
extern crate encoding;
2022-09-22 12:34:51 +00:00
extern crate tokio;
2022-09-12 17:50:31 +00:00
2022-09-22 12:34:51 +00:00
use async_zip::read::fs::ZipFileReader;
use async_zip::write::{EntryOptions, ZipFileWriter};
use async_zip::Compression;
2022-09-21 08:43:04 +00:00
use clap::{Parser, ValueEnum};
use encoding::label::encoding_from_whatwg_label;
2022-09-23 22:32:24 +00:00
use encoding::{all::UTF_8, Encoding, EncodingRef};
2022-09-12 17:50:31 +00:00
use encoding::{DecoderTrap, EncoderTrap};
2022-09-21 08:43:04 +00:00
use regex::Regex;
2022-09-23 22:32:24 +00:00
use tokio::io::{AsyncReadExt, AsyncWriteExt, BufWriter};
2022-09-22 12:34:51 +00:00
use tokio::sync::mpsc::{self, UnboundedReceiver, UnboundedSender};
use tokio::{fs, task};
2022-09-12 17:50:31 +00:00
2022-09-22 12:34:51 +00:00
/// transcode txt files in zip archive
2022-09-21 08:43:04 +00:00
// Command-line options of the tool, parsed in `main` via `Cli::parse()`.
// NOTE: the `///` lines below are picked up by clap's derive as help text,
// so they are user-facing strings; explanatory notes use `//` instead.
#[derive(Parser, Debug)]
#[clap(author, version, about, long_about = None)]
#[clap(propagate_version = true)]
struct Cli {
// Encoding label of the source text; `main` resolves it with
// `encoding_from_whatwg_label`.
/// input encoding
#[clap(short, long, default_value = "koi8-r")]
from: String,
// Encoding label of the produced text; resolved the same way as `from`.
/// output encoding
#[clap(short, long, default_value = "utf8")]
to: String,
// Compression method for the output archive; `main` converts it into
// `async_zip::Compression` before handing it to `writer_task`.
/// output compression method
#[clap(arg_enum, short, long, default_value = "zstd")]
compression: OutputFileCompression,
// Pattern compiled with `Regex::new` in `main`; `reader_task` keeps only
// archive entries whose name matches it.
/// filename filter (regex)
#[clap(short, long, default_value = r#".*\.txt$"#)]
regex: String,
// Path of the zip archive to read (first positional argument).
/// input zip filename
#[clap(value_parser, default_value = "baza.zip")]
src: String,
// Path of the zip archive to create (second positional argument).
/// output zip filename
#[clap(value_parser, default_value = "baza_utf8.zip")]
dst: String,
}
/// output file compression method
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum, Debug)]
enum OutputFileCompression {
/// Store the file as is
Store,
/// Compress the file using Deflate
Deflate,
/// Compress the file using BZIP2
Bzip2,
2022-09-22 12:34:51 +00:00
/// Compress the file using LZMA
Lzma,
2022-09-21 08:43:04 +00:00
/// Compress the file using ZStandard
Zstd,
2022-09-22 12:34:51 +00:00
/// Compress the file using XZ
Xz,
2022-09-21 08:43:04 +00:00
}
2022-09-22 12:34:51 +00:00
impl From<OutputFileCompression> for Compression {
2022-09-21 08:43:04 +00:00
fn from(compression: OutputFileCompression) -> Self {
match compression {
OutputFileCompression::Store => Self::Stored,
2022-09-22 12:34:51 +00:00
OutputFileCompression::Deflate => Self::Deflate,
OutputFileCompression::Bzip2 => Self::Bz,
OutputFileCompression::Lzma => Self::Lzma,
2022-09-21 08:43:04 +00:00
OutputFileCompression::Zstd => Self::Zstd,
2022-09-22 12:34:51 +00:00
OutputFileCompression::Xz => Self::Xz,
2022-09-21 08:43:04 +00:00
}
}
}
2022-09-12 17:50:31 +00:00
2022-09-21 10:52:44 +00:00
/// One archive entry as it travels through the reader → transcoder → writer
/// channels.
#[derive(Debug)]
struct FileData {
    /// Entry name, used unchanged as the entry name in the output archive.
    name: String,
    /// Uncompressed entry content: the bytes read from the input archive, or
    /// the re-encoded bytes once the transcoder has processed them.
    data: Vec<u8>,
}
2022-09-22 12:34:51 +00:00
async fn reader_task(tx: UnboundedSender<FileData>, input_filename: String, regex: Regex) {
let archive = ZipFileReader::new(input_filename).await.unwrap();
let mut source_files: Vec<(usize, String, u32)> = archive
.entries()
.iter()
.enumerate()
.filter(|(_, entry)| !entry.dir())
.filter(|(_, entry)| regex.is_match(entry.name()))
.map(|(index, entry)| {
(
index,
entry.name().to_string(),
entry.uncompressed_size().unwrap(),
)
})
2022-09-21 10:52:44 +00:00
.collect();
2022-09-22 12:34:51 +00:00
source_files.sort_by(|(_, name_a, _), (_, name_b, _)| name_a.partial_cmp(name_b).unwrap());
2022-09-21 10:52:44 +00:00
2022-09-22 12:34:51 +00:00
let mut count: usize = 0;
for (index, name, uncompressed_size) in source_files {
let mut entry_reader = archive.entry_reader(index).await.unwrap();
let mut data = Vec::with_capacity(uncompressed_size.try_into().unwrap());
entry_reader.read_to_end(&mut data).await.unwrap();
drop(entry_reader);
2022-09-21 10:52:44 +00:00
tx.send(FileData { name, data }).unwrap();
2022-09-22 12:34:51 +00:00
count += 1;
2022-09-21 10:52:44 +00:00
}
2022-09-22 12:34:51 +00:00
println!("read {count} files done ✅");
2022-09-12 17:50:31 +00:00
}
2022-09-22 12:34:51 +00:00
async fn transcoder_task(
mut rx: UnboundedReceiver<FileData>,
tx: UnboundedSender<FileData>,
2022-09-22 11:27:30 +00:00
encoding_from: EncodingRef,
encoding_to: EncodingRef,
) {
2022-09-23 22:32:24 +00:00
let is_encodings_same = encoding_from.name() == encoding_to.name();
2022-09-23 22:41:02 +00:00
let is_src_encoding_native = UTF_8.name() == encoding_from.name();
let is_dst_encoding_native = UTF_8.name() == encoding_to.name();
2022-09-23 22:32:24 +00:00
2022-09-22 12:34:51 +00:00
while let Some(FileData { name, data }) = rx.recv().await {
2022-09-23 22:32:24 +00:00
let new_data = if is_encodings_same {
data
} else {
task::block_in_place(move || {
2022-09-23 22:41:02 +00:00
let text = if is_src_encoding_native {
2022-09-23 22:32:24 +00:00
String::from_utf8(data).unwrap()
} else {
encoding_from.decode(&data, DecoderTrap::Ignore).unwrap()
};
2022-09-23 22:41:02 +00:00
if is_dst_encoding_native {
2022-09-23 22:32:24 +00:00
text.into_bytes()
} else {
encoding_to
.encode(text.as_str(), EncoderTrap::Ignore)
.unwrap()
}
})
};
2022-09-22 12:34:51 +00:00
2022-09-22 11:27:30 +00:00
tx.send(FileData {
name,
data: new_data,
})
.unwrap();
2022-09-21 10:52:44 +00:00
}
2022-09-22 11:27:30 +00:00
println!("transcode done ✅");
2022-09-21 10:52:44 +00:00
}
2022-09-12 17:50:31 +00:00
2022-09-22 12:34:51 +00:00
async fn writer_task(
mut rx: UnboundedReceiver<FileData>,
2022-09-21 10:52:44 +00:00
output_filename: String,
2022-09-22 12:34:51 +00:00
compression: Compression,
2022-09-21 10:52:44 +00:00
) {
2022-09-23 18:30:23 +00:00
let outfile = fs::File::create(output_filename)
2022-09-22 12:34:51 +00:00
.await
.expect("output file");
2022-09-23 18:30:23 +00:00
let mut buf = BufWriter::with_capacity(100 * 1024 * 1024, outfile);
let mut writer = ZipFileWriter::new(&mut buf);
2022-09-22 12:34:51 +00:00
while let Some(FileData { name, data }) = rx.recv().await {
let opts = EntryOptions::new(name, compression);
writer.write_entry_whole(opts, &data).await.unwrap();
2022-09-13 08:38:04 +00:00
}
2022-09-22 12:34:51 +00:00
writer.close().await.unwrap();
2022-09-23 18:30:23 +00:00
buf.flush().await.unwrap();
2022-09-21 10:52:44 +00:00
println!("write done ✅");
2022-09-12 17:50:31 +00:00
}
2022-09-22 12:34:51 +00:00
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
2022-09-21 08:43:04 +00:00
let args = Cli::parse();
2022-09-21 10:52:44 +00:00
let regex = Regex::new(&args.regex).expect("regex");
2022-09-22 11:27:30 +00:00
let encoding_from = encoding_from_whatwg_label(&args.from).expect("input encoding");
let encoding_to = encoding_from_whatwg_label(&args.to).expect("output encoding");
2022-09-22 12:34:51 +00:00
let compression: Compression = args.compression.into();
2022-09-21 10:52:44 +00:00
let input_filename = args.src;
let output_filename = args.dst;
2022-09-22 12:34:51 +00:00
let (reader_tx, reader_rx) = mpsc::unbounded_channel::<FileData>();
let (transcoder_tx, transcoder_rx) = mpsc::unbounded_channel::<FileData>();
tokio::try_join!(
tokio::spawn(reader_task(reader_tx, input_filename, regex)),
tokio::spawn(transcoder_task(
reader_rx,
transcoder_tx,
encoding_from,
encoding_to
)),
tokio::spawn(writer_task(transcoder_rx, output_filename, compression))
)?;
2022-09-21 08:43:04 +00:00
2022-09-21 10:52:44 +00:00
println!("all done ✅");
2022-09-12 17:50:31 +00:00
Ok(())
}