ztb_recode/src/main.rs

187 lines
5.5 KiB
Rust

extern crate async_zip;
extern crate encoding;
extern crate tokio;
use async_zip::read::fs::ZipFileReader;
use async_zip::write::{EntryOptions, ZipFileWriter};
use async_zip::Compression;
use clap::{Parser, ValueEnum};
use encoding::label::encoding_from_whatwg_label;
use encoding::EncodingRef;
use encoding::{DecoderTrap, EncoderTrap};
use regex::Regex;
use tokio::io::AsyncReadExt;
use tokio::sync::mpsc::{self, UnboundedReceiver, UnboundedSender};
use tokio::{fs, task};
// Command-line arguments of the tool; the parser is generated by clap's
// `Parser` derive and invoked in `main` via `Cli::parse()`.
// NOTE(review): with clap's derive the `///` comments in this struct are
// presumably used as help text, i.e. they are user-visible output — keep that
// in mind before rewording them.
/// transcode txt files in zip archive
#[derive(Parser, Debug)]
#[clap(author, version, about, long_about = None)]
#[clap(propagate_version = true)]
struct Cli {
// Encoding label of the source text; `main` resolves it with
// `encoding_from_whatwg_label`.
/// input encoding
#[clap(short, long, default_value = "koi8-r")]
from: String,
// Encoding label of the produced text; resolved in `main` the same way as `from`.
/// output encoding
#[clap(short, long, default_value = "utf8")]
to: String,
// Compression applied to every entry of the output archive; `main` converts
// it into `async_zip::Compression`.
/// output compression method
#[clap(arg_enum, short, long, default_value = "zstd")]
compression: OutputFileCompression,
// Pattern compiled in `main` and matched against entry names in `reader_task`;
// entries that do not match are not copied to the output.
/// filename filter (regex)
#[clap(short, long, default_value = r#".*\.txt$"#)]
regex: String,
// Path of the archive to read (no `short`/`long`, so presumably a positional argument).
/// input zip filename
#[clap(value_parser, default_value = "baza.zip")]
src: String,
// Path of the archive to create (no `short`/`long`, so presumably a positional argument).
/// output zip filename
#[clap(value_parser, default_value = "baza_utf8.zip")]
dst: String,
}
// Compression methods that can be chosen on the command line (see the
// `compression` field of `Cli`). Each variant is mapped onto an
// `async_zip::Compression` value by the `From` impl in this file.
// NOTE(review): the `///` comments are presumably surfaced by clap's
// `ValueEnum` derive as help text for the possible values — treat them as
// user-visible. Variant order also defines the derived `PartialOrd`/`Ord`.
/// output file compression method
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum, Debug)]
enum OutputFileCompression {
/// Store the file as is
Store,
/// Compress the file using Deflate
Deflate,
/// Compress the file using BZIP2
Bzip2,
/// Compress the file using LZMA
Lzma,
/// Compress the file using ZStandard
Zstd,
/// Compress the file using XZ
Xz,
}
impl From<OutputFileCompression> for Compression {
fn from(compression: OutputFileCompression) -> Self {
match compression {
OutputFileCompression::Store => Self::Stored,
OutputFileCompression::Deflate => Self::Deflate,
OutputFileCompression::Bzip2 => Self::Bz,
OutputFileCompression::Lzma => Self::Lzma,
OutputFileCompression::Zstd => Self::Zstd,
OutputFileCompression::Xz => Self::Xz,
}
}
}
/// One archive entry as it travels through the pipeline: produced by
/// `reader_task`, re-encoded by `transcoder_task`, consumed by `writer_task`.
#[derive(Debug)]
struct FileData {
/// Entry name, carried unchanged from the input archive to the output archive.
name: String,
/// Entry contents: the bytes read from the input archive, or the re-encoded
/// bytes once the entry has passed through the transcoder.
data: Vec<u8>,
}
/// Reads every matching file from the input archive and sends it downstream.
///
/// Opens `input_filename` as a zip archive, selects the non-directory entries
/// whose name matches `regex`, orders them by name, then reads each entry
/// completely into memory and sends it through `tx` as a [`FileData`].
/// Prints the number of files read once all of them have been sent. `tx` is
/// dropped when the function returns, which closes the channel for the
/// receiving side.
///
/// # Panics
///
/// Panics if the archive cannot be opened, if an entry cannot be opened or
/// read, or if the receiving end of `tx` has already been dropped.
async fn reader_task(tx: UnboundedSender<FileData>, input_filename: String, regex: Regex) {
    let archive = match ZipFileReader::new(input_filename.clone()).await {
        Ok(archive) => archive,
        Err(err) => panic!("cannot open input archive {input_filename:?}: {err:?}"),
    };

    // Gather (entry index, name, buffer size hint) up front so the entries can
    // be processed in name order rather than archive order.
    let mut source_files: Vec<(usize, String, usize)> = archive
        .entries()
        .iter()
        .enumerate()
        .filter(|(_, entry)| !entry.dir() && regex.is_match(entry.name()))
        .map(|(index, entry)| {
            // The recorded size is only used to pre-size the read buffer, so a
            // missing or unrepresentable value falls back to 0 instead of
            // aborting the whole run.
            let size_hint: usize = entry
                .uncompressed_size()
                .and_then(|size| size.try_into().ok())
                .unwrap_or(0);
            (index, entry.name().to_string(), size_hint)
        })
        .collect();
    // `String` has a total order; the sort is stable, so entries that share a
    // name keep their archive order.
    source_files.sort_by(|(_, name_a, _), (_, name_b, _)| name_a.cmp(name_b));

    let mut count: usize = 0;
    for (index, name, size_hint) in source_files {
        let mut entry_reader = match archive.entry_reader(index).await {
            Ok(reader) => reader,
            Err(err) => panic!("cannot open archive entry {name:?}: {err:?}"),
        };
        let mut data = Vec::with_capacity(size_hint);
        if let Err(err) = entry_reader.read_to_end(&mut data).await {
            panic!("cannot read archive entry {name:?}: {err:?}");
        }
        // Release the entry reader before handing the data over.
        drop(entry_reader);

        if tx.send(FileData { name, data }).is_err() {
            panic!("reader: receiving side of the channel is closed");
        }
        count += 1;
    }
    println!("read {count} files done ✅");
}
/// Re-encodes every file received on `rx` and forwards the result on `tx`.
///
/// Each file's bytes are decoded with `encoding_from` and encoded again with
/// `encoding_to` (both using the `Ignore` trap) on tokio's blocking thread
/// pool; the entry name is passed through unchanged. Files are handled one at
/// a time, in the order they arrive. The loop ends when `rx` yields `None`,
/// after which a completion message is printed.
///
/// # Panics
///
/// Panics if the blocking task fails, if decoding/encoding reports an error,
/// or if sending on `tx` fails.
async fn transcoder_task(
    mut rx: UnboundedReceiver<FileData>,
    tx: UnboundedSender<FileData>,
    encoding_from: EncodingRef,
    encoding_to: EncodingRef,
) {
    loop {
        let FileData { name, data: raw } = match rx.recv().await {
            Some(file) => file,
            // Channel closed and drained: nothing more to transcode.
            None => break,
        };

        // The conversion is CPU-bound, so it runs off the async worker threads.
        let recode = move || {
            let decoded = encoding_from.decode(&raw, DecoderTrap::Ignore).unwrap();
            let encoded = encoding_to.encode(decoded.as_str(), EncoderTrap::Ignore);
            encoded.unwrap()
        };
        let recoded = task::spawn_blocking(recode).await.unwrap();

        let outgoing = FileData {
            name,
            data: recoded,
        };
        tx.send(outgoing).unwrap();
    }
    println!("transcode done ✅");
}
async fn writer_task(
mut rx: UnboundedReceiver<FileData>,
output_filename: String,
compression: Compression,
) {
let mut outfile = fs::File::create(output_filename)
.await
.expect("output file");
let mut writer = ZipFileWriter::new(&mut outfile);
while let Some(FileData { name, data }) = rx.recv().await {
let opts = EntryOptions::new(name, compression);
writer.write_entry_whole(opts, &data).await.unwrap();
}
writer.close().await.unwrap();
println!("write done ✅");
}
/// Entry point: parses the command line and runs the three-stage pipeline.
///
/// The reader, transcoder and writer each run as a separate tokio task,
/// connected by two unbounded channels (reader → transcoder → writer).
///
/// # Errors
///
/// Returns an error if the filename regex is invalid, if either encoding
/// label is unknown, or if any of the pipeline tasks fails to run to
/// completion (for example because it panicked).
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let args = Cli::parse();

    // These values come straight from the user, so report problems as regular
    // errors instead of panicking.
    let regex = Regex::new(&args.regex)
        .map_err(|err| format!("invalid filename regex {:?}: {}", args.regex, err))?;
    let encoding_from = encoding_from_whatwg_label(&args.from)
        .ok_or_else(|| format!("unknown input encoding: {:?}", args.from))?;
    let encoding_to = encoding_from_whatwg_label(&args.to)
        .ok_or_else(|| format!("unknown output encoding: {:?}", args.to))?;

    let compression: Compression = args.compression.into();
    let input_filename = args.src;
    let output_filename = args.dst;

    let (reader_tx, reader_rx) = mpsc::unbounded_channel::<FileData>();
    let (transcoder_tx, transcoder_rx) = mpsc::unbounded_channel::<FileData>();

    // Wait for all three stages; the first task that fails to join turns into
    // the returned error.
    tokio::try_join!(
        tokio::spawn(reader_task(reader_tx, input_filename, regex)),
        tokio::spawn(transcoder_task(
            reader_rx,
            transcoder_tx,
            encoding_from,
            encoding_to
        )),
        tokio::spawn(writer_task(transcoder_rx, output_filename, compression))
    )?;

    println!("all done ✅");
    Ok(())
}