187 lines
5.5 KiB
Rust
187 lines
5.5 KiB
Rust
extern crate async_zip;
extern crate encoding;
extern crate tokio;

use async_zip::read::fs::ZipFileReader;
use async_zip::write::{EntryOptions, ZipFileWriter};
use async_zip::Compression;
use clap::{Parser, ValueEnum};
use encoding::label::encoding_from_whatwg_label;
use encoding::EncodingRef;
use encoding::{DecoderTrap, EncoderTrap};
use regex::Regex;
use tokio::io::AsyncReadExt;
use tokio::io::AsyncWriteExt;
use tokio::sync::mpsc::{self, UnboundedReceiver, UnboundedSender};
use tokio::{fs, task};
|
|
|
|
/// transcode txt files in zip archive
|
|
#[derive(Parser, Debug)]
|
|
#[clap(author, version, about, long_about = None)]
|
|
#[clap(propagate_version = true)]
|
|
struct Cli {
|
|
/// input encoding
|
|
#[clap(short, long, default_value = "koi8-r")]
|
|
from: String,
|
|
|
|
/// output encoding
|
|
#[clap(short, long, default_value = "utf8")]
|
|
to: String,
|
|
|
|
/// output compression method
|
|
#[clap(arg_enum, short, long, default_value = "zstd")]
|
|
compression: OutputFileCompression,
|
|
|
|
/// filename filter (regex)
|
|
#[clap(short, long, default_value = r#".*\.txt$"#)]
|
|
regex: String,
|
|
|
|
/// input zip filename
|
|
#[clap(value_parser, default_value = "baza.zip")]
|
|
src: String,
|
|
|
|
/// output zip filename
|
|
#[clap(value_parser, default_value = "baza_utf8.zip")]
|
|
dst: String,
|
|
}
|
|
|
|
/// output file compression method
|
|
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum, Debug)]
|
|
enum OutputFileCompression {
|
|
/// Store the file as is
|
|
Store,
|
|
/// Compress the file using Deflate
|
|
Deflate,
|
|
/// Compress the file using BZIP2
|
|
Bzip2,
|
|
/// Compress the file using LZMA
|
|
Lzma,
|
|
/// Compress the file using ZStandard
|
|
Zstd,
|
|
/// Compress the file using XZ
|
|
Xz,
|
|
}
|
|
|
|
impl From<OutputFileCompression> for Compression {
|
|
fn from(compression: OutputFileCompression) -> Self {
|
|
match compression {
|
|
OutputFileCompression::Store => Self::Stored,
|
|
OutputFileCompression::Deflate => Self::Deflate,
|
|
OutputFileCompression::Bzip2 => Self::Bz,
|
|
OutputFileCompression::Lzma => Self::Lzma,
|
|
OutputFileCompression::Zstd => Self::Zstd,
|
|
OutputFileCompression::Xz => Self::Xz,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// One archive entry as it travels through the channels between the
/// reader, transcoder and writer tasks.
#[derive(Debug)]
struct FileData {
    /// Entry name; reused unchanged as the entry name in the output archive.
    name: String,
    /// Entry contents as raw bytes.
    data: Vec<u8>,
}
|
|
|
|
async fn reader_task(tx: UnboundedSender<FileData>, input_filename: String, regex: Regex) {
|
|
let archive = ZipFileReader::new(input_filename).await.unwrap();
|
|
|
|
let mut source_files: Vec<(usize, String, u32)> = archive
|
|
.entries()
|
|
.iter()
|
|
.enumerate()
|
|
.filter(|(_, entry)| !entry.dir())
|
|
.filter(|(_, entry)| regex.is_match(entry.name()))
|
|
.map(|(index, entry)| {
|
|
(
|
|
index,
|
|
entry.name().to_string(),
|
|
entry.uncompressed_size().unwrap(),
|
|
)
|
|
})
|
|
.collect();
|
|
source_files.sort_by(|(_, name_a, _), (_, name_b, _)| name_a.partial_cmp(name_b).unwrap());
|
|
|
|
let mut count: usize = 0;
|
|
for (index, name, uncompressed_size) in source_files {
|
|
let mut entry_reader = archive.entry_reader(index).await.unwrap();
|
|
let mut data = Vec::with_capacity(uncompressed_size.try_into().unwrap());
|
|
entry_reader.read_to_end(&mut data).await.unwrap();
|
|
drop(entry_reader);
|
|
|
|
tx.send(FileData { name, data }).unwrap();
|
|
count += 1;
|
|
}
|
|
|
|
println!("read {count} files done ✅");
|
|
}
|
|
|
|
async fn transcoder_task(
|
|
mut rx: UnboundedReceiver<FileData>,
|
|
tx: UnboundedSender<FileData>,
|
|
encoding_from: EncodingRef,
|
|
encoding_to: EncodingRef,
|
|
) {
|
|
while let Some(FileData { name, data }) = rx.recv().await {
|
|
let new_data = task::spawn_blocking(move || {
|
|
let text = encoding_from.decode(&data, DecoderTrap::Ignore).unwrap();
|
|
encoding_to
|
|
.encode(text.as_str(), EncoderTrap::Ignore)
|
|
.unwrap()
|
|
})
|
|
.await
|
|
.unwrap();
|
|
|
|
tx.send(FileData {
|
|
name,
|
|
data: new_data,
|
|
})
|
|
.unwrap();
|
|
}
|
|
println!("transcode done ✅");
|
|
}
|
|
|
|
async fn writer_task(
|
|
mut rx: UnboundedReceiver<FileData>,
|
|
output_filename: String,
|
|
compression: Compression,
|
|
) {
|
|
let mut outfile = fs::File::create(output_filename)
|
|
.await
|
|
.expect("output file");
|
|
let mut writer = ZipFileWriter::new(&mut outfile);
|
|
|
|
while let Some(FileData { name, data }) = rx.recv().await {
|
|
let opts = EntryOptions::new(name, compression);
|
|
writer.write_entry_whole(opts, &data).await.unwrap();
|
|
}
|
|
writer.close().await.unwrap();
|
|
println!("write done ✅");
|
|
}
|
|
|
|
#[tokio::main]
|
|
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
let args = Cli::parse();
|
|
|
|
let regex = Regex::new(&args.regex).expect("regex");
|
|
let encoding_from = encoding_from_whatwg_label(&args.from).expect("input encoding");
|
|
let encoding_to = encoding_from_whatwg_label(&args.to).expect("output encoding");
|
|
let compression: Compression = args.compression.into();
|
|
let input_filename = args.src;
|
|
let output_filename = args.dst;
|
|
|
|
let (reader_tx, reader_rx) = mpsc::unbounded_channel::<FileData>();
|
|
let (transcoder_tx, transcoder_rx) = mpsc::unbounded_channel::<FileData>();
|
|
|
|
tokio::try_join!(
|
|
tokio::spawn(reader_task(reader_tx, input_filename, regex)),
|
|
tokio::spawn(transcoder_task(
|
|
reader_rx,
|
|
transcoder_tx,
|
|
encoding_from,
|
|
encoding_to
|
|
)),
|
|
tokio::spawn(writer_task(transcoder_rx, output_filename, compression))
|
|
)?;
|
|
|
|
println!("all done ✅");
|
|
Ok(())
|
|
}
|