179 lines
5.4 KiB
Rust
179 lines
5.4 KiB
Rust
extern crate encoding;
|
|
extern crate zip;
|
|
|
|
use clap::{Parser, ValueEnum};
|
|
use encoding::label::encoding_from_whatwg_label;
|
|
use encoding::EncodingRef;
|
|
use encoding::{DecoderTrap, EncoderTrap};
|
|
use regex::Regex;
|
|
use std::fs;
|
|
use std::io::{Read, Write};
|
|
use std::sync::mpsc;
|
|
use std::thread;
|
|
use zip::ZipWriter;
|
|
|
|
/// transcode txt files in zip archieve
|
|
#[derive(Parser, Debug)]
|
|
#[clap(author, version, about, long_about = None)]
|
|
#[clap(propagate_version = true)]
|
|
struct Cli {
|
|
/// input encoding
|
|
#[clap(short, long, default_value = "koi8-r")]
|
|
from: String,
|
|
|
|
/// output encoding
|
|
#[clap(short, long, default_value = "utf8")]
|
|
to: String,
|
|
|
|
/// output compression method
|
|
#[clap(arg_enum, short, long, default_value = "zstd")]
|
|
compression: OutputFileCompression,
|
|
|
|
/// output compression level
|
|
#[clap(arg_enum, short='l', long, value_parser = clap::value_parser!(i32).range(1..=9), default_value = "5")]
|
|
compression_level: i32,
|
|
|
|
/// filename filter (regex)
|
|
#[clap(short, long, default_value = r#".*\.txt$"#)]
|
|
regex: String,
|
|
|
|
/// input zip filename
|
|
#[clap(value_parser, default_value = "baza.zip")]
|
|
src: String,
|
|
|
|
/// output zip filename
|
|
#[clap(value_parser, default_value = "baza_utf8.zip")]
|
|
dst: String,
|
|
}
|
|
|
|
/// output file compression method
|
|
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum, Debug)]
|
|
enum OutputFileCompression {
|
|
/// Store the file as is
|
|
Store,
|
|
/// Compress the file using Deflate
|
|
Deflate,
|
|
/// Compress the file using BZIP2
|
|
Bzip2,
|
|
/// Compress the file using ZStandard
|
|
Zstd,
|
|
}
|
|
|
|
impl From<OutputFileCompression> for zip::CompressionMethod {
|
|
fn from(compression: OutputFileCompression) -> Self {
|
|
match compression {
|
|
OutputFileCompression::Store => Self::Stored,
|
|
OutputFileCompression::Deflate => Self::Deflated,
|
|
OutputFileCompression::Bzip2 => Self::Bzip2,
|
|
OutputFileCompression::Zstd => Self::Zstd,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// One archive entry as raw bytes.
///
/// This is the message type sent from the reader to the decoder and from the
/// encoder to the writer.
#[derive(Debug)]
struct FileData {
    /// Name of the entry inside the zip archive.
    name: String,
    /// Entry contents as bytes.
    data: Vec<u8>,
}
|
|
|
|
/// One archive entry as decoded text.
///
/// This is the message type sent from the decoder to the encoder.
#[derive(Debug)]
struct FileText {
    /// Name of the entry inside the zip archive.
    name: String,
    /// Entry contents decoded into a Rust string.
    text: String,
}
|
|
|
|
fn reader_task(tx: mpsc::Sender<FileData>, input_filename: String, regex: Regex) {
|
|
let zip_file = fs::File::open(input_filename).unwrap();
|
|
let mut archive = zip::ZipArchive::new(zip_file).unwrap();
|
|
|
|
let mut source_files: Vec<String> = archive
|
|
.file_names()
|
|
.filter(|name| regex.is_match(name))
|
|
.map(|s| s.to_string())
|
|
.collect();
|
|
|
|
println!("processing {} files...", source_files.len());
|
|
|
|
source_files.sort();
|
|
|
|
for name in source_files {
|
|
let mut file = archive.by_name(name.as_str()).unwrap();
|
|
let mut data = Vec::with_capacity(file.size().try_into().unwrap());
|
|
file.read_to_end(&mut data).unwrap();
|
|
drop(file);
|
|
|
|
tx.send(FileData { name, data }).unwrap();
|
|
}
|
|
|
|
println!("read done ✅");
|
|
}
|
|
|
|
fn decoder_task(rx: mpsc::Receiver<FileData>, tx: mpsc::Sender<FileText>, encoding: EncodingRef) {
|
|
while let Ok(FileData { name, data }) = rx.recv() {
|
|
let text = encoding.decode(&data, DecoderTrap::Ignore).unwrap();
|
|
tx.send(FileText { name, text }).unwrap();
|
|
}
|
|
println!("decode done ✅");
|
|
}
|
|
|
|
fn encoder_task(rx: mpsc::Receiver<FileText>, tx: mpsc::Sender<FileData>, encoding: EncodingRef) {
|
|
while let Ok(FileText { name, text }) = rx.recv() {
|
|
let data = encoding.encode(text.as_str(), EncoderTrap::Ignore).unwrap();
|
|
tx.send(FileData { name, data }).unwrap();
|
|
}
|
|
println!("encode done ✅");
|
|
}
|
|
|
|
fn writer_task(
|
|
rx: mpsc::Receiver<FileData>,
|
|
output_filename: String,
|
|
compression: zip::CompressionMethod,
|
|
compression_level: i32,
|
|
) {
|
|
let options = zip::write::FileOptions::default()
|
|
.compression_method(compression)
|
|
.compression_level(Some(compression_level));
|
|
|
|
let mut outfile = fs::File::create(output_filename).expect("output file");
|
|
let mut zip_writer = ZipWriter::new(&mut outfile);
|
|
|
|
while let Ok(FileData { name, data }) = rx.recv() {
|
|
zip_writer.start_file(name, options).unwrap();
|
|
zip_writer.write_all(&data).unwrap();
|
|
}
|
|
zip_writer.finish().unwrap();
|
|
println!("write done ✅");
|
|
}
|
|
|
|
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
let args = Cli::parse();
|
|
|
|
let regex = Regex::new(&args.regex).expect("regex");
|
|
let encoding_input = encoding_from_whatwg_label(&args.from).expect("input encoding");
|
|
let encoding_output = encoding_from_whatwg_label(&args.to).expect("output encoding");
|
|
let compression: zip::CompressionMethod = args.compression.into();
|
|
let compression_level = args.compression_level;
|
|
let input_filename = args.src;
|
|
let output_filename = args.dst;
|
|
|
|
let (reader_tx, reader_rx) = mpsc::channel::<FileData>();
|
|
let (decoder_tx, decoder_rx) = mpsc::channel::<FileText>();
|
|
let (encoder_tx, encoder_rx) = mpsc::channel::<FileData>();
|
|
|
|
let handles = vec![
|
|
thread::spawn(move || reader_task(reader_tx, input_filename, regex)),
|
|
thread::spawn(move || decoder_task(reader_rx, decoder_tx, encoding_input)),
|
|
thread::spawn(move || encoder_task(decoder_rx, encoder_tx, encoding_output)),
|
|
thread::spawn(move || {
|
|
writer_task(encoder_rx, output_filename, compression, compression_level)
|
|
}),
|
|
];
|
|
|
|
for handle in handles {
|
|
handle.join().expect("thread paniced");
|
|
}
|
|
|
|
println!("all done ✅");
|
|
Ok(())
|
|
}
|