ztb_recode/src/main.rs

178 lines
4.9 KiB
Rust

extern crate encoding;
extern crate textstream;
extern crate zip;
use clap::{Parser, ValueEnum};
use encoding::label::encoding_from_whatwg_label;
use encoding::Encoding;
use encoding::{DecoderTrap, EncoderTrap};
use regex::Regex;
use std::io::{Read, Seek, Write};
use std::path::PathBuf;
use std::{fs, io};
use textstream::TextReader;
use zip::write::FileOptions;
use zip::ZipWriter;
/// transcode txt files in zip archive
// NOTE: clap's derive macro turns the `///` comments on the fields below into `--help` text,
// so they are user-facing strings; implementation notes are kept in plain `//` comments.
#[derive(Parser, Debug)]
#[clap(author, version, about, long_about = None)]
#[clap(propagate_version = true)]
struct Cli {
/// input encoding
// Encoding label; resolved in `main` with `encoding_from_whatwg_label`.
#[clap(short, long, default_value = "koi8-r")]
from: String,
/// output encoding
// Encoding label; resolved in `main` with `encoding_from_whatwg_label`.
#[clap(short, long, default_value = "utf8")]
to: String,
/// output compression method
// Converted into `zip::CompressionMethod` in `main` before being passed to `process_files`.
#[clap(arg_enum, short, long, default_value = "zstd")]
compression: OutputFileCompression,
/// filename filter (regex)
// Compiled with `Regex::new` in `main`; only archive entries whose name matches are processed.
#[clap(short, long, default_value = r#".*\.txt$"#)]
regex: String,
/// input zip filename
// No `short`/`long` here, so this is a positional argument; opened for reading in `main`.
#[clap(value_parser, default_value = "baza.zip")]
src: String,
/// output zip filename
// No `short`/`long` here, so this is a positional argument; created by `process_files`.
#[clap(value_parser, default_value = "baza_utf8.zip")]
dst: String,
}
/// output file compression method
// Selected on the command line through the `compression` field of `Cli` and converted to
// `zip::CompressionMethod` by the `From` impl that follows this enum.
// The `///` comments on the variants are left untouched because the `ValueEnum` derive may
// surface them as help text for the possible values.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum, Debug)]
enum OutputFileCompression {
/// Store the file as is
Store,
/// Compress the file using Deflate
Deflate,
/// Compress the file using BZIP2
Bzip2,
/// Compress the file using ZStandard
Zstd,
}
/// Bridges the command-line compression choice to the `zip` crate's own method enum.
impl From<OutputFileCompression> for zip::CompressionMethod {
    /// Returns the `zip::CompressionMethod` variant that corresponds to `choice`.
    ///
    /// The mapping is one-to-one: `Store` -> `Stored`, `Deflate` -> `Deflated`,
    /// `Bzip2` -> `Bzip2`, `Zstd` -> `Zstd`.
    fn from(choice: OutputFileCompression) -> Self {
        match choice {
            OutputFileCompression::Store => zip::CompressionMethod::Stored,
            OutputFileCompression::Deflate => zip::CompressionMethod::Deflated,
            OutputFileCompression::Bzip2 => zip::CompressionMethod::Bzip2,
            OutputFileCompression::Zstd => zip::CompressionMethod::Zstd,
        }
    }
}
/// Describes one entry to be written into the output zip archive.
///
/// Consumed by `write_file`, which starts a new zip entry from `name`/`options` and encodes
/// the text with `encoding`/`trap` before writing it.
struct OutFileDescr<'a> {
/// Entry name handed to `ZipWriter::start_file`.
name: String,
/// Zip file options; dereferenced (copied) when the entry is started.
options: &'a FileOptions,
/// Encoding used to turn the text into the bytes that are written.
encoding: &'a dyn Encoding,
/// Encoder trap passed to `Encoding::encode` together with the text.
trap: EncoderTrap,
}
/// Decodes everything readable from `file` into a `String` using encoding `enc`.
///
/// The input is wrapped in a `BufReader` and decoded through a `TextReader` configured with
/// `DecoderTrap::Ignore`.
///
/// # Errors
///
/// Returns the fixed message `"decode error"` when reading to the end fails; the underlying
/// error value is discarded.
fn read_file(file: impl Read, enc: &dyn Encoding) -> Result<String, Box<dyn std::error::Error>> {
    let mut text = String::new();
    let mut decoder = TextReader::new(io::BufReader::new(file), enc, DecoderTrap::Ignore);
    decoder
        .read_to_end(&mut text)
        .map_err(|_| "decode error")?;
    Ok(text)
}
fn write_file<T: Seek + Write>(
arc: &mut ZipWriter<T>,
f: OutFileDescr,
data: String,
) -> Result<(), Box<dyn std::error::Error>> {
arc.start_file(f.name, *f.options)?;
let data = f.encoding.encode(data.as_str(), f.trap)?;
arc.write_all(&data)?;
Ok(())
}
fn process_files<R: Read + Seek>(
archive: &mut zip::ZipArchive<R>,
output_filename: String,
enc_input: &(dyn Encoding + Send + Sync),
enc_output: &(dyn Encoding + Send + Sync),
compression: zip::CompressionMethod,
files: &[PathBuf],
) -> Result<(), Box<dyn std::error::Error>> {
if files.is_empty() {
return Ok(());
}
let options = zip::write::FileOptions::default()
.compression_method(compression)
.compression_level(Some(9));
let file_def = OutFileDescr {
name: String::new(),
options: &options,
encoding: enc_output,
trap: EncoderTrap::Ignore,
};
let mut outfile = fs::File::create(output_filename)?;
let mut zip_writer = ZipWriter::new(&mut outfile);
for name in files {
let name_str = name.to_str().ok_or("name to str err")?;
// read string from file in input zip
let file = archive.by_name(name_str)?;
let data = read_file(file, enc_input)?;
// write string to file in output zip
let out_file = OutFileDescr {
name: name_str.to_string(),
..file_def
};
write_file(&mut zip_writer, out_file, data)?;
}
zip_writer.finish()?;
Ok(())
}
/// Entry point: parses the command line, selects the matching entries of the input archive
/// and transcodes them into the output archive.
///
/// # Errors
///
/// Returns an error (instead of panicking) when the filename regex is invalid, an encoding
/// label is unknown, the input archive cannot be opened or listed, or processing fails.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let args = Cli::parse();

    // All of these come straight from user input, so report problems as errors.
    let regex = Regex::new(&args.regex)?;
    let encoding_input = encoding_from_whatwg_label(&args.from)
        .ok_or_else(|| format!("unknown input encoding: {}", args.from))?;
    let encoding_output = encoding_from_whatwg_label(&args.to)
        .ok_or_else(|| format!("unknown output encoding: {}", args.to))?;
    let compression: zip::CompressionMethod = args.compression.into();

    let zip_file = fs::File::open(args.src)?;
    let zip_reader = io::BufReader::new(zip_file);
    let mut archive = zip::ZipArchive::new(zip_reader)?;

    // First pass: list the entries and keep the ones matching the filter.
    // The exact stored name is kept (not `mangled_name()`), because `process_files` looks
    // entries up again with `by_name`, which needs the name as recorded in the archive; a
    // sanitised or platform-specific path would not be found.
    let mut source_files: Vec<PathBuf> = Vec::new();
    for index in 0..archive.len() {
        let entry = archive.by_index(index)?;
        if regex.is_match(entry.name()) {
            source_files.push(PathBuf::from(entry.name()));
        }
    }

    println!("processing {} files...", source_files.len());
    source_files.sort();

    process_files(
        &mut archive,
        args.dst,
        encoding_input,
        encoding_output,
        compression,
        &source_files,
    )?;
    println!("done");
    Ok(())
}