extern crate encoding; extern crate json; extern crate rayon; extern crate textstream; extern crate zip; use encoding::all::KOI8_R; use encoding::DecoderTrap; use rayon::prelude::*; use std::path::PathBuf; use std::{fs, io}; use textstream::TextReader; const BASE_FILENAME: &str = "baza.zip"; const OUTPUT_PATH: &str = "json"; #[derive(Debug, Clone, Copy)] enum KeywordType { Ignore, Global, QuestionPre, QuestionStart, QuestionContent, CurrentScope, } #[derive(Debug, Clone, Copy)] enum DataScope { Global, QuestionPre, QuestionContent, } struct Context { // global output value data: json::JsonValue, // temp questions array questions: json::JsonValue, cur_keyword_type: Option, // temp question value cur_question: json::JsonValue, // temp value for pre'question fields cur_question_pre: json::JsonValue, // scope for data fields cur_scope: DataScope, // curent json key cur_tag: String, // current json value cur_content: Vec, // need to push temp question value if true have_new_question: bool, // prev. keyword type last_keyword_type: Option, // prev. json key (used for store acummulated content when new keyword readed) last_tag: String, } // check questions before push trait PushIfValid { fn is_valid(&self) -> bool; fn push_if_valid(&mut self, value: json::JsonValue); } impl PushIfValid for json::JsonValue { fn is_valid(&self) -> bool { self.has_key("Вопрос") && self.has_key("Ответ") } fn push_if_valid(&mut self, value: json::JsonValue) { if value.is_valid() { self.push(value).unwrap_or(()) } } } impl Context { fn new() -> Context { Context { data: json::JsonValue::new_object(), questions: json::JsonValue::new_array(), cur_keyword_type: None, cur_question: json::JsonValue::new_object(), cur_question_pre: json::JsonValue::new_object(), cur_tag: String::new(), cur_content: Vec::::new(), cur_scope: DataScope::Global, have_new_question: false, last_keyword_type: None, last_tag: String::new(), } } } impl KeywordType { fn from(pattern: &str) -> KeywordType { use KeywordType::*; match pattern { "Мета:" => Ignore, "Чемпионат:" | "Пакет:" => Global, "Тур:" => QuestionPre, "Вопрос " | "Вопрос:" => QuestionStart, "Ответ:" | "Зачет:" => QuestionContent, _ => CurrentScope, // "URL:" | "Ссылка:" | "Дата:" | "Обработан:" | "Автор:" | "Редактор:" | "Копирайт:" | "Инфо:" | // "Тема:" | "Вид:" | "Тип:" | "Источник:" | "Рейтинг:" | "Комментарий:" | "Комментарии:" } } } fn parse_file(file: impl io::Read) -> Result> { let buf = io::BufReader::new(file); let reader = TextReader::new(buf, KOI8_R, DecoderTrap::Ignore); let patterns = vec![ "Чемпионат:", "Пакет:", "URL:", "Ссылка:", "Дата:", "Редактор:", "Обработан:", "Копирайт:", "Инфо:", "Тема:", "Вид:", "Тип:", "Тур:", "Мета:", "Вопрос ", "Вопрос:", "Ответ:", "Зачет:", "Источник:", "Рейтинг:", "Автор:", "Комментарий:", "Комментарии:", ]; let mut context = Context::new(); let mut ctx = &mut context; reader .lines() .map(|line| String::from(line.unwrap().trim())) .filter(|line| !line.is_empty()) // ignore empty lines .for_each(|line| { match patterns .iter() // find keyword .find(|&&pattern| line.starts_with(pattern) && line.ends_with(':')) { Some(pattern) => { use KeywordType::*; ctx.last_keyword_type = ctx.cur_keyword_type; ctx.last_tag = ctx.cur_tag.clone(); ctx.cur_keyword_type = Some(KeywordType::from(pattern)); ctx.cur_tag = pattern.replace(' ', "").replace(':', ""); // remember question id if let Some(QuestionStart) = ctx.cur_keyword_type { ctx.cur_question_pre["id"] = line.replace(':', "").as_str().into(); }; // apply accumulated content when new keyword found match ctx.last_keyword_type { Some(Global) => { ctx.cur_scope = DataScope::Global; ctx.data[&ctx.last_tag] = ctx.cur_content.join("\n").into() } Some(QuestionPre) => { ctx.cur_scope = DataScope::QuestionPre; ctx.cur_question_pre[&ctx.last_tag] = ctx.cur_content.join("\n").into(); } Some(QuestionStart) => { ctx.cur_scope = DataScope::QuestionContent; // store prev question before reading new if ctx.have_new_question { ctx.questions.push_if_valid(ctx.cur_question.clone()); } // prepare to read new question data with cur_question_pre values ctx.cur_question = ctx.cur_question_pre.clone(); ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into(); ctx.have_new_question = true; } Some(QuestionContent) => { ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into(); } Some(CurrentScope) => { // match value to store data let scope_data = match ctx.cur_scope { DataScope::Global => &mut ctx.data, DataScope::QuestionPre => &mut ctx.cur_question_pre, DataScope::QuestionContent => &mut ctx.cur_question, }; scope_data[&ctx.last_tag] = ctx.cur_content.join("\n").into(); } _ => (), //None or Ignore }; // clear content ctx.cur_content.clear(); } None => { // accumulate content if line is not a keyword ctx.cur_content.push(line); } } }); // finish reading last question if ctx.have_new_question && !ctx.cur_content.is_empty() { ctx.cur_question[&ctx.cur_tag] = ctx.cur_content.join("\n").into(); ctx.questions.push_if_valid(ctx.cur_question.clone()); ctx.have_new_question = false; } ctx.data["Вопросы"] = ctx.questions.clone(); Ok(ctx.data.clone()) } // split vector to a vector of [num] slices trait SplitTo { fn split_to(&self, num: usize) -> Vec<&[T]>; } impl SplitTo for Vec { fn split_to(&self, num: usize) -> Vec<&[T]> { let part_len = self.len() / num; let add_len = self.len() % num; let mut result = Vec::<&[T]>::with_capacity(num); if 0 == part_len { result.push(self); return result; } for i in 0..num { let size = if (num - 1) == i { part_len + add_len } else { part_len }; let start = part_len * i; result.push(&self[start..(start + size)]); } result } } fn process_files(files: &&[PathBuf]) { if files.is_empty() { return; } let start_file = files[0].to_str().unwrap(); println!("-> start from \"{}\" ({} files)", start_file, files.len()); let zip_file = fs::File::open(BASE_FILENAME).unwrap(); let zip_reader = io::BufReader::new(zip_file); let mut archive = zip::ZipArchive::new(zip_reader).unwrap(); files.iter().for_each(|name| { let name_str = name.to_str().unwrap(); // parse txt file let file = archive.by_name(name_str).unwrap(); let data = parse_file(file).unwrap(); // make output filename let mut outfilename = PathBuf::from(OUTPUT_PATH); outfilename.push(name); outfilename.set_extension("json"); // save json to file let mut outfile = fs::File::create(outfilename).unwrap(); data.write_pretty(&mut outfile, 1).unwrap(); }); println!("<- done {} files (from \"{}\")", files.len(), start_file); } fn main() -> Result<(), Box> { // open archive just to list files let zip_file = fs::File::open(BASE_FILENAME)?; let zip_reader = io::BufReader::new(zip_file); let mut archive = zip::ZipArchive::new(zip_reader)?; let source_files: Vec = (0..archive.len()) .map(|i| archive.by_index(i).unwrap().mangled_name()) .filter(|name| { // skip files without "txt" extension match name.extension() { Some(ext) => match ext.to_str() { Some(ext_str) => ext_str.eq_ignore_ascii_case("txt"), _ => false, // extension is not valid unicode or not txt }, _ => false, // no extension in filename } }) .collect(); drop(archive); // check output directory let out_dir: PathBuf = OUTPUT_PATH.into(); if out_dir.is_file() { return Err("output directory is file!".into()); } else if !out_dir.exists() { fs::create_dir_all(out_dir)?; }; println!("processing {} files with {} threads...", source_files.len(), rayon::current_num_threads()); // split vector and process its parts in parallel source_files.split_to(rayon::current_num_threads()) .par_iter() .for_each(process_files); println!("done"); Ok(()) }