// Parse the baza_utf8.zip question archive into per-file JSON documents.

use async_zip::read::fs::ZipFileReader;
use std::path::PathBuf;
use std::str::FromStr;
use tokio::io::{AsyncBufReadExt, AsyncReadExt, AsyncWriteExt, BufReader};
use tokio::{fs, task};

const BASE_FILENAME: &str = "baza_utf8.zip";
const OUTPUT_PATH: &str = "json";

#[derive(Debug, Clone, Copy)]
enum KeywordType {
    Ignore,
    Global,
    QuestionPre,
    QuestionStart,
    QuestionContent,
    CurrentScope,
}

#[derive(Debug, Clone, Copy)]
enum DataScope {
    Global,
    QuestionPre,
    QuestionContent,
}

struct Context {
    // global output value
    data: json::JsonValue,
    // temporary questions array
    questions: json::JsonValue,
    cur_keyword_type: Option<KeywordType>,
    // temporary question value
    cur_question: json::JsonValue,
    // temporary value for fields that precede a question
    cur_question_pre: json::JsonValue,
    // scope for data fields
    cur_scope: DataScope,
    // current json key
    cur_tag: String,
    // current json value
    cur_content: Vec<String>,
    // the temporary question value needs to be pushed if true
    have_new_question: bool,
    // previous keyword type
    last_keyword_type: Option<KeywordType>,
    // previous json key (used to store accumulated content when a new keyword is read)
    last_tag: String,
}

// check questions before pushing them
trait PushIfValid {
    fn is_valid(&self) -> bool;
    fn push_if_valid(&mut self, value: json::JsonValue);
}

impl PushIfValid for json::JsonValue {
    fn is_valid(&self) -> bool {
        self.has_key("Вопрос") && self.has_key("Ответ")
    }
    fn push_if_valid(&mut self, value: json::JsonValue) {
        if value.is_valid() {
            self.push(value).unwrap_or(())
        }
    }
}

impl Context {
    fn new() -> Context {
        Context {
            data: json::JsonValue::new_object(),
            questions: json::JsonValue::new_array(),
            cur_keyword_type: None,
            cur_question: json::JsonValue::new_object(),
            cur_question_pre: json::JsonValue::new_object(),
            cur_tag: String::new(),
            cur_content: Vec::new(),
            cur_scope: DataScope::Global,
            have_new_question: false,
            last_keyword_type: None,
            last_tag: String::new(),
        }
    }
}

impl FromStr for KeywordType {
    type Err = ();
    fn from_str(pattern: &str) -> Result<Self, Self::Err> {
        use KeywordType::*;
        Ok(match pattern {
            "Мета:" => Ignore,
            "Чемпионат:" | "Пакет:" => Global,
            "Тур:" => QuestionPre,
            "Вопрос " | "Вопрос:" => QuestionStart,
            "Ответ:" | "Зачет:" => QuestionContent,
            // everything else falls through to the active scope:
            // "URL:" | "Ссылка:" | "Дата:" | "Обработан:" | "Автор:" | "Редактор:" |
            // "Копирайт:" | "Инфо:" | "Тема:" | "Вид:" | "Тип:" |
            // "Источник:" | "Рейтинг:" | "Комментарий:" | "Комментарии:"
            _ => CurrentScope,
        })
    }
}
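// A minimal sketch of how the keyword classification above can be checked;
// the assertions merely restate the match arms and are illustrative, not
// exhaustive.
#[cfg(test)]
mod keyword_tests {
    use super::*;

    #[test]
    fn classifies_keywords() {
        assert!(matches!("Чемпионат:".parse::<KeywordType>(), Ok(KeywordType::Global)));
        assert!(matches!("Вопрос ".parse::<KeywordType>(), Ok(KeywordType::QuestionStart)));
        assert!(matches!("Мета:".parse::<KeywordType>(), Ok(KeywordType::Ignore)));
        // unlisted keywords are stored into whatever scope is currently active
        assert!(matches!("Источник:".parse::<KeywordType>(), Ok(KeywordType::CurrentScope)));
    }
}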
async fn parse_file(
    entry_reader: impl AsyncReadExt + Unpin,
) -> Result<json::JsonValue, Box<dyn std::error::Error>> {
    let buf_reader = BufReader::new(entry_reader);
    let mut lines = buf_reader.lines();
    let patterns = vec![
        "Чемпионат:", "Пакет:", "URL:", "Ссылка:", "Дата:", "Редактор:",
        "Обработан:", "Копирайт:", "Инфо:", "Тема:", "Вид:", "Тип:",
        "Тур:", "Мета:", "Вопрос ", "Вопрос:", "Ответ:", "Зачет:",
        "Источник:", "Рейтинг:", "Автор:", "Комментарий:", "Комментарии:",
    ];
    let mut context = Context::new();
    let ctx = &mut context;
    while let Some(line_r) = lines.next_line().await? {
        let line = line_r.trim();
        if line.is_empty() {
            continue;
        }
        let line = line.to_string();
        // check whether the line starts with a known keyword
        match patterns
            .iter()
            .find(|&&pattern| line.starts_with(pattern) && line.ends_with(':'))
        {
            Some(pattern) => {
                use KeywordType::*;
                ctx.last_keyword_type = ctx.cur_keyword_type;
                ctx.last_tag = ctx.cur_tag.clone();
                ctx.cur_keyword_type = Some(pattern.parse().unwrap());
                ctx.cur_tag = pattern.replace(' ', "").replace(':', "");
                // remember the question id
                if let Some(QuestionStart) = ctx.cur_keyword_type {
                    ctx.cur_question_pre["id"] = line.replace(':', "").as_str().into();
                };
                // apply the accumulated content when a new keyword is found
                match ctx.last_keyword_type {
                    Some(Global) => {
                        ctx.cur_scope = DataScope::Global;
                        ctx.data[&ctx.last_tag] = ctx.cur_content.join("\n").into()
                    }
                    Some(QuestionPre) => {
                        ctx.cur_scope = DataScope::QuestionPre;
                        ctx.cur_question_pre[&ctx.last_tag] = ctx.cur_content.join("\n").into();
                    }
                    Some(QuestionStart) => {
                        ctx.cur_scope = DataScope::QuestionContent;
                        // store the previous question before reading a new one
                        if ctx.have_new_question {
                            ctx.questions.push_if_valid(ctx.cur_question.clone());
                        }
                        // prepare to read new question data, seeded with the cur_question_pre values
                        ctx.cur_question = ctx.cur_question_pre.clone();
                        ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
                        ctx.have_new_question = true;
                    }
                    Some(QuestionContent) => {
                        ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
                    }
                    Some(CurrentScope) => {
                        // pick the json value the field belongs to
                        let scope_data = match ctx.cur_scope {
                            DataScope::Global => &mut ctx.data,
                            DataScope::QuestionPre => &mut ctx.cur_question_pre,
                            DataScope::QuestionContent => &mut ctx.cur_question,
                        };
                        scope_data[&ctx.last_tag] = ctx.cur_content.join("\n").into();
                    }
                    _ => (), // None or Ignore
                };
                // clear the accumulated content
                ctx.cur_content.clear();
            }
            None => {
                // accumulate content while the line is not a keyword
                ctx.cur_content.push(line);
            }
        }
    }
    // finish reading the last question
    if ctx.have_new_question && !ctx.cur_content.is_empty() {
        ctx.cur_question[&ctx.cur_tag] = ctx.cur_content.join("\n").into();
        ctx.questions.push_if_valid(ctx.cur_question.clone());
        ctx.have_new_question = false;
    }
    ctx.data["Вопросы"] = ctx.questions.clone();
    Ok(ctx.data.clone())
}

async fn process_file(
    archive: &ZipFileReader,
    index: usize,
    name: String,
) -> Result<(), Box<dyn std::error::Error>> {
    let entry_reader = archive.entry_reader(index).await?;
    // build the output filename
    let mut outfilename = PathBuf::from(OUTPUT_PATH);
    outfilename.push(name);
    outfilename.set_extension("json");
    // parse the entry and save the json to a file;
    // pretty-printing is CPU-bound, so it runs on a blocking thread
    let new_data = parse_file(entry_reader).await?;
    let data_str = task::spawn_blocking(move || new_data.pretty(2)).await?;
    let mut outfile = fs::File::create(outfilename).await?;
    outfile.write_all(data_str.as_bytes()).await?;
    Ok(())
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // open the archive and collect the entries to process
    let archive = ZipFileReader::new(String::from(BASE_FILENAME)).await?;
    let source_files: Vec<(usize, String)> = archive
        .entries()
        .iter()
        .enumerate()
        .filter(|item| !item.1.dir())
        // skip files without a "txt" extension
        .filter(|item| item.1.name().ends_with(".txt"))
        .map(|item| (item.0, item.1.name().to_string()))
        .collect();
    // make sure the output directory exists
    match fs::metadata(OUTPUT_PATH).await {
        Err(_) => fs::create_dir_all(OUTPUT_PATH).await?,
        Ok(x) if x.is_file() => return Err("output path is a file!".into()),
        _ => (),
    };
    println!("processing {} files ...", source_files.len());
    for (index, name) in source_files {
        process_file(&archive, index, name).await?;
    }
    println!("done");
    Ok(())
}
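// A minimal sketch of exercising parse_file without the archive: tokio
// implements AsyncRead for &[u8], so an in-memory sample can stand in for a
// zip entry. The sample text below is invented for illustration and only
// mimics the keyword layout expected from baza_utf8.zip.
#[cfg(test)]
mod parse_tests {
    use super::*;

    #[tokio::test]
    async fn parses_minimal_package() {
        let sample = "Чемпионат:\nТестовый турнир\n\n\
                      Вопрос 1:\nТекст вопроса\n\n\
                      Ответ:\nТекст ответа\n";
        let data = parse_file(sample.as_bytes()).await.unwrap();
        assert_eq!(data["Чемпионат"], "Тестовый турнир");
        assert_eq!(data["Вопросы"].len(), 1);
        assert_eq!(data["Вопросы"][0]["id"], "Вопрос 1");
        assert_eq!(data["Вопросы"][0]["Ответ"], "Текст ответа");
    }
}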