extern crate async_zip; extern crate json; extern crate tokio; use async_zip::read::fs::ZipFileReader; use async_zip::write::{EntryOptions, ZipFileWriter}; use async_zip::Compression; use std::path::PathBuf; use std::str::FromStr; use tokio::fs; use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader, BufWriter}; use tokio::sync::mpsc::{self, UnboundedReceiver, UnboundedSender}; const INPUT_FILENAME: &str = "baza_utf8.zip"; const OUTPUT_FILENAME: &str = "json.zip"; const OUTPUT_COMPRESSION: Compression = Compression::Zstd; #[derive(Debug, Clone, Copy)] enum KeywordType { Ignore, Global, QuestionPre, QuestionStart, QuestionContent, CurrentScope, } impl FromStr for KeywordType { type Err = (); fn from_str(pattern: &str) -> Result { use KeywordType::*; Ok(match pattern { "Мета:" => Ignore, "Чемпионат:" | "Пакет:" => Global, "Тур:" => QuestionPre, "Вопрос " | "Вопрос:" => QuestionStart, "Ответ:" | "Зачет:" => QuestionContent, _ => CurrentScope, // "URL:" | "Ссылка:" | "Дата:" | "Обработан:" | "Автор:" | "Редактор:" | "Копирайт:" | "Инфо:" | // "Тема:" | "Вид:" | "Тип:" | "Источник:" | "Рейтинг:" | "Комментарий:" | "Комментарии:" }) } } #[derive(Debug, Clone, Copy)] enum DataScope { Global, QuestionPre, QuestionContent, } struct QuestionsParser { // global output value data: json::JsonValue, // temp questions array questions: json::JsonValue, cur_keyword_type: Option, // temp question value cur_question: json::JsonValue, // temp value for pre'question fields cur_question_pre: json::JsonValue, // scope for data fields cur_scope: DataScope, // curent json key cur_tag: String, // current json value cur_content: Vec, // need to push temp question value if true have_new_question: bool, // prev. keyword type last_keyword_type: Option, // prev. json key (used for store acummulated content when new keyword readed) last_tag: String, } /// Text questions parser impl QuestionsParser { const PATTERNS: &'static [&'static str] = &[ "Чемпионат:", "Пакет:", "URL:", "Ссылка:", "Дата:", "Редактор:", "Обработан:", "Копирайт:", "Инфо:", "Тема:", "Вид:", "Тип:", "Тур:", "Мета:", "Вопрос ", "Вопрос:", "Ответ:", "Зачет:", "Источник:", "Рейтинг:", "Автор:", "Комментарий:", "Комментарии:", ]; /// create new parser pub fn new() -> QuestionsParser { QuestionsParser { data: json::JsonValue::new_object(), questions: json::JsonValue::new_array(), cur_keyword_type: None, cur_question: json::JsonValue::new_object(), cur_question_pre: json::JsonValue::new_object(), cur_tag: String::new(), cur_content: Vec::::new(), cur_scope: DataScope::Global, have_new_question: false, last_keyword_type: None, last_tag: String::new(), } } /// join current content lines fn get_current_content(&self) -> String { self.cur_content.join("\n") } /// clear current content fn clear_current_content(&mut self) { self.cur_content.clear() } /// add new line to current content fn append_to_current_content(&mut self, line: String) { self.cur_content.push(line); } /// check current question have required fields fn is_current_question_valid(&self) -> bool { self.cur_question.has_key("Вопрос") && self.cur_question.has_key("Ответ") } /// add current question to parsed array fn add_cur_question(&mut self) { if self.is_current_question_valid() { let current = std::mem::replace(&mut self.cur_question, self.cur_question_pre.clone()); self.questions.push(current).unwrap() } } /// set current content to last tag(keyword) to data scope fn apply_content_to(&mut self, scope: DataScope) { let content = self.get_current_content(); // match value to store data let scope_data = match scope { DataScope::Global => &mut self.data, DataScope::QuestionPre => &mut self.cur_question_pre, DataScope::QuestionContent => &mut self.cur_question, }; scope_data[&self.last_tag] = content.into(); self.clear_current_content(); } /// set current content to last tag(keyword) to current scope fn apply_content_to_cur_scope(&mut self) { self.apply_content_to(self.cur_scope); } /// set current scope fn set_scope(&mut self, scope: DataScope) { self.cur_scope = scope; } /// set current scope and set current content to last tag(keyword) to data scope fn set_scope_and_apply(&mut self, scope: DataScope) { self.set_scope(scope); self.apply_content_to_cur_scope(); } /// add last question (if have) and start collecting new one fn start_new_question(&mut self) { // store prev question before reading new if self.have_new_question { self.add_cur_question(); } self.have_new_question = true; } /// check last tag(keyword) and set current content to corresponding data scope fn apply_content_for_last_keyword(&mut self) { // apply accumulated content when new keyword found match self.last_keyword_type { Some(KeywordType::Global) => { self.set_scope_and_apply(DataScope::Global); } Some(KeywordType::QuestionPre) => { self.set_scope_and_apply(DataScope::QuestionPre); } Some(KeywordType::QuestionStart) => { self.start_new_question(); self.set_scope_and_apply(DataScope::QuestionContent); } Some(KeywordType::QuestionContent) => { self.apply_content_to(DataScope::QuestionContent); } Some(KeywordType::CurrentScope) => { self.apply_content_to_cur_scope(); } _ => (), //None or Ignore }; } /// set current keyword(tag) and type as last, and set new as current fn set_new_keyword(&mut self, keyword: &str) { self.last_keyword_type = std::mem::replace(&mut self.cur_keyword_type, Some(keyword.parse().unwrap())); self.last_tag = std::mem::replace( &mut self.cur_tag, keyword.trim_end().trim_end_matches(':').to_string(), ); } /// if line matched keyword fn on_keyword_match(&mut self, line: &str, keyword: &str) { self.set_new_keyword(keyword); // remember question id if let Some(KeywordType::QuestionStart) = self.cur_keyword_type { self.cur_question_pre["id"] = line.trim_end().trim_end_matches(':').into(); }; self.apply_content_for_last_keyword(); } /// parse next line pub fn parse_line(&mut self, line: &str) { match QuestionsParser::PATTERNS .iter() // find keyword .find(|&&pattern| line.starts_with(pattern) && line.ends_with(':')) { Some(pattern) => { self.on_keyword_match(line, pattern); } None => { self.append_to_current_content(line.to_string()); } } } /// finish parsing pub fn finish(&mut self) { if self.have_new_question && !self.cur_content.is_empty() { self.cur_question[&self.cur_tag] = self.get_current_content().into(); self.add_cur_question(); self.clear_current_content(); self.have_new_question = false; } self.data["Вопросы"] = std::mem::replace(&mut self.questions, json::JsonValue::new_array()); } /// get parsed data pub fn get_parsed(self) -> json::JsonValue { self.data } } #[derive(Debug)] struct FileText { name: String, text: String, } #[derive(Debug)] enum TextReaderMessage { NextLine(String), EndOfFile(String), } /// read txt files from zip and convert to json async fn zip_text_reader(tx: UnboundedSender) { // open archive just to list files let archive = ZipFileReader::new(INPUT_FILENAME).await.expect("open zip"); let source_files = archive .entries() .iter() .enumerate() .filter(|(_, entry)| !entry.dir()) .filter(|(_, entry)| { // skip files without "txt" extension entry.name().ends_with(".txt") }) .map(|(index, entry)| (index, entry.name().to_string())); // for (index, name) in source_files { let entry_reader = archive.entry_reader(index).await.expect("read entry"); let buf_reader = BufReader::new(entry_reader); let mut lines = buf_reader.lines(); while let Some(line) = lines.next_line().await.expect("next line") { tx.send(TextReaderMessage::NextLine(line)) .expect("send line"); } tx.send(TextReaderMessage::EndOfFile(name)) .expect("send end"); } println!("read done ✅"); } /// convert text questions to json format async fn questions_converter( mut rx: UnboundedReceiver, tx: UnboundedSender, ) { let mut parser = QuestionsParser::new(); while let Some(msg) = rx.recv().await { match msg { TextReaderMessage::NextLine(line) => { let line = line.trim(); if line.is_empty() { continue; } parser.parse_line(line); } TextReaderMessage::EndOfFile(name) => { parser.finish(); let data_json = parser.get_parsed(); let text = data_json.pretty(2); tx.send(FileText { name, text }).expect("send json"); parser = QuestionsParser::new(); } } } println!("convert done ✅"); } /// write json data to zip files async fn zip_json_writer(mut rx: UnboundedReceiver) { let file = fs::File::create(OUTPUT_FILENAME) .await .expect("create file"); let mut buf = BufWriter::with_capacity(100 * 1024 * 1024, file); let mut writer = ZipFileWriter::new(&mut buf); while let Some(FileText { name, text: data }) = rx.recv().await { // make output filename let mut outfilename = PathBuf::from(name); outfilename.set_extension("json"); let outfilename = outfilename.to_str().unwrap().to_string(); let opts = EntryOptions::new(outfilename, OUTPUT_COMPRESSION); // write new zip entry writer .write_entry_whole(opts, data.as_bytes()) .await .expect("write entry"); } writer.close().await.expect("close writer"); buf.flush().await.expect("flush buffer"); println!("write done ✅"); } #[tokio::main] async fn main() -> Result<(), Box> { // check output filename match fs::metadata(OUTPUT_FILENAME).await { Ok(x) if x.is_dir() => return Err("output file is a directory!".into()), _ => (), }; let (reader_tx, reader_rx) = mpsc::unbounded_channel::(); let (json_tx, json_rx) = mpsc::unbounded_channel::(); tokio::try_join!( tokio::spawn(zip_text_reader(reader_tx)), tokio::spawn(questions_converter(reader_rx, json_tx)), tokio::spawn(zip_json_writer(json_rx)) )?; println!("all done ✅"); Ok(()) }