chgk_txt2json/src/main.rs

extern crate async_zip;
extern crate json;
extern crate tokio;

use async_zip::read::fs::ZipFileReader;
use async_zip::write::{EntryOptions, ZipFileWriter};
use async_zip::Compression;
use std::path::PathBuf;
use std::str::FromStr;
use tokio::fs;
use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader, BufWriter};
use tokio::sync::mpsc::{self, UnboundedReceiver, UnboundedSender};

/// Path of the zip archive that is opened for reading the source entries.
const INPUT_FILENAME: &str = "baza_utf8.zip";
/// Path of the zip archive that is created for the converted entries.
const OUTPUT_FILENAME: &str = "json.zip";
/// Compression method passed to every entry written into the output archive.
const OUTPUT_COMPRESSION: Compression = Compression::Zstd;

/// Classification of a keyword line; it decides where the text that follows
/// the keyword is stored.
#[derive(Clone, Copy, Debug)]
enum KeywordType {
    /// The keyword's content is not stored anywhere.
    Ignore,
    /// The content belongs to the package-level object.
    Global,
    /// The content is kept aside and shared by the questions that follow.
    QuestionPre,
    /// The keyword opens a new question.
    QuestionStart,
    /// The content always belongs to the question being collected.
    QuestionContent,
    /// The content goes to whichever scope is active at that moment.
    CurrentScope,
}

impl FromStr for KeywordType {
    type Err = ();

    /// Maps a keyword pattern to its type.
    ///
    /// The conversion never fails: every pattern without a dedicated rule is
    /// reported as [`KeywordType::CurrentScope`].
    fn from_str(pattern: &str) -> Result<Self, Self::Err> {
        let kind = match pattern {
            "Мета:" => Self::Ignore,
            "Чемпионат:" | "Пакет:" => Self::Global,
            "Тур:" => Self::QuestionPre,
            "Вопрос " | "Вопрос:" => Self::QuestionStart,
            "Ответ:" | "Зачет:" => Self::QuestionContent,
            // Everything else ends up here, e.g.:
            // "URL:" | "Ссылка:" | "Дата:" | "Обработан:" | "Автор:" | "Редактор:" | "Копирайт:" |
            // "Инфо:" | "Тема:" | "Вид:" | "Тип:" | "Источник:" | "Рейтинг:" | "Комментарий:" |
            // "Комментарии:"
            _ => Self::CurrentScope,
        };
        Ok(kind)
    }
}

/// Destination object for a parsed field.
#[derive(Clone, Copy, Debug)]
enum DataScope {
    /// The package-level object.
    Global,
    /// The object holding fields collected ahead of a question.
    QuestionPre,
    /// The object of the question being collected.
    QuestionContent,
}

/// Mutable state of the line-by-line question parser.
struct QuestionsParser {
    // --- output being assembled ---
    /// Package-level output object.
    data: json::JsonValue,
    /// Array of the questions accepted so far.
    questions: json::JsonValue,
    /// Object of the question currently being collected.
    cur_question: json::JsonValue,
    /// Fields collected ahead of a question; cloned to seed `cur_question`.
    cur_question_pre: json::JsonValue,
    /// `true` once a question has been started and may need to be pushed.
    have_new_question: bool,

    // --- keyword currently being read ---
    /// Type of the keyword whose content lines are being accumulated.
    cur_keyword_type: Option<KeywordType>,
    /// JSON key derived from the current keyword.
    cur_tag: String,
    /// Content lines accumulated since the current keyword.
    cur_content: Vec<String>,
    /// Scope that receives fields of scope-neutral keywords.
    cur_scope: DataScope,

    // --- keyword that has just been closed ---
    /// Type of the previous keyword.
    last_keyword_type: Option<KeywordType>,
    /// JSON key of the previous keyword; the accumulated content is stored
    /// under this key when a new keyword is read.
    last_tag: String,
}

/// Text questions parser.
///
/// Lines are fed one at a time through [`QuestionsParser::parse_line`]. A line
/// that starts with one of [`QuestionsParser::PATTERNS`] and ends with `':'`
/// is a keyword; every other line is content of the most recent keyword. The
/// content of a keyword is stored only when the *next* keyword (or the end of
/// input, see [`QuestionsParser::finish`]) is reached.
impl QuestionsParser {
    /// Prefixes that mark a keyword line.
    const PATTERNS: &'static [&'static str] = &[
        "Чемпионат:",
        "Пакет:",
        "URL:",
        "Ссылка:",
        "Дата:",
        "Редактор:",
        "Обработан:",
        "Копирайт:",
        "Инфо:",
        "Тема:",
        "Вид:",
        "Тип:",
        "Тур:",
        "Мета:",
        "Вопрос ",
        "Вопрос:",
        "Ответ:",
        "Зачет:",
        "Источник:",
        "Рейтинг:",
        "Автор:",
        "Комментарий:",
        "Комментарии:",
    ];

    /// Creates an empty parser whose active scope is [`DataScope::Global`].
    pub fn new() -> QuestionsParser {
        QuestionsParser {
            data: json::JsonValue::new_object(),
            questions: json::JsonValue::new_array(),
            cur_keyword_type: None,
            cur_question: json::JsonValue::new_object(),
            cur_question_pre: json::JsonValue::new_object(),
            cur_tag: String::new(),
            cur_content: Vec::<String>::new(),
            cur_scope: DataScope::Global,
            have_new_question: false,
            last_keyword_type: None,
            last_tag: String::new(),
        }
    }

    /// Returns the accumulated content lines joined with `'\n'`.
    fn get_current_content(&self) -> String {
        self.cur_content.join("\n")
    }

    /// Discards the accumulated content lines.
    fn clear_current_content(&mut self) {
        self.cur_content.clear()
    }

    /// Appends one line to the accumulated content.
    fn append_to_current_content(&mut self, line: String) {
        self.cur_content.push(line);
    }

    /// Returns `true` when the current question has both required fields.
    fn is_current_question_valid(&self) -> bool {
        self.cur_question.has_key("Вопрос") && self.cur_question.has_key("Ответ")
    }

    /// Moves the current question into the output array if it is valid.
    ///
    /// The current question is re-seeded from the pre-question fields in
    /// either case, so an invalid question is dropped instead of leaking its
    /// fields into the question that follows it.
    fn add_cur_question(&mut self) {
        let is_valid = self.is_current_question_valid();
        let finished = std::mem::replace(&mut self.cur_question, self.cur_question_pre.clone());
        if is_valid {
            self.questions
                .push(finished)
                .expect("`questions` is always a JSON array");
        }
    }

    /// Stores the accumulated content under the last keyword's key in `scope`
    /// and clears the accumulator.
    fn apply_content_to(&mut self, scope: DataScope) {
        let content = self.get_current_content();
        let scope_data = match scope {
            DataScope::Global => &mut self.data,
            DataScope::QuestionPre => &mut self.cur_question_pre,
            DataScope::QuestionContent => &mut self.cur_question,
        };
        scope_data[&self.last_tag] = content.into();
        self.clear_current_content();
    }

    /// Stores the accumulated content under the last keyword's key in the
    /// active scope.
    fn apply_content_to_cur_scope(&mut self) {
        self.apply_content_to(self.cur_scope);
    }

    /// Makes `scope` the active scope.
    fn set_scope(&mut self, scope: DataScope) {
        self.cur_scope = scope;
    }

    /// Makes `scope` the active scope and stores the accumulated content there.
    fn set_scope_and_apply(&mut self, scope: DataScope) {
        self.set_scope(scope);
        self.apply_content_to_cur_scope();
    }

    /// Pushes the previous question (if any) and starts collecting a new one.
    fn start_new_question(&mut self) {
        if self.have_new_question {
            // Also re-seeds `cur_question` from the pre-question fields.
            self.add_cur_question();
        } else {
            // The very first question must start from the pre-question fields
            // as well, otherwise it would lack e.g. its "id".
            self.cur_question = self.cur_question_pre.clone();
        }
        self.have_new_question = true;
    }

    /// Routes the accumulated content according to the type of the keyword
    /// that has just been closed.
    fn apply_content_for_last_keyword(&mut self) {
        match self.last_keyword_type {
            Some(KeywordType::Global) => self.set_scope_and_apply(DataScope::Global),
            Some(KeywordType::QuestionPre) => self.set_scope_and_apply(DataScope::QuestionPre),
            Some(KeywordType::QuestionStart) => {
                self.start_new_question();
                self.set_scope_and_apply(DataScope::QuestionContent);
            }
            Some(KeywordType::QuestionContent) => {
                self.apply_content_to(DataScope::QuestionContent)
            }
            Some(KeywordType::CurrentScope) => self.apply_content_to_cur_scope(),
            // Ignored content and text that precedes the first keyword must be
            // dropped here; otherwise it would be prepended to the next field.
            Some(KeywordType::Ignore) | None => self.clear_current_content(),
        }
    }

    /// Shifts the current keyword (type and key) into the "last" slots and
    /// installs `keyword` as the current one.
    ///
    /// The JSON key is the keyword without trailing whitespace and `':'`.
    fn set_new_keyword(&mut self, keyword: &str) {
        // `KeywordType::from_str` has a catch-all arm, so parsing cannot fail.
        self.last_keyword_type =
            std::mem::replace(&mut self.cur_keyword_type, Some(keyword.parse().unwrap()));
        self.last_tag = std::mem::replace(
            &mut self.cur_tag,
            keyword.trim_end().trim_end_matches(':').to_string(),
        );
    }

    /// Handles a keyword line: `line` is the full line, `keyword` the matched
    /// pattern.
    fn on_keyword_match(&mut self, line: &str, keyword: &str) {
        self.set_new_keyword(keyword);

        // The full header line (without the trailing ':') is the question id.
        if let Some(KeywordType::QuestionStart) = self.cur_keyword_type {
            self.cur_question_pre["id"] = line.trim_end().trim_end_matches(':').into();
        };

        self.apply_content_for_last_keyword();
    }

    /// Parses the next line of input.
    ///
    /// A keyword is recognised only when the line ends with `':'`, so the
    /// caller should pass the line without trailing whitespace.
    pub fn parse_line(&mut self, line: &str) {
        let keyword = QuestionsParser::PATTERNS
            .iter()
            .find(|&&pattern| line.starts_with(pattern) && line.ends_with(':'));
        match keyword {
            Some(pattern) => self.on_keyword_match(line, pattern),
            None => self.append_to_current_content(line.to_string()),
        }
    }

    /// Finishes parsing: flushes the pending content, pushes the last question
    /// and stores the question array under `"Вопросы"` in the output object.
    pub fn finish(&mut self) {
        // End of input closes the current keyword exactly like a new keyword
        // would, so its content is routed by the same scope rules.
        self.last_keyword_type = self.cur_keyword_type.take();
        self.last_tag = std::mem::take(&mut self.cur_tag);
        self.apply_content_for_last_keyword();

        // The last question is pushed even when its final keyword had no
        // content lines.
        if self.have_new_question {
            self.add_cur_question();
            self.have_new_question = false;
        }
        self.data["Вопросы"] = std::mem::replace(&mut self.questions, json::JsonValue::new_array());
    }

    /// Consumes the parser and returns the output object.
    pub fn get_parsed(self) -> json::JsonValue {
        self.data
    }
}

/// A named piece of text passed from the converter to the zip writer.
#[derive(Debug)]
struct FileText {
    /// Entry name the text originates from.
    name: String,
    /// Text to be written for that entry.
    text: String,
}

/// Message sent by the zip reader to the converter.
#[derive(Debug)]
enum TextReaderMessage {
    /// One line of the entry currently being read.
    NextLine(String),
    /// The entry with the given name has been read completely.
    EndOfFile(String),
}

/// Reads every `.txt` entry of the input archive and sends it down `tx`.
///
/// Each line is sent as [`TextReaderMessage::NextLine`]; after the last line
/// of an entry a [`TextReaderMessage::EndOfFile`] with the entry name follows.
/// Any I/O or channel failure panics.
async fn zip_text_reader(tx: UnboundedSender<TextReaderMessage>) {
    let archive = ZipFileReader::new(INPUT_FILENAME).await.expect("open zip");

    // (index, name) of every non-directory entry whose name ends with ".txt"
    let txt_entries = archive
        .entries()
        .iter()
        .enumerate()
        .filter_map(|(idx, entry)| {
            if !entry.dir() && entry.name().ends_with(".txt") {
                Some((idx, entry.name().to_string()))
            } else {
                None
            }
        });

    for (idx, file_name) in txt_entries {
        let reader = archive.entry_reader(idx).await.expect("read entry");
        let mut lines = BufReader::new(reader).lines();
        loop {
            match lines.next_line().await.expect("next line") {
                Some(text) => tx
                    .send(TextReaderMessage::NextLine(text))
                    .expect("send line"),
                None => break,
            }
        }
        tx.send(TextReaderMessage::EndOfFile(file_name))
            .expect("send end");
    }

    println!("read done ✅");
}

/// Converts the line stream from `rx` into pretty-printed JSON documents.
///
/// Lines are trimmed and blank ones are skipped before they reach the parser.
/// On every [`TextReaderMessage::EndOfFile`] the parsed data is serialised
/// with a two-space indent, sent down `tx` together with the entry name, and a
/// fresh parser takes over for the next entry.
async fn questions_converter(
    mut rx: UnboundedReceiver<TextReaderMessage>,
    tx: UnboundedSender<FileText>,
) {
    let mut parser = QuestionsParser::new();
    while let Some(message) = rx.recv().await {
        match message {
            TextReaderMessage::NextLine(raw) => {
                let trimmed = raw.trim();
                if !trimmed.is_empty() {
                    parser.parse_line(trimmed);
                }
            }
            TextReaderMessage::EndOfFile(name) => {
                // Swap in a new parser and finalise the one that was in use.
                let mut completed = std::mem::replace(&mut parser, QuestionsParser::new());
                completed.finish();
                let text = completed.get_parsed().pretty(2);
                tx.send(FileText { name, text }).expect("send json");
            }
        }
    }
    println!("convert done ✅");
}

/// Writes every [`FileText`] received from `rx` into the output archive.
///
/// The entry name is the received name with its extension replaced by
/// `json`. Any I/O failure panics.
async fn zip_json_writer(mut rx: UnboundedReceiver<FileText>) {
    const BUFFER_CAPACITY: usize = 100 * 1024 * 1024;

    let out_file = fs::File::create(OUTPUT_FILENAME)
        .await
        .expect("create file");
    let mut buffered = BufWriter::with_capacity(BUFFER_CAPACITY, out_file);
    let mut zip = ZipFileWriter::new(&mut buffered);

    while let Some(item) = rx.recv().await {
        let FileText { name, text } = item;

        // same name, ".json" extension
        let entry_name = PathBuf::from(name)
            .with_extension("json")
            .to_str()
            .unwrap()
            .to_string();
        let options = EntryOptions::new(entry_name, OUTPUT_COMPRESSION);

        zip.write_entry_whole(options, text.as_bytes())
            .await
            .expect("write entry");
    }

    // Close the archive first, then push the buffered bytes to the file.
    zip.close().await.expect("close writer");
    buffered.flush().await.expect("flush buffer");

    println!("write done ✅");
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // check output filename
    match fs::metadata(OUTPUT_FILENAME).await {
        Ok(x) if x.is_dir() => return Err("output file is a directory!".into()),
        _ => (),
    };

    let (reader_tx, reader_rx) = mpsc::unbounded_channel::<TextReaderMessage>();
    let (json_tx, json_rx) = mpsc::unbounded_channel::<FileText>();

    tokio::try_join!(
        tokio::spawn(zip_text_reader(reader_tx)),
        tokio::spawn(questions_converter(reader_rx, json_tx)),
        tokio::spawn(zip_json_writer(json_rx))
    )?;

    println!("all done ✅");
    Ok(())
}