chgk_txt2json/src/main.rs

extern crate encoding;
extern crate json;
extern crate rayon;
extern crate textstream;
extern crate zip;

use encoding::all::KOI8_R;
use encoding::DecoderTrap;
use rayon::prelude::*;
use std::path::PathBuf;
use std::str::FromStr;
use std::{fs, io};
use textstream::TextReader;

/// Path of the zip archive that holds the source text files.
const BASE_FILENAME: &str = "baza.zip";
/// Directory that receives the generated JSON files.
const OUTPUT_PATH: &str = "json";

/// Classification of a keyword line; it decides where the text that follows
/// the keyword is stored by the parser.
#[derive(Debug, Clone, Copy)]
enum KeywordType {
    /// Content following the keyword is discarded.
    Ignore,
    /// Content is stored in the top-level output object.
    Global,
    /// Content is stored in the template that is copied into each new question.
    QuestionPre,
    /// The keyword opens a new question; its content is stored in that question.
    QuestionStart,
    /// Content is stored in the question currently being read.
    QuestionContent,
    /// Content is stored in whichever place the current `DataScope` selects.
    CurrentScope,
}

/// Destination used for fields whose keyword type is `KeywordType::CurrentScope`.
#[derive(Debug, Clone, Copy)]
enum DataScope {
    /// The top-level output object.
    Global,
    /// The template of fields shared by the questions that follow.
    QuestionPre,
    /// The question currently being read.
    QuestionContent,
}

/// Mutable parser state carried across the lines of one input file.
struct Context {
    // top-level output object for the whole file
    data: json::JsonValue,
    // array collecting the finished questions
    questions: json::JsonValue,
    // type of the keyword read most recently
    cur_keyword_type: Option<KeywordType>,
    // question currently being filled in
    cur_question: json::JsonValue,
    // fields read before a question starts; copied into each new question
    cur_question_pre: json::JsonValue,
    // where fields of type `CurrentScope` are stored
    cur_scope: DataScope,
    // current JSON key (keyword with spaces and colons removed)
    cur_tag: String,
    // content lines accumulated for the current key
    cur_content: Vec<String>,
    // true once a question has been started, i.e. `cur_question` must be pushed
    have_new_question: bool,
    // type of the previous keyword
    last_keyword_type: Option<KeywordType>,
    // previous JSON key (used to store the accumulated content when a new keyword is read)
    last_tag: String,
}

/// Validation of question values before they are appended to a collection.
trait PushIfValid {
    /// Returns `true` when `self` is acceptable as a question.
    fn is_valid(&self) -> bool;
    /// Appends `value` to `self` only when `value.is_valid()` holds.
    fn push_if_valid(&mut self, value: json::JsonValue);
}

impl PushIfValid for json::JsonValue {
    /// A question is valid when it carries both the question text ("Вопрос")
    /// and the answer ("Ответ").
    fn is_valid(&self) -> bool {
        ["Вопрос", "Ответ"].iter().all(|key| self.has_key(key))
    }

    /// Appends `value` when it is a valid question; invalid values are dropped.
    /// A failing `push` (e.g. `self` is not an array) is deliberately ignored.
    fn push_if_valid(&mut self, value: json::JsonValue) {
        if !value.is_valid() {
            return;
        }
        let _ = self.push(value);
    }
}

impl Context {
    /// Creates the initial parser state: empty output object, empty question
    /// list, no keyword seen yet and the global scope selected.
    fn new() -> Self {
        let empty_object = json::JsonValue::new_object;

        Self {
            // output containers
            data: empty_object(),
            questions: json::JsonValue::new_array(),
            // question being built and the template it starts from
            cur_question: empty_object(),
            cur_question_pre: empty_object(),
            have_new_question: false,
            // current field
            cur_keyword_type: None,
            cur_scope: DataScope::Global,
            cur_tag: String::new(),
            cur_content: Vec::new(),
            // previous field
            last_keyword_type: None,
            last_tag: String::new(),
        }
    }
}

impl FromStr for KeywordType {
    type Err = ();

    fn from_str(pattern: &str) -> Result<Self, Self::Err> {
        use KeywordType::*;
        Ok(match pattern {
            "Мета:" => Ignore,
            "Чемпионат:" | "Пакет:" => Global,
            "Тур:" => QuestionPre,
            "Вопрос " | "Вопрос:" => QuestionStart,
            "Ответ:" | "Зачет:" => QuestionContent,
            _ => CurrentScope,
            // "URL:" | "Ссылка:" | "Дата:" | "Обработан:" | "Автор:" | "Редактор:" | "Копирайт:" | "Инфо:" |
            // "Тема:" | "Вид:" | "Тип:" | "Источник:" | "Рейтинг:" | "Комментарий:" | "Комментарии:"
        })
    }
}

/// Parses one question-base text file into a JSON document.
///
/// The input is decoded as KOI8-R (undecodable bytes are ignored) and read
/// line by line; every line is trimmed and blank lines are skipped. A line
/// that starts with one of the known keywords and ends with `:` opens a new
/// field; any other line is appended to the content of the field opened last.
/// When a field is closed its lines are joined with `\n` and stored under the
/// keyword with spaces and colons removed.
///
/// The end of the input closes the last field in exactly the same way as a
/// keyword line does, so the final question is kept even when its last field
/// is empty and trailing content is routed according to its keyword type.
///
/// Returns the top-level object with an additional `"Вопросы"` array holding
/// every question that passed `PushIfValid::is_valid`. The current
/// implementation always returns `Ok`.
///
/// # Panics
///
/// Panics when a line yielded by the reader cannot be unwrapped.
/// NOTE(review): the item type of `TextReader::lines` is not visible here, so
/// the `unwrap` is kept instead of being turned into an `Err` — confirm.
fn parse_file(file: impl io::Read) -> Result<json::JsonValue, Box<dyn std::error::Error>> {
    let buf = io::BufReader::new(file);
    let reader = TextReader::new(buf, KOI8_R, DecoderTrap::Ignore);

    // Known keywords; a keyword line must start with one of them and end with ':'.
    let patterns = [
        "Чемпионат:",
        "Пакет:",
        "URL:",
        "Ссылка:",
        "Дата:",
        "Редактор:",
        "Обработан:",
        "Копирайт:",
        "Инфо:",
        "Тема:",
        "Вид:",
        "Тип:",
        "Тур:",
        "Мета:",
        "Вопрос ",
        "Вопрос:",
        "Ответ:",
        "Зачет:",
        "Источник:",
        "Рейтинг:",
        "Автор:",
        "Комментарий:",
        "Комментарии:",
    ];
    let mut context = Context::new();

    reader
        .lines()
        .map(|line| String::from(line.unwrap().trim()))
        .filter(|line| !line.is_empty()) // ignore empty lines
        .for_each(|line| {
            let keyword = patterns
                .iter()
                .find(|&&pattern| line.starts_with(pattern) && line.ends_with(':'));

            match keyword {
                Some(pattern) => {
                    // the keyword read so far becomes the "previous" one
                    let tag = pattern.replace(' ', "").replace(':', "");
                    context.last_keyword_type = context.cur_keyword_type;
                    context.last_tag = std::mem::replace(&mut context.cur_tag, tag);
                    context.cur_keyword_type = pattern.parse::<KeywordType>().ok();

                    // remember question id (the whole keyword line without colons)
                    if let Some(KeywordType::QuestionStart) = context.cur_keyword_type {
                        context.cur_question_pre["id"] = line.replace(':', "").as_str().into();
                    }

                    // a new keyword closes the previous field
                    store_pending_content(&mut context);
                }
                // not a keyword: accumulate content of the current field
                None => context.cur_content.push(line),
            }
        });

    // End of input closes the last field exactly like a keyword line would.
    context.last_keyword_type = context.cur_keyword_type;
    context.last_tag = std::mem::replace(&mut context.cur_tag, String::new());
    store_pending_content(&mut context);

    let Context {
        mut data,
        mut questions,
        cur_question,
        have_new_question,
        ..
    } = context;

    // the question being read when the input ended has not been pushed yet
    if have_new_question {
        questions.push_if_valid(cur_question);
    }

    data["Вопросы"] = questions;
    Ok(data)
}

/// Stores the content accumulated for the previous keyword (`last_tag`,
/// `last_keyword_type`) in the place that keyword type selects, then clears
/// the accumulator.
fn store_pending_content(ctx: &mut Context) {
    use KeywordType::*;

    let content = ctx.cur_content.join("\n");
    ctx.cur_content.clear();

    match ctx.last_keyword_type {
        Some(Global) => {
            ctx.cur_scope = DataScope::Global;
            ctx.data[&ctx.last_tag] = content.into();
        }
        Some(QuestionPre) => {
            ctx.cur_scope = DataScope::QuestionPre;
            ctx.cur_question_pre[&ctx.last_tag] = content.into();
        }
        Some(QuestionStart) => {
            ctx.cur_scope = DataScope::QuestionContent;
            // start the new question from the shared template ...
            let previous =
                std::mem::replace(&mut ctx.cur_question, ctx.cur_question_pre.clone());
            // ... and keep the one that was being read before it, if any
            if ctx.have_new_question {
                ctx.questions.push_if_valid(previous);
            }
            ctx.cur_question[&ctx.last_tag] = content.into();
            ctx.have_new_question = true;
        }
        Some(QuestionContent) => {
            ctx.cur_question[&ctx.last_tag] = content.into();
        }
        Some(CurrentScope) => {
            // the field belongs to whatever scope was selected last
            let scope_data = match ctx.cur_scope {
                DataScope::Global => &mut ctx.data,
                DataScope::QuestionPre => &mut ctx.cur_question_pre,
                DataScope::QuestionContent => &mut ctx.cur_question,
            };
            scope_data[&ctx.last_tag] = content.into();
        }
        // ignored keyword or nothing read yet: the content is dropped
        Some(Ignore) | None => {}
    }
}

/// Splits a vector into contiguous slices for parallel processing.
trait SplitTo<T> {
    /// Splits `self` into at most `num` contiguous, order-preserving slices
    /// whose lengths differ by at most one element.
    ///
    /// * No empty slice is produced for a non-empty vector, so fewer than
    ///   `num` slices are returned when there are fewer elements than `num`.
    /// * An empty vector yields a single empty slice.
    /// * `num == 0` is treated as `1` instead of dividing by zero.
    fn split_to(&self, num: usize) -> Vec<&[T]>;
}

impl<T> SplitTo<T> for Vec<T> {
    fn split_to(&self, num: usize) -> Vec<&[T]> {
        // never more parts than elements, but always at least one part
        let parts = num.min(self.len()).max(1);
        let base_len = self.len() / parts;
        // the first `extra` parts take one additional element each
        let extra = self.len() % parts;

        let mut result = Vec::with_capacity(parts);
        let mut start = 0;
        for i in 0..parts {
            let size = if i < extra { base_len + 1 } else { base_len };
            result.push(&self[start..start + size]);
            start += size;
        }
        result
    }
}

/// Converts a batch of archive entries to JSON files.
///
/// Opens its own handle on `BASE_FILENAME`, then for every name in `files`
/// parses the entry with `parse_file` and writes the pretty-printed result to
/// `OUTPUT_PATH/<name>.json`, creating missing parent directories first.
/// Does nothing for an empty batch.
///
/// # Panics
///
/// Panics, naming the offending file, when the archive or an entry cannot be
/// opened, an entry name is not valid UTF-8, parsing fails, or the output
/// cannot be written.
fn process_files(files: &&[PathBuf]) {
    if files.is_empty() {
        return;
    }

    // `display()` keeps the progress messages panic-free for any path
    let start_file = files[0].display();
    println!("-> start from \"{}\" ({} files)", start_file, files.len());

    // the archive reader is opened here rather than passed in by the caller
    let zip_file = fs::File::open(BASE_FILENAME)
        .unwrap_or_else(|err| panic!("cannot open \"{}\": {}", BASE_FILENAME, err));
    let mut archive = zip::ZipArchive::new(io::BufReader::new(zip_file))
        .unwrap_or_else(|err| panic!("cannot read \"{}\": {}", BASE_FILENAME, err));

    for name in files.iter() {
        let name_str = name
            .to_str()
            .unwrap_or_else(|| panic!("entry name {:?} is not valid UTF-8", name));

        // parse txt file
        let file = archive
            .by_name(name_str)
            .unwrap_or_else(|err| panic!("cannot open entry \"{}\": {}", name_str, err));
        let data = parse_file(file)
            .unwrap_or_else(|err| panic!("cannot parse \"{}\": {}", name_str, err));

        // make output filename
        let mut outfilename = PathBuf::from(OUTPUT_PATH);
        outfilename.push(name);
        outfilename.set_extension("json");

        // entries may live in sub-directories that do not exist yet
        if let Some(parent) = outfilename.parent() {
            fs::create_dir_all(parent)
                .unwrap_or_else(|err| panic!("cannot create {:?}: {}", parent, err));
        }

        // save json to file; buffered, because the JSON is written in small pieces
        let outfile = fs::File::create(&outfilename)
            .unwrap_or_else(|err| panic!("cannot create {:?}: {}", outfilename, err));
        let mut writer = io::BufWriter::new(outfile);
        data.write_pretty(&mut writer, 1)
            .and_then(|_| io::Write::flush(&mut writer))
            .unwrap_or_else(|err| panic!("cannot write {:?}: {}", outfilename, err));
    }

    println!("<- done {} files (from \"{}\")", files.len(), start_file);
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // open archive just to list files
    let zip_file = fs::File::open(BASE_FILENAME)?;
    let zip_reader = io::BufReader::new(zip_file);
    let mut archive = zip::ZipArchive::new(zip_reader)?;

    let source_files: Vec<PathBuf> = (0..archive.len())
        .map(|i| archive.by_index(i).unwrap().mangled_name())
        .filter(|name| {
            // skip files without "txt" extension
            match name.extension() {
                Some(ext) => match ext.to_str() {
                    Some(ext_str) => ext_str.eq_ignore_ascii_case("txt"),
                    _ => false, // extension is not valid unicode or not txt
                },
                _ => false, // no extension in filename
            }
        })
        .collect();
    drop(archive);

    // check output directory
    let out_dir: PathBuf = OUTPUT_PATH.into();
    if out_dir.is_file() {
        return Err("output directory is file!".into());
    } else if !out_dir.exists() {
        fs::create_dir_all(out_dir)?;
    };

    println!(
        "processing {} files with {} threads...",
        source_files.len(),
        rayon::current_num_threads()
    );

    // split vector and process its parts in parallel
    source_files
        .split_to(rayon::current_num_threads())
        .par_iter()
        .for_each(process_files);

    println!("done");
    Ok(())
}