//! chgk_txt2json — reads KOI8-R encoded question-base text files from
//! `baza.zip` and converts each one to a pretty-printed JSON document
//! under `./json/`, processing chunks of files in parallel via rayon.
extern crate encoding;
extern crate json;
extern crate rayon;
extern crate textstream;
extern crate zip;

use encoding::all::KOI8_R;
use encoding::DecoderTrap;
use rayon::prelude::*;
use std::fs;
use std::io;
use std::path::PathBuf;
use textstream::TextReader;

const BASE_FILENAME: &str = "./baza.zip";
2019-07-26 12:24:25 +03:00
/// How a recognized keyword line affects the parser state.
#[derive(Debug, Clone, Copy)]
enum KeywordType {
    // keyword is recognized but its content is discarded ("Мета:")
    Ignore,
    // content belongs to the whole file ("Чемпионат:", "Пакет:")
    Global,
    // content belongs to all upcoming questions ("Тур:")
    QuestionPre,
    // starts a new question ("Вопрос N:", "Вопрос:")
    QuestionStart,
    // content belongs to the current question ("Ответ:", "Зачет:")
    QuestionContent,
    // any other keyword: stored in whatever scope is currently active
    CurrentScope,
}
/// Which JSON object currently receives `CurrentScope` fields.
#[derive(Debug, Clone, Copy)]
enum DataScope {
    // the top-level output object
    Global,
    // the "pre" object whose fields are copied into each new question
    QuestionPre,
    // the question currently being read
    QuestionContent,
}
fn keyword_type(pattern: &str) -> KeywordType {
2019-07-26 12:25:45 +03:00
use KeywordType::*;
2019-07-26 12:24:25 +03:00
match pattern {
2019-07-26 12:25:45 +03:00
"Мета:" => Ignore,
"Чемпионат:" | "Пакет:" => Global,
"Тур:" => QuestionPre,
"Вопрос " => QuestionStart,
"Вопрос:" => QuestionStart,
"Ответ:" | "Зачет:" => QuestionContent,
_ => CurrentScope,
// "URL:" | "Ссылка:" | "Дата:" | "Обработан:" | "Автор:" | "Редактор:" | "Копирайт:" | "Инфо:" |
// "Тема:" | "Вид:" | "Тип:" | "Источник:" | "Рейтинг:" | "Комментарий:" | "Комментарии:"
2019-07-26 12:24:25 +03:00
}
}
/// Mutable parser state threaded through `parse_file`.
struct Context {
    // global output value
    data: json::JsonValue,
    // temp questions array
    questions: json::JsonValue,
    // type of the keyword currently being read
    cur_keyword_type: Option<KeywordType>,
    // temp question value
    cur_question: json::JsonValue,
    // temp value for pre'question fields
    cur_question_pre: json::JsonValue,
    // scope for data fields
    cur_scope: DataScope,
    // current json key
    cur_tag: String,
    // current json value (content lines accumulated so far)
    cur_content: Vec<String>,
    // need to push temp question value if true
    have_new_question: bool,
    // prev. keyword type
    last_keyword_type: Option<KeywordType>,
    // prev. json key (used to store accumulated content when a new keyword is read)
    last_tag: String,
}
2019-07-26 12:25:45 +03:00
fn parse_file(file: impl io::Read) -> Result<json::JsonValue, Box<std::error::Error>> {
let buf = io::BufReader::new(file);
2019-07-26 12:24:25 +03:00
let reader = TextReader::new(buf, KOI8_R, DecoderTrap::Ignore);
let patterns = vec![
"Чемпионат:",
2019-07-26 12:25:45 +03:00
"Пакет:",
2019-07-26 12:24:25 +03:00
"URL:",
2019-07-26 12:25:45 +03:00
"Ссылка:",
2019-07-26 12:24:25 +03:00
"Дата:",
"Редактор:",
2019-07-26 12:25:45 +03:00
"Обработан:",
"Копирайт:",
"Инфо:",
"Тема:",
2019-07-26 12:24:25 +03:00
"Вид:",
2019-07-26 12:25:45 +03:00
"Тип:",
2019-07-26 12:24:25 +03:00
"Тур:",
2019-07-26 12:25:45 +03:00
"Мета:",
2019-07-26 12:24:25 +03:00
"Вопрос ",
2019-07-26 12:25:45 +03:00
"Вопрос:",
2019-07-26 12:24:25 +03:00
"Ответ:",
"Зачет:",
"Источник:",
2019-07-26 12:25:45 +03:00
"Рейтинг:",
2019-07-26 12:24:25 +03:00
"Автор:",
"Комментарий:",
2019-07-26 12:25:45 +03:00
"Комментарии:",
2019-07-26 12:24:25 +03:00
];
// init context
let mut context = Context {
data: json::JsonValue::new_object(),
questions: json::JsonValue::new_array(),
cur_keyword_type: None,
cur_question: json::JsonValue::new_object(),
cur_question_pre: json::JsonValue::new_object(),
cur_tag: String::new(),
cur_content: Vec::<String>::new(),
2019-07-26 12:25:45 +03:00
cur_scope: DataScope::Global,
2019-07-26 12:24:25 +03:00
have_new_question: false,
last_keyword_type: None,
last_tag: String::new(),
};
let mut ctx = &mut context;
for line in reader.lines() {
// ignore empty lines
let line_str = String::from(line.unwrap().trim());
let line_s = &line_str;
2019-07-26 12:49:25 +03:00
if line_s.is_empty() {
2019-07-26 12:24:25 +03:00
continue;
}
// find keywords
match patterns
.iter()
2019-07-26 12:49:25 +03:00
.find(|&&pattern| line_s.starts_with(pattern) && line_s.ends_with(':'))
2019-07-26 12:24:25 +03:00
{
Some(pattern) => {
2019-07-26 12:25:45 +03:00
use KeywordType::*;
2019-07-26 12:24:25 +03:00
ctx.last_keyword_type = ctx.cur_keyword_type;
ctx.last_tag = ctx.cur_tag.clone();
ctx.cur_keyword_type = Some(keyword_type(&pattern));
2019-07-26 12:49:25 +03:00
ctx.cur_tag = pattern.replace(' ', "").replace(':', "");
2019-07-26 12:24:25 +03:00
// remember question id
2019-07-26 12:49:25 +03:00
if let Some(QuestionStart) = ctx.cur_keyword_type {
ctx.cur_question_pre["id"] = line_s.replace(':', "").as_str().into();
2019-07-26 12:24:25 +03:00
};
// apply accumulated content when new keyword found
match ctx.last_keyword_type {
2019-07-26 12:25:45 +03:00
Some(Global) => {
ctx.cur_scope = DataScope::Global;
2019-07-26 12:24:25 +03:00
ctx.data[&ctx.last_tag] = ctx.cur_content.join("\n").into()
}
2019-07-26 12:25:45 +03:00
Some(QuestionPre) => {
ctx.cur_scope = DataScope::QuestionPre;
2019-07-26 12:24:25 +03:00
ctx.cur_question_pre[&ctx.last_tag] = ctx.cur_content.join("\n").into();
}
2019-07-26 12:25:45 +03:00
Some(QuestionStart) => {
ctx.cur_scope = DataScope::QuestionContent;
2019-07-26 12:24:25 +03:00
// store prev question before reading new
if ctx.have_new_question {
ctx.questions.push(ctx.cur_question.clone()).unwrap();
}
// prepare for read new question data with cur_question_pre values
ctx.cur_question = ctx.cur_question_pre.clone();
2019-07-26 12:25:45 +03:00
// ctx.cur_question_pre = json::JsonValue::new_object(); // uncomment => forget pre at new question
2019-07-26 12:24:25 +03:00
ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
ctx.have_new_question = true;
}
2019-07-26 12:25:45 +03:00
Some(QuestionContent) => {
ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
}
Some(CurrentScope) => {
// match value to store data
(match ctx.cur_scope {
DataScope::Global => &mut ctx.data,
DataScope::QuestionPre => &mut ctx.cur_question_pre,
DataScope::QuestionContent => &mut ctx.cur_question,
})[&ctx.last_tag] = ctx.cur_content.join("\n").into();
}
_ => (), //None or Ignore
2019-07-26 12:24:25 +03:00
};
// clear content
ctx.cur_content.clear();
}
None => {
// accumulate content if line is not a keyword
ctx.cur_content.push(String::from(line_s));
}
}
}
// finish reading last question
if ctx.have_new_question && !ctx.cur_content.is_empty() {
ctx.cur_question[&ctx.cur_tag] = ctx.cur_content.join("\n").as_str().into();
ctx.questions.push(ctx.cur_question.clone()).unwrap();
ctx.have_new_question = false;
}
ctx.data["Вопросы"] = ctx.questions.clone();
Ok(ctx.data.clone())
}
2019-07-26 23:19:13 +03:00
/// Splits a slice into `num` contiguous chunks of (almost) equal size.
///
/// The first `num - 1` chunks hold `len / num` elements each; the last
/// chunk additionally absorbs the remainder. If the slice has fewer
/// elements than `num` — or `num` is zero — a single chunk covering the
/// whole slice is returned (the original panicked on `num == 0`).
fn split_vec<T>(src: &[T], num: usize) -> Vec<&[T]> {
    // guard against division by zero below
    if num == 0 {
        return vec![src];
    }
    let part_len = src.len() / num;
    // degenerate case: fewer elements than requested parts
    if part_len == 0 {
        return vec![src];
    }
    let rest = src.len() % num;
    let mut result = Vec::with_capacity(num);
    for i in 0..num {
        let start = part_len * i;
        // last chunk absorbs the remainder
        let size = if i == num - 1 { part_len + rest } else { part_len };
        result.push(&src[start..start + size]);
    }
    result
}
fn process_files(files: Vec<PathBuf>) {
let zip_file = fs::File::open(BASE_FILENAME).unwrap();
2019-07-26 12:25:45 +03:00
let zip_reader = io::BufReader::new(zip_file);
2019-07-26 23:04:41 +03:00
let mut archive = zip::ZipArchive::new(zip_reader).unwrap();
2019-07-26 12:24:25 +03:00
2019-07-26 23:04:41 +03:00
files.iter().for_each(|name| {
let name_str = name.to_str().unwrap();
println!("{:}", name_str);
2019-07-26 12:24:25 +03:00
2019-07-26 23:04:41 +03:00
// parse txt file
let file = archive.by_name(name_str).unwrap();
let data = parse_file(file).unwrap();
// make output filename
2019-07-26 12:24:25 +03:00
let mut outfilename = PathBuf::from("./json");
outfilename.push(name);
outfilename.set_extension("json");
2019-07-26 23:04:41 +03:00
// save json to file
let mut outfile = fs::File::create(outfilename).unwrap();
data.write_pretty(&mut outfile, 1).unwrap();
});
}
/// Lists the "*.txt" entries of the base archive and converts them to
/// JSON in parallel, one chunk of files per rayon worker thread.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // open archive just to list files
    let zip_file = fs::File::open(BASE_FILENAME)?;
    let zip_reader = io::BufReader::new(zip_file);
    let mut archive = zip::ZipArchive::new(zip_reader)?;
    let source_files: Vec<PathBuf> = (0..archive.len())
        .map(|i| archive.by_index(i).unwrap().sanitized_name())
        // keep only entries with a "txt" extension (case-insensitive);
        // names without an extension or with non-unicode extensions are skipped
        .filter(|name| {
            name.extension()
                .and_then(|ext| ext.to_str())
                .map_or(false, |ext| ext.eq_ignore_ascii_case("txt"))
        })
        .collect();
    // release the archive handle; workers reopen it themselves
    drop(archive);
    // split the file list and process the parts in parallel
    split_vec(&source_files, rayon::current_num_threads())
        .par_iter()
        .for_each(|slice| {
            process_files(slice.to_vec());
        });
    Ok(())
}