chgk_txt2json/src/main.rs

371 lines
12 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

extern crate async_zip;
extern crate json;
extern crate tokio;
use async_zip::read::fs::ZipFileReader;
use async_zip::write::{EntryOptions, ZipFileWriter};
use async_zip::Compression;
use std::path::PathBuf;
use std::str::FromStr;
use tokio::fs;
use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader, BufWriter};
use tokio::sync::mpsc::{self, UnboundedReceiver, UnboundedSender};
/// Source archive containing the UTF-8 text question files.
const INPUT_FILENAME: &str = "baza_utf8.zip";
/// Destination archive for the converted JSON files.
const OUTPUT_FILENAME: &str = "json.zip";
/// Compression applied to every entry written into the output archive.
const OUTPUT_COMPRESSION: Compression = Compression::Zstd;
/// Classification of a metadata keyword found at the start of a line.
#[derive(Debug, Clone, Copy)]
enum KeywordType {
    Ignore,
    Global,
    QuestionPre,
    QuestionStart,
    QuestionContent,
    CurrentScope,
}

impl FromStr for KeywordType {
    type Err = ();

    /// Maps a keyword pattern to its type.
    ///
    /// Unrecognized keywords (e.g. "URL:", "Автор:", "Источник:",
    /// "Комментарий:") fall back to `CurrentScope`; the parse itself
    /// therefore never returns `Err`.
    fn from_str(pattern: &str) -> Result<Self, Self::Err> {
        let kind = match pattern {
            "Мета:" => KeywordType::Ignore,
            "Чемпионат:" | "Пакет:" => KeywordType::Global,
            "Тур:" => KeywordType::QuestionPre,
            "Вопрос " | "Вопрос:" => KeywordType::QuestionStart,
            "Ответ:" | "Зачет:" => KeywordType::QuestionContent,
            _ => KeywordType::CurrentScope,
        };
        Ok(kind)
    }
}
/// Which object a keyword's accumulated content is written into.
#[derive(Debug, Clone, Copy)]
enum DataScope {
    /// File-level metadata (the parser's `data` object).
    Global,
    /// Fields that seed every new question (`cur_question_pre`).
    QuestionPre,
    /// The question currently being collected (`cur_question`).
    QuestionContent,
}
/// Streaming parser state for one text questions file.
struct QuestionsParser {
    // global output value (file-level metadata; receives "Вопросы" on finish)
    data: json::JsonValue,
    // temp questions array, moved into `data` by `finish`
    questions: json::JsonValue,
    // type of the keyword currently being collected
    cur_keyword_type: Option<KeywordType>,
    // temp question value being filled in
    cur_question: json::JsonValue,
    // temp value for pre-question fields; cloned as the template for each new question
    cur_question_pre: json::JsonValue,
    // scope that plain (CurrentScope) keywords write their content into
    cur_scope: DataScope,
    // current json key (keyword with trailing ':' stripped)
    cur_tag: String,
    // current json value: content lines accumulated since the last keyword
    cur_content: Vec<String>,
    // true while a question is being collected and not yet pushed
    have_new_question: bool,
    // prev. keyword type (its content is applied when a new keyword is read)
    last_keyword_type: Option<KeywordType>,
    // prev. json key (used to store accumulated content when a new keyword is read)
    last_tag: String,
}
/// Text questions parser.
///
/// Lines are classified as either a keyword line (starts with a known
/// pattern and ends with ':') or content. Content accumulates until the
/// next keyword arrives, at which point it is stored under the previous
/// keyword's tag in the appropriate scope.
impl QuestionsParser {
    /// Keyword prefixes recognized at the start of a line. A line matching
    /// one of these (and ending with ':') starts a new tag instead of
    /// being treated as content.
    const PATTERNS: &'static [&'static str] = &[
        "Чемпионат:",
        "Пакет:",
        "URL:",
        "Ссылка:",
        "Дата:",
        "Редактор:",
        "Обработан:",
        "Копирайт:",
        "Инфо:",
        "Тема:",
        "Вид:",
        "Тип:",
        "Тур:",
        "Мета:",
        "Вопрос ",
        "Вопрос:",
        "Ответ:",
        "Зачет:",
        "Источник:",
        "Рейтинг:",
        "Автор:",
        "Комментарий:",
        "Комментарии:",
    ];

    /// Create a new, empty parser.
    pub fn new() -> QuestionsParser {
        QuestionsParser {
            data: json::JsonValue::new_object(),
            questions: json::JsonValue::new_array(),
            cur_keyword_type: None,
            cur_question: json::JsonValue::new_object(),
            cur_question_pre: json::JsonValue::new_object(),
            cur_tag: String::new(),
            cur_content: Vec::<String>::new(),
            cur_scope: DataScope::Global,
            have_new_question: false,
            last_keyword_type: None,
            last_tag: String::new(),
        }
    }

    /// Join accumulated content lines into a single value.
    fn get_current_content(&self) -> String {
        self.cur_content.join("\n")
    }

    /// Clear accumulated content.
    fn clear_current_content(&mut self) {
        self.cur_content.clear()
    }

    /// Add a new line to the accumulated content.
    fn append_to_current_content(&mut self, line: String) {
        self.cur_content.push(line);
    }

    /// Check the current question has the required fields.
    fn is_current_question_valid(&self) -> bool {
        self.cur_question.has_key("Вопрос") && self.cur_question.has_key("Ответ")
    }

    /// Push the current question (if valid) onto the parsed array and reset
    /// it to the pre-question template so the next question inherits the
    /// tour-level fields.
    fn add_cur_question(&mut self) {
        if self.is_current_question_valid() {
            let current = std::mem::replace(&mut self.cur_question, self.cur_question_pre.clone());
            // `questions` is always an array, so push cannot fail
            self.questions.push(current).unwrap()
        }
    }

    /// Store the accumulated content under the last tag in the given scope.
    fn apply_content_to(&mut self, scope: DataScope) {
        let content = self.get_current_content();
        // match value to store data
        let scope_data = match scope {
            DataScope::Global => &mut self.data,
            DataScope::QuestionPre => &mut self.cur_question_pre,
            DataScope::QuestionContent => &mut self.cur_question,
        };
        scope_data[&self.last_tag] = content.into();
        self.clear_current_content();
    }

    /// Store the accumulated content under the last tag in the current scope.
    fn apply_content_to_cur_scope(&mut self) {
        self.apply_content_to(self.cur_scope);
    }

    /// Set the current scope.
    fn set_scope(&mut self, scope: DataScope) {
        self.cur_scope = scope;
    }

    /// Set the current scope and apply accumulated content to it.
    fn set_scope_and_apply(&mut self, scope: DataScope) {
        self.set_scope(scope);
        self.apply_content_to_cur_scope();
    }

    /// Push the previous question (if any) and start collecting a new one.
    fn start_new_question(&mut self) {
        // store prev question before reading new
        if self.have_new_question {
            self.add_cur_question();
        }
        self.have_new_question = true;
    }

    /// Dispatch the accumulated content according to the previous keyword's
    /// type and scope.
    fn apply_content_for_last_keyword(&mut self) {
        // apply accumulated content when new keyword found
        match self.last_keyword_type {
            Some(KeywordType::Global) => {
                self.set_scope_and_apply(DataScope::Global);
            }
            Some(KeywordType::QuestionPre) => {
                self.set_scope_and_apply(DataScope::QuestionPre);
            }
            Some(KeywordType::QuestionStart) => {
                self.start_new_question();
                self.set_scope_and_apply(DataScope::QuestionContent);
            }
            Some(KeywordType::QuestionContent) => {
                self.apply_content_to(DataScope::QuestionContent);
            }
            Some(KeywordType::CurrentScope) => {
                self.apply_content_to_cur_scope();
            }
            // None (preamble before the first keyword) or Ignore: discard the
            // content, clearing it explicitly so stray lines do not leak into
            // the next keyword's content (previously they were kept and
            // prepended to whatever keyword came next).
            _ => self.clear_current_content(),
        };
    }

    /// Remember the current keyword/tag as "last" and install the new one.
    fn set_new_keyword(&mut self, keyword: &str) {
        // `KeywordType::from_str` never fails (unknown => CurrentScope)
        self.last_keyword_type =
            std::mem::replace(&mut self.cur_keyword_type, Some(keyword.parse().unwrap()));
        self.last_tag = std::mem::replace(
            &mut self.cur_tag,
            keyword.trim_end().trim_end_matches(':').to_string(),
        );
    }

    /// Handle a line that matched a keyword pattern.
    fn on_keyword_match(&mut self, line: &str, keyword: &str) {
        self.set_new_keyword(keyword);
        // remember question id: the full "Вопрос N" with trailing ':' removed
        if let Some(KeywordType::QuestionStart) = self.cur_keyword_type {
            self.cur_question_pre["id"] = line.trim_end().trim_end_matches(':').into();
        };
        self.apply_content_for_last_keyword();
    }

    /// Parse the next line: either start a new keyword or accumulate content.
    pub fn parse_line(&mut self, line: &str) {
        match QuestionsParser::PATTERNS
            .iter() // find keyword
            .find(|&&pattern| line.starts_with(pattern) && line.ends_with(':'))
        {
            Some(pattern) => {
                self.on_keyword_match(line, pattern);
            }
            None => {
                self.append_to_current_content(line.to_string());
            }
        }
    }

    /// Finish parsing: flush the trailing keyword's content, push the last
    /// question, and move the collected questions into the output object.
    pub fn finish(&mut self) {
        // Content gathered under the file's final keyword is never applied by
        // `parse_line` (application happens only when the *next* keyword
        // arrives), so promote the current keyword to "last" and flush it
        // through the regular dispatch path, honoring its scope. Previously
        // the trailing content was written into `cur_question` unconditionally
        // (corrupting or losing Global/QuestionPre content) and only when a
        // question was in progress.
        if !self.cur_content.is_empty() {
            self.last_keyword_type = self.cur_keyword_type.take();
            self.last_tag = std::mem::take(&mut self.cur_tag);
            self.apply_content_for_last_keyword();
        }
        // Push the pending question even when the file ends directly after a
        // keyword line; previously a complete question was dropped when no
        // trailing content remained.
        if self.have_new_question {
            self.add_cur_question();
            self.have_new_question = false;
        }
        self.data["Вопросы"] = std::mem::replace(&mut self.questions, json::JsonValue::new_array());
    }

    /// Consume the parser and return the parsed data.
    pub fn get_parsed(self) -> json::JsonValue {
        self.data
    }
}
/// One output file: its entry name and full text contents.
#[derive(Debug)]
struct FileText {
    // output file name (taken from the source zip entry)
    name: String,
    // full file contents to write
    text: String,
}
/// Messages streamed from the zip reader task to the converter task.
#[derive(Debug)]
enum TextReaderMessage {
    /// One line of text from the current file.
    NextLine(String),
    /// The current file is finished; the payload is its entry name.
    EndOfFile(String),
}
/// Read every ".txt" entry from the input zip archive and stream its lines
/// to the converter, followed by an end-of-file marker per entry.
async fn zip_text_reader(tx: UnboundedSender<TextReaderMessage>) {
    let archive = ZipFileReader::new(INPUT_FILENAME).await.expect("open zip");
    // collect (index, name) pairs of regular entries with a ".txt" suffix
    let txt_entries: Vec<(usize, String)> = archive
        .entries()
        .iter()
        .enumerate()
        .filter(|(_, entry)| !entry.dir() && entry.name().ends_with(".txt"))
        .map(|(index, entry)| (index, entry.name().to_string()))
        .collect();
    for (index, name) in txt_entries {
        let entry_reader = archive.entry_reader(index).await.expect("read entry");
        let mut lines = BufReader::new(entry_reader).lines();
        while let Some(line) = lines.next_line().await.expect("next line") {
            tx.send(TextReaderMessage::NextLine(line))
                .expect("send line");
        }
        tx.send(TextReaderMessage::EndOfFile(name))
            .expect("send end");
    }
    println!("read done ✅");
}
/// Convert streamed text lines into pretty-printed JSON documents,
/// emitting one `FileText` per source file.
async fn questions_converter(
    mut rx: UnboundedReceiver<TextReaderMessage>,
    tx: UnboundedSender<FileText>,
) {
    let mut parser = QuestionsParser::new();
    while let Some(msg) = rx.recv().await {
        match msg {
            TextReaderMessage::NextLine(raw) => {
                // skip blank lines; feed everything else to the parser
                let trimmed = raw.trim();
                if !trimmed.is_empty() {
                    parser.parse_line(trimmed);
                }
            }
            TextReaderMessage::EndOfFile(name) => {
                // one parser per file: swap in a fresh one, finish the old
                let mut done = std::mem::replace(&mut parser, QuestionsParser::new());
                done.finish();
                let text = done.get_parsed().pretty(2);
                tx.send(FileText { name, text }).expect("send json");
            }
        }
    }
    println!("convert done ✅");
}
/// Receive converted documents and store each one as a ".json" entry in the
/// output zip archive.
async fn zip_json_writer(mut rx: UnboundedReceiver<FileText>) {
    let file = fs::File::create(OUTPUT_FILENAME)
        .await
        .expect("create file");
    let mut buf = BufWriter::with_capacity(100 * 1024 * 1024, file);
    let mut writer = ZipFileWriter::new(&mut buf);
    while let Some(FileText { name, text }) = rx.recv().await {
        // keep the source entry's base name, swap the extension for ".json"
        let mut out_path = PathBuf::from(name);
        out_path.set_extension("json");
        let entry_name = out_path.to_str().unwrap().to_string();
        // write new zip entry with the configured compression
        writer
            .write_entry_whole(EntryOptions::new(entry_name, OUTPUT_COMPRESSION), text.as_bytes())
            .await
            .expect("write entry");
    }
    writer.close().await.expect("close writer");
    buf.flush().await.expect("flush buffer");
    println!("write done ✅");
}
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // refuse to clobber a directory sitting at the output path
    if let Ok(meta) = fs::metadata(OUTPUT_FILENAME).await {
        if meta.is_dir() {
            return Err("output file is a directory!".into());
        }
    }
    // reader -> converter -> writer pipeline over unbounded channels
    let (reader_tx, reader_rx) = mpsc::unbounded_channel::<TextReaderMessage>();
    let (json_tx, json_rx) = mpsc::unbounded_channel::<FileText>();
    tokio::try_join!(
        tokio::spawn(zip_text_reader(reader_tx)),
        tokio::spawn(questions_converter(reader_rx, json_tx)),
        tokio::spawn(zip_json_writer(json_rx))
    )?;
    println!("all done ✅");
    Ok(())
}