2022-09-12 12:59:35 +00:00
|
|
|
|
extern crate async_zip;
|
2019-07-26 09:24:25 +00:00
|
|
|
|
extern crate json;
|
2022-09-12 12:59:35 +00:00
|
|
|
|
extern crate tokio;
|
2019-07-26 09:24:25 +00:00
|
|
|
|
|
2022-09-12 12:59:35 +00:00
|
|
|
|
use async_zip::read::fs::ZipFileReader;
|
2022-09-12 19:34:17 +00:00
|
|
|
|
use async_zip::write::{EntryOptions, ZipFileWriter};
|
|
|
|
|
use async_zip::Compression;
|
2019-07-26 09:24:25 +00:00
|
|
|
|
use std::path::PathBuf;
|
2022-08-25 13:12:47 +00:00
|
|
|
|
use std::str::FromStr;
|
2022-09-19 21:16:04 +00:00
|
|
|
|
use tokio::fs;
|
2022-10-01 09:42:32 +00:00
|
|
|
|
use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader, BufWriter};
|
2022-09-20 14:17:59 +00:00
|
|
|
|
use tokio::sync::mpsc::{self, UnboundedReceiver, UnboundedSender};
|
2019-07-26 09:24:25 +00:00
|
|
|
|
|
2022-09-12 19:34:17 +00:00
|
|
|
|
/// Path of the zip archive that `zip_text_reader` opens as its input.
const INPUT_FILENAME: &str = "baza_utf8.zip";
|
|
|
|
|
/// Path of the zip archive that `zip_json_writer` creates; `main` refuses to run if it is a directory.
const OUTPUT_FILENAME: &str = "json.zip";
|
2022-09-20 12:34:39 +00:00
|
|
|
|
/// Compression passed to `EntryOptions::new` for every entry written to the output archive.
const OUTPUT_COMPRESSION: Compression = Compression::Zstd;
|
2019-07-26 20:04:41 +00:00
|
|
|
|
|
2019-07-26 09:24:25 +00:00
|
|
|
|
/// Classification of a recognised keyword line.
///
/// The kind of a keyword decides into which JSON object the content that
/// follows it is stored once the next keyword is encountered.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum KeywordType {
    /// Keyword whose content is not written to any JSON scope (`"Мета:"`).
    Ignore,
    /// Keyword stored in the file-level object (`"Чемпионат:"`, `"Пакет:"`).
    Global,
    /// Keyword stored in the fields shared by the questions that follow (`"Тур:"`).
    QuestionPre,
    /// Keyword that opens a new question (`"Вопрос "`, `"Вопрос:"`).
    QuestionStart,
    /// Keyword that always belongs to the current question (`"Ответ:"`, `"Зачет:"`).
    QuestionContent,
    /// Any other keyword; stored in whichever scope is currently active.
    CurrentScope,
}
|
|
|
|
|
|
2022-09-20 11:31:37 +00:00
|
|
|
|
impl FromStr for KeywordType {
|
|
|
|
|
type Err = ();
|
|
|
|
|
|
|
|
|
|
fn from_str(pattern: &str) -> Result<Self, Self::Err> {
|
|
|
|
|
use KeywordType::*;
|
|
|
|
|
Ok(match pattern {
|
|
|
|
|
"Мета:" => Ignore,
|
|
|
|
|
"Чемпионат:" | "Пакет:" => Global,
|
|
|
|
|
"Тур:" => QuestionPre,
|
|
|
|
|
"Вопрос " | "Вопрос:" => QuestionStart,
|
|
|
|
|
"Ответ:" | "Зачет:" => QuestionContent,
|
|
|
|
|
_ => CurrentScope,
|
|
|
|
|
// "URL:" | "Ссылка:" | "Дата:" | "Обработан:" | "Автор:" | "Редактор:" | "Копирайт:" | "Инфо:" |
|
|
|
|
|
// "Тема:" | "Вид:" | "Тип:" | "Источник:" | "Рейтинг:" | "Комментарий:" | "Комментарии:"
|
|
|
|
|
})
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2019-07-26 09:25:45 +00:00
|
|
|
|
/// JSON object that accumulated content is written into.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum DataScope {
    /// The file-level output object.
    Global,
    /// Fields collected before a question and copied into following questions.
    QuestionPre,
    /// The question that is currently being collected.
    QuestionContent,
}
|
|
|
|
|
|
2022-09-20 11:31:37 +00:00
|
|
|
|
struct QuestionsParser {
|
2019-07-27 17:24:49 +00:00
|
|
|
|
// global output value
|
|
|
|
|
data: json::JsonValue,
|
|
|
|
|
// temp questions array
|
|
|
|
|
questions: json::JsonValue,
|
|
|
|
|
cur_keyword_type: Option<KeywordType>,
|
|
|
|
|
// temp question value
|
|
|
|
|
cur_question: json::JsonValue,
|
|
|
|
|
// temp value for pre'question fields
|
|
|
|
|
cur_question_pre: json::JsonValue,
|
|
|
|
|
// scope for data fields
|
|
|
|
|
cur_scope: DataScope,
|
|
|
|
|
// curent json key
|
|
|
|
|
cur_tag: String,
|
|
|
|
|
// current json value
|
|
|
|
|
cur_content: Vec<String>,
|
|
|
|
|
// need to push temp question value if true
|
|
|
|
|
have_new_question: bool,
|
|
|
|
|
// prev. keyword type
|
|
|
|
|
last_keyword_type: Option<KeywordType>,
|
|
|
|
|
// prev. json key (used for store acummulated content when new keyword readed)
|
|
|
|
|
last_tag: String,
|
2019-07-26 09:24:25 +00:00
|
|
|
|
}
|
|
|
|
|
|
2022-09-20 11:31:37 +00:00
|
|
|
|
/// Text questions parser
|
|
|
|
|
impl QuestionsParser {
|
|
|
|
|
const PATTERNS: &'static [&'static str] = &[
|
2019-07-27 17:24:49 +00:00
|
|
|
|
"Чемпионат:",
|
|
|
|
|
"Пакет:",
|
|
|
|
|
"URL:",
|
|
|
|
|
"Ссылка:",
|
|
|
|
|
"Дата:",
|
|
|
|
|
"Редактор:",
|
|
|
|
|
"Обработан:",
|
|
|
|
|
"Копирайт:",
|
|
|
|
|
"Инфо:",
|
|
|
|
|
"Тема:",
|
|
|
|
|
"Вид:",
|
|
|
|
|
"Тип:",
|
|
|
|
|
"Тур:",
|
|
|
|
|
"Мета:",
|
|
|
|
|
"Вопрос ",
|
|
|
|
|
"Вопрос:",
|
|
|
|
|
"Ответ:",
|
|
|
|
|
"Зачет:",
|
|
|
|
|
"Источник:",
|
|
|
|
|
"Рейтинг:",
|
|
|
|
|
"Автор:",
|
|
|
|
|
"Комментарий:",
|
|
|
|
|
"Комментарии:",
|
|
|
|
|
];
|
2019-07-26 09:24:25 +00:00
|
|
|
|
|
2022-09-20 11:31:37 +00:00
|
|
|
|
/// create new parser
|
|
|
|
|
pub fn new() -> QuestionsParser {
|
|
|
|
|
QuestionsParser {
|
|
|
|
|
data: json::JsonValue::new_object(),
|
|
|
|
|
questions: json::JsonValue::new_array(),
|
|
|
|
|
cur_keyword_type: None,
|
|
|
|
|
cur_question: json::JsonValue::new_object(),
|
|
|
|
|
cur_question_pre: json::JsonValue::new_object(),
|
|
|
|
|
cur_tag: String::new(),
|
|
|
|
|
cur_content: Vec::<String>::new(),
|
|
|
|
|
cur_scope: DataScope::Global,
|
|
|
|
|
have_new_question: false,
|
|
|
|
|
last_keyword_type: None,
|
|
|
|
|
last_tag: String::new(),
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
/// join current content lines
|
|
|
|
|
fn get_current_content(&self) -> String {
|
|
|
|
|
self.cur_content.join("\n")
|
|
|
|
|
}
|
|
|
|
|
/// clear current content
|
|
|
|
|
fn clear_current_content(&mut self) {
|
|
|
|
|
self.cur_content.clear()
|
|
|
|
|
}
|
|
|
|
|
/// add new line to current content
|
|
|
|
|
fn append_to_current_content(&mut self, line: String) {
|
|
|
|
|
self.cur_content.push(line);
|
|
|
|
|
}
|
|
|
|
|
/// check current question have required fields
|
|
|
|
|
fn is_current_question_valid(&self) -> bool {
|
|
|
|
|
self.cur_question.has_key("Вопрос") && self.cur_question.has_key("Ответ")
|
|
|
|
|
}
|
|
|
|
|
/// add current question to parsed array
|
|
|
|
|
fn add_cur_question(&mut self) {
|
|
|
|
|
if self.is_current_question_valid() {
|
2022-09-23 20:20:17 +00:00
|
|
|
|
let current = std::mem::replace(&mut self.cur_question, self.cur_question_pre.clone());
|
|
|
|
|
self.questions.push(current).unwrap()
|
2022-09-12 12:59:35 +00:00
|
|
|
|
}
|
2022-09-20 11:31:37 +00:00
|
|
|
|
}
|
|
|
|
|
/// set current content to last tag(keyword) to data scope
|
|
|
|
|
fn apply_content_to(&mut self, scope: DataScope) {
|
|
|
|
|
let content = self.get_current_content();
|
|
|
|
|
// match value to store data
|
|
|
|
|
let scope_data = match scope {
|
|
|
|
|
DataScope::Global => &mut self.data,
|
|
|
|
|
DataScope::QuestionPre => &mut self.cur_question_pre,
|
|
|
|
|
DataScope::QuestionContent => &mut self.cur_question,
|
|
|
|
|
};
|
|
|
|
|
scope_data[&self.last_tag] = content.into();
|
|
|
|
|
self.clear_current_content();
|
|
|
|
|
}
|
|
|
|
|
/// set current content to last tag(keyword) to current scope
|
|
|
|
|
fn apply_content_to_cur_scope(&mut self) {
|
|
|
|
|
self.apply_content_to(self.cur_scope);
|
|
|
|
|
}
|
|
|
|
|
/// set current scope
|
|
|
|
|
fn set_scope(&mut self, scope: DataScope) {
|
|
|
|
|
self.cur_scope = scope;
|
|
|
|
|
}
|
|
|
|
|
/// set current scope and set current content to last tag(keyword) to data scope
|
|
|
|
|
fn set_scope_and_apply(&mut self, scope: DataScope) {
|
|
|
|
|
self.set_scope(scope);
|
|
|
|
|
self.apply_content_to_cur_scope();
|
|
|
|
|
}
|
|
|
|
|
/// add last question (if have) and start collecting new one
|
|
|
|
|
fn start_new_question(&mut self) {
|
|
|
|
|
// store prev question before reading new
|
|
|
|
|
if self.have_new_question {
|
|
|
|
|
self.add_cur_question();
|
|
|
|
|
}
|
|
|
|
|
self.have_new_question = true;
|
|
|
|
|
}
|
2022-09-12 12:59:35 +00:00
|
|
|
|
|
2022-09-20 11:31:37 +00:00
|
|
|
|
/// check last tag(keyword) and set current content to corresponding data scope
|
|
|
|
|
fn apply_content_for_last_keyword(&mut self) {
|
|
|
|
|
// apply accumulated content when new keyword found
|
|
|
|
|
match self.last_keyword_type {
|
|
|
|
|
Some(KeywordType::Global) => {
|
|
|
|
|
self.set_scope_and_apply(DataScope::Global);
|
|
|
|
|
}
|
|
|
|
|
Some(KeywordType::QuestionPre) => {
|
|
|
|
|
self.set_scope_and_apply(DataScope::QuestionPre);
|
|
|
|
|
}
|
|
|
|
|
Some(KeywordType::QuestionStart) => {
|
|
|
|
|
self.start_new_question();
|
|
|
|
|
self.set_scope_and_apply(DataScope::QuestionContent);
|
|
|
|
|
}
|
|
|
|
|
Some(KeywordType::QuestionContent) => {
|
|
|
|
|
self.apply_content_to(DataScope::QuestionContent);
|
|
|
|
|
}
|
|
|
|
|
Some(KeywordType::CurrentScope) => {
|
|
|
|
|
self.apply_content_to_cur_scope();
|
|
|
|
|
}
|
|
|
|
|
_ => (), //None or Ignore
|
|
|
|
|
};
|
|
|
|
|
}
|
|
|
|
|
/// set current keyword(tag) and type as last, and set new as current
|
|
|
|
|
fn set_new_keyword(&mut self, keyword: &str) {
|
2022-09-23 19:50:42 +00:00
|
|
|
|
self.last_keyword_type =
|
|
|
|
|
std::mem::replace(&mut self.cur_keyword_type, Some(keyword.parse().unwrap()));
|
|
|
|
|
self.last_tag = std::mem::replace(
|
|
|
|
|
&mut self.cur_tag,
|
|
|
|
|
keyword.trim_end().trim_end_matches(':').to_string(),
|
|
|
|
|
);
|
2022-09-20 11:31:37 +00:00
|
|
|
|
}
|
|
|
|
|
/// if line matched keyword
|
|
|
|
|
fn on_keyword_match(&mut self, line: &str, keyword: &str) {
|
|
|
|
|
self.set_new_keyword(keyword);
|
|
|
|
|
|
|
|
|
|
// remember question id
|
|
|
|
|
if let Some(KeywordType::QuestionStart) = self.cur_keyword_type {
|
2022-09-23 19:37:17 +00:00
|
|
|
|
self.cur_question_pre["id"] = line.trim_end().trim_end_matches(':').into();
|
2022-09-20 11:31:37 +00:00
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
self.apply_content_for_last_keyword();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// parse next line
|
|
|
|
|
pub fn parse_line(&mut self, line: &str) {
|
|
|
|
|
match QuestionsParser::PATTERNS
|
2022-09-12 12:59:35 +00:00
|
|
|
|
.iter() // find keyword
|
|
|
|
|
.find(|&&pattern| line.starts_with(pattern) && line.ends_with(':'))
|
|
|
|
|
{
|
|
|
|
|
Some(pattern) => {
|
2022-09-20 11:31:37 +00:00
|
|
|
|
self.on_keyword_match(line, pattern);
|
2019-07-27 17:24:49 +00:00
|
|
|
|
}
|
2022-09-12 12:59:35 +00:00
|
|
|
|
None => {
|
2022-09-20 11:31:37 +00:00
|
|
|
|
self.append_to_current_content(line.to_string());
|
2022-09-12 12:59:35 +00:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2019-07-26 09:24:25 +00:00
|
|
|
|
|
2022-09-20 11:31:37 +00:00
|
|
|
|
/// finish parsing
|
|
|
|
|
pub fn finish(&mut self) {
|
|
|
|
|
if self.have_new_question && !self.cur_content.is_empty() {
|
|
|
|
|
self.cur_question[&self.cur_tag] = self.get_current_content().into();
|
|
|
|
|
self.add_cur_question();
|
|
|
|
|
self.clear_current_content();
|
|
|
|
|
self.have_new_question = false;
|
|
|
|
|
}
|
2022-09-23 19:50:42 +00:00
|
|
|
|
self.data["Вопросы"] = std::mem::replace(&mut self.questions, json::JsonValue::new_array());
|
2022-09-20 11:31:37 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// get parsed data
|
|
|
|
|
pub fn get_parsed(self) -> json::JsonValue {
|
|
|
|
|
self.data
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-10-01 09:42:32 +00:00
|
|
|
|
/// Converted contents of one source file, ready to be written to the output archive.
#[derive(Debug, Clone, PartialEq, Eq)]
struct FileText {
    /// Name of the source entry the text was produced from.
    name: String,
    /// Text to store for that entry.
    text: String,
}
|
|
|
|
|
|
2022-09-20 14:17:59 +00:00
|
|
|
|
/// Message sent from the zip reader task to the converter task.
#[derive(Debug, Clone, PartialEq, Eq)]
enum TextReaderMessage {
    /// One line of the entry that is currently being read.
    NextLine(String),
    /// The current entry has been read completely; carries the entry name.
    EndOfFile(String),
}
|
|
|
|
|
|
2022-09-20 11:35:47 +00:00
|
|
|
|
/// read txt files from zip and convert to json
|
2022-10-01 09:42:32 +00:00
|
|
|
|
async fn zip_text_reader(tx: UnboundedSender<TextReaderMessage>) {
|
2019-07-27 17:24:49 +00:00
|
|
|
|
// open archive just to list files
|
2022-10-01 09:42:32 +00:00
|
|
|
|
let archive = ZipFileReader::new(INPUT_FILENAME).await.expect("open zip");
|
2022-09-12 12:59:35 +00:00
|
|
|
|
|
2022-09-19 21:12:49 +00:00
|
|
|
|
let source_files = archive
|
2022-09-12 12:59:35 +00:00
|
|
|
|
.entries()
|
|
|
|
|
.iter()
|
|
|
|
|
.enumerate()
|
2022-09-12 19:34:17 +00:00
|
|
|
|
.filter(|(_, entry)| !entry.dir())
|
|
|
|
|
.filter(|(_, entry)| {
|
2019-07-27 17:24:49 +00:00
|
|
|
|
// skip files without "txt" extension
|
2022-09-12 19:34:17 +00:00
|
|
|
|
entry.name().ends_with(".txt")
|
2019-07-27 17:24:49 +00:00
|
|
|
|
})
|
2022-09-19 21:12:49 +00:00
|
|
|
|
.map(|(index, entry)| (index, entry.name().to_string()));
|
|
|
|
|
//
|
|
|
|
|
for (index, name) in source_files {
|
2022-10-01 09:42:32 +00:00
|
|
|
|
let entry_reader = archive.entry_reader(index).await.expect("read entry");
|
|
|
|
|
let buf_reader = BufReader::new(entry_reader);
|
|
|
|
|
let mut lines = buf_reader.lines();
|
|
|
|
|
while let Some(line) = lines.next_line().await.expect("next line") {
|
|
|
|
|
tx.send(TextReaderMessage::NextLine(line))
|
|
|
|
|
.expect("send line");
|
|
|
|
|
}
|
|
|
|
|
tx.send(TextReaderMessage::EndOfFile(name))
|
|
|
|
|
.expect("send end");
|
2022-09-19 21:12:49 +00:00
|
|
|
|
}
|
2022-09-20 08:39:53 +00:00
|
|
|
|
|
2022-10-01 09:42:32 +00:00
|
|
|
|
println!("read done ✅");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// convert text questions to json format
|
|
|
|
|
async fn questions_converter(
|
|
|
|
|
mut rx: UnboundedReceiver<TextReaderMessage>,
|
|
|
|
|
tx: UnboundedSender<FileText>,
|
|
|
|
|
) {
|
|
|
|
|
let mut parser = QuestionsParser::new();
|
|
|
|
|
while let Some(msg) = rx.recv().await {
|
|
|
|
|
match msg {
|
|
|
|
|
TextReaderMessage::NextLine(line) => {
|
|
|
|
|
let line = line.trim();
|
|
|
|
|
if line.is_empty() {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
parser.parse_line(line);
|
|
|
|
|
}
|
|
|
|
|
TextReaderMessage::EndOfFile(name) => {
|
|
|
|
|
parser.finish();
|
|
|
|
|
let data_json = parser.get_parsed();
|
|
|
|
|
let text = data_json.pretty(2);
|
|
|
|
|
tx.send(FileText { name, text }).expect("send json");
|
|
|
|
|
parser = QuestionsParser::new();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2022-09-20 08:39:53 +00:00
|
|
|
|
println!("convert done ✅");
|
2022-09-19 21:12:49 +00:00
|
|
|
|
}
|
|
|
|
|
|
2022-09-20 11:35:47 +00:00
|
|
|
|
/// write json data to zip files
|
2022-10-01 09:42:32 +00:00
|
|
|
|
async fn zip_json_writer(mut rx: UnboundedReceiver<FileText>) {
|
|
|
|
|
let file = fs::File::create(OUTPUT_FILENAME)
|
|
|
|
|
.await
|
|
|
|
|
.expect("create file");
|
2022-09-23 20:01:57 +00:00
|
|
|
|
let mut buf = BufWriter::with_capacity(100 * 1024 * 1024, file);
|
|
|
|
|
let mut writer = ZipFileWriter::new(&mut buf);
|
2022-09-19 21:12:49 +00:00
|
|
|
|
|
2022-10-01 09:42:32 +00:00
|
|
|
|
while let Some(FileText { name, text: data }) = rx.recv().await {
|
2022-09-20 14:17:59 +00:00
|
|
|
|
// make output filename
|
|
|
|
|
let mut outfilename = PathBuf::from(name);
|
|
|
|
|
outfilename.set_extension("json");
|
|
|
|
|
let outfilename = outfilename.to_str().unwrap().to_string();
|
|
|
|
|
let opts = EntryOptions::new(outfilename, OUTPUT_COMPRESSION);
|
2022-09-19 21:16:04 +00:00
|
|
|
|
|
2022-09-20 14:17:59 +00:00
|
|
|
|
// write new zip entry
|
|
|
|
|
writer
|
|
|
|
|
.write_entry_whole(opts, data.as_bytes())
|
|
|
|
|
.await
|
2022-10-01 09:42:32 +00:00
|
|
|
|
.expect("write entry");
|
2022-09-19 21:12:49 +00:00
|
|
|
|
}
|
2022-10-01 09:42:32 +00:00
|
|
|
|
writer.close().await.expect("close writer");
|
|
|
|
|
buf.flush().await.expect("flush buffer");
|
2022-09-20 11:31:37 +00:00
|
|
|
|
|
2022-09-20 08:39:53 +00:00
|
|
|
|
println!("write done ✅");
|
2022-09-19 21:12:49 +00:00
|
|
|
|
}
|
2022-08-25 12:50:37 +00:00
|
|
|
|
|
2022-09-19 21:12:49 +00:00
|
|
|
|
#[tokio::main]
|
|
|
|
|
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
2022-09-12 19:34:17 +00:00
|
|
|
|
// check output filename
|
|
|
|
|
match fs::metadata(OUTPUT_FILENAME).await {
|
|
|
|
|
Ok(x) if x.is_dir() => return Err("output file is a directory!".into()),
|
2022-09-12 12:59:35 +00:00
|
|
|
|
_ => (),
|
2022-08-25 11:45:03 +00:00
|
|
|
|
};
|
2022-08-25 12:50:37 +00:00
|
|
|
|
|
2022-10-01 09:42:32 +00:00
|
|
|
|
let (reader_tx, reader_rx) = mpsc::unbounded_channel::<TextReaderMessage>();
|
|
|
|
|
let (json_tx, json_rx) = mpsc::unbounded_channel::<FileText>();
|
2022-09-19 21:12:49 +00:00
|
|
|
|
|
2022-09-20 08:39:53 +00:00
|
|
|
|
tokio::try_join!(
|
2022-10-01 09:42:32 +00:00
|
|
|
|
tokio::spawn(zip_text_reader(reader_tx)),
|
|
|
|
|
tokio::spawn(questions_converter(reader_rx, json_tx)),
|
|
|
|
|
tokio::spawn(zip_json_writer(json_rx))
|
2022-09-20 08:39:53 +00:00
|
|
|
|
)?;
|
2022-08-25 12:50:37 +00:00
|
|
|
|
|
2022-09-20 08:39:53 +00:00
|
|
|
|
println!("all done ✅");
|
2019-07-27 17:24:49 +00:00
|
|
|
|
Ok(())
|
2019-07-25 09:02:25 +00:00
|
|
|
|
}
|