diff options
| author | Janito Vaqueiro Ferreira Filho <janito@mullvad.net> | 2021-05-26 19:57:15 +0000 |
|---|---|---|
| committer | Janito Vaqueiro Ferreira Filho <janito@mullvad.net> | 2021-05-28 11:54:59 +0000 |
| commit | a4e9b174b438b0b96a35576b89817d66205059d1 (patch) | |
| tree | 48be552b0f4b972de554bfc0f39c7675f26156f0 /android | |
| parent | 15aa1e81984a94895969daf27082c849222b4367 (diff) | |
| download | mullvadvpn-a4e9b174b438b0b96a35576b89817d66205059d1.tar.xz mullvadvpn-a4e9b174b438b0b96a35576b89817d66205059d1.zip | |
Refactor parser into a clearer state machine
Make it more robust, maintainable and readable.
Diffstat (limited to 'android')
| -rw-r--r-- | android/translations-converter/src/gettext/parser.rs | 395 |
1 files changed, 322 insertions, 73 deletions
diff --git a/android/translations-converter/src/gettext/parser.rs b/android/translations-converter/src/gettext/parser.rs index e32143a368..fe9cac1469 100644 --- a/android/translations-converter/src/gettext/parser.rs +++ b/android/translations-converter/src/gettext/parser.rs @@ -1,4 +1,4 @@ -use super::{messages::Messages, msg_string::MsgString, parse_line, PluralForm}; +use super::{Messages, MsgString, PluralForm}; use derive_more::{Display, Error}; use std::{collections::BTreeMap, mem}; @@ -38,12 +38,50 @@ use std::{collections::BTreeMap, mem}; /// msgstr[1] "%d tradukitaj mesaĝoj" /// ``` #[derive(Debug)] -pub struct Parser { - parsing_header: bool, - messages: Messages, - current_id: Option<MsgString>, - current_plural_id: Option<MsgString>, - variants: BTreeMap<usize, MsgString>, +pub enum Parser { + /// Initial state. + /// + /// No useful information has been extracted yet. + Start, + + /// Possible start of file header. + /// + /// Found an empty message ID, if the next line is an empty message string the header of the + /// file has been found. + HeaderStart, + + /// Start of file header found. + Header, + + /// Skipping to the end of the header. + /// + /// The useful information has already been extracted. + HeaderEnd(Messages), + + /// Waiting for a next message section. + /// + /// Parser has completed parsing either at least one valid entry or the file header. + Idle(Messages), + + /// New message entry. + /// + /// Parsed a new message ID. + NewEntry { id: MsgString, messages: Messages }, + + /// Detected that entry is for a plural. + /// + /// Found a plural ID. + PluralEntry { + id: MsgString, + plural_id: MsgString, + variants: BTreeMap<usize, MsgString>, + messages: Messages, + }, + + /// Internal transition state. + /// + /// Used while a line is being parsed. 
+ Parsing, } impl Parser { @@ -52,93 +90,296 @@ impl Parser { /// Parsing can then be done by feeding lines to the instance using [`Parser::parse_line`] and /// finishing with a call to [`Parser::finish`] to obtain the parsed result. pub fn new() -> Self { - Parser { - parsing_header: false, - messages: Messages::default(), - current_id: None, - current_plural_id: None, - variants: BTreeMap::new(), - } + Parser::Start } /// Parse an input line. pub fn parse_line(&mut self, line: &str) -> Result<(), Error> { - match_str! { (line.trim()) - ["msgid \"", msg_id, "\""] => { - self.current_id = Some(MsgString::from_escaped(msg_id)); + let state = mem::replace(self, Parser::Parsing); + + *self = match state { + Parser::Start => Self::parse_start(line)?, + Parser::HeaderStart => Self::parse_header_start(line)?, + Parser::Header => Self::parse_header(line)?, + Parser::HeaderEnd(messages) => Self::parse_header_end(line, messages)?, + Parser::Idle(messages) => Self::parse_idle(line, messages)?, + Parser::NewEntry { id, messages } => Self::parse_new_entry(line, id, messages)?, + Parser::PluralEntry { + id, + plural_id, + variants, + messages, + } => Self::parse_plural_entry(line, id, plural_id, variants, messages)?, + Parser::Parsing => unreachable!("Parser should never stop on the Parsing state"), + }; + + Ok(()) + } + + /// Finish parsing and obtain the parsed [`Messages`].
+ pub fn finish(self) -> Result<Messages, Error> { + match self { + // Input file is empty + Parser::Start => Ok(Messages::default()), + + // A single empty msgid was parsed, but no msgstr for that entry (or header) + Parser::HeaderStart => Err(Error::IncompleteEntry(MsgString::empty())), + + // Input file only contains headers that were ignored + Parser::Header => Ok(Messages::default()), + + // Input file only contains headers, but the plural form was successfully parsed + Parser::HeaderEnd(messages) => Ok(messages), + + // Parsing successful + Parser::Idle(messages) => Ok(messages), + + // Input file ends on an incomplete entry + Parser::NewEntry { id, .. } => Err(Error::IncompleteEntry(id)), + + // Input file ends with a plural entry (it might be missing variants) + Parser::PluralEntry { + id, + plural_id, + variants, + mut messages, + } => { + let variants = collect_variants(&id, variants)?; + + messages.add_plural(id, plural_id, variants); + + Ok(messages) + } + + Parser::Parsing => unreachable!("Parser should never stop on the Parsing state"), + } + } + + fn parse_start(line: &str) -> Result<Parser, Error> { + let next_state = match_str! { (line) + // Ignore empty lines and comment lines + [""] | ["#", ..] => Parser::Start, + + // An empty message ID may indicate the start of the header + ["msgid \"\""] => Parser::HeaderStart, + + // Headers don't have context, so skip it and get ready to parse entries + ["msgctxt ", ..] 
=> Parser::Idle(Messages::default()), + + // File has no header, went directly to the first entry + ["msgid \"", msg_id, "\""] => Parser::NewEntry { + id: MsgString::from_escaped(msg_id), + messages: Messages::default() }, - ["msgstr \"", translation, "\""] => { - if let Some(id) = self.current_id.take() { - self.parsing_header = id.is_empty() && translation.is_empty(); - self.messages.add(id, MsgString::from_escaped(translation)); - } - self.current_id = None; - self.current_plural_id = None; + other => return Err(Error::UnexpectedLine(other.to_owned())), + }; + + Ok(next_state) + } + + fn parse_header_start(line: &str) -> Result<Parser, Error> { + let next_state = match_str! { (line) + // Ignore comment lines + ["#", ..] => Parser::HeaderStart, + + // An empty message string confirms the start of the header + ["msgstr \"\""] => Parser::Header, + + // A non-empty message string means an entry with an empty ID has been parsed + ["msgstr \"", string, "\""] => Parser::Idle( + Messages::starting_with(MsgString::empty(), MsgString::from_escaped(string)) + ), + + // A plural ID means this is the start of a plural entry with an empty ID + ["msgid_plural \"", plural_id, "\""] => Parser::PluralEntry { + id: MsgString::empty(), + plural_id: MsgString::from_escaped(plural_id), + variants: BTreeMap::new(), + messages: Messages::default(), + }, + + other => return Err(Error::UnexpectedLine(other.to_owned())), + }; + + Ok(next_state) + } + + fn parse_header(line: &str) -> Result<Parser, Error> { + let next_state = match_str! { (line) + // Ignore comment lines + ["#", ..] 
=> Parser::HeaderStart, + + // An empty line marks the end of the header + [""] => Parser::Idle(Messages::default()), + + // The Plural-Forms header is the only header that's currently used, so after finding + // it the parser can skip to the end of the headers + ["\"Plural-Forms: ", plural_formula, ";\\n\""] => { + let plural_form = PluralForm::from_formula(plural_formula) + .ok_or_else(|| Error::UnrecognizedPluralFormula(plural_formula.to_owned()))?; + + Parser::HeaderEnd(Messages::with_plural_form(plural_form)) }, - ["msgid_plural \"", plural_id, "\""] => { - self.current_plural_id = Some(MsgString::from_escaped(plural_id)); - self.parsing_header = false; + + // Skip other headers + ["\"", .., "\\n\""] => Parser::Header, + + other => return Err(Error::UnexpectedLine(other.to_owned())), + }; + + Ok(next_state) + } + + fn parse_header_end(line: &str, messages: Messages) -> Result<Parser, Error> { + let next_state = match_str! { (line) + // An empty line marks the end of the header + [""] => Parser::Idle(messages), + + // Ignore comment lines + ["#", ..] => Parser::HeaderEnd(messages), + + // Skip any other headers + ["\"", .., "\\n\""] => Parser::HeaderEnd(messages), + + other => return Err(Error::UnexpectedLine(other.to_owned())), + }; + + Ok(next_state) + } + + fn parse_idle(line: &str, messages: Messages) -> Result<Parser, Error> { + let next_state = match_str! { (line) + // Ignore empty lines, comment lines and message context lines + [""] | ["#", ..] | ["msgctxt ", ..] 
=> Parser::Idle(messages), + + // Start of a new message entry + ["msgid \"", msg_id, "\""] => Parser::NewEntry { + id: MsgString::from_escaped(msg_id), + messages, }, - ["msgstr[", plural_translation, "\""] => { - let variant_id_end = plural_translation - .chars() - .position(|character| character == ']') - .ok_or_else(|| Error::InvalidPluralVariant(plural_translation.to_owned()))?; - let variant_id: usize = plural_translation[..variant_id_end] - .parse() - .map_err(|_| { - Error::InvalidPluralIndex(plural_translation[..variant_id_end].to_owned()) - })?; - let variant_msg = parse_line(&plural_translation[variant_id_end..], "] \"", "") - .ok_or_else(|| Error::InvalidPluralVariant(plural_translation.to_owned()))?; - self.variants.insert(variant_id, MsgString::from_escaped(variant_msg)); - self.parsing_header = false; + other => return Err(Error::UnexpectedLine(other.to_owned())), + }; + + Ok(next_state) + } + + fn parse_new_entry(line: &str, id: MsgString, mut messages: Messages) -> Result<Parser, Error> { + let next_state = match_str! { (line) + // Ignore comment lines + ["#", ..] 
=> Parser::NewEntry { id, messages }, + + // A message string for an invariant entry + ["msgstr \"", string, "\""] => { + messages.add(id, MsgString::from_escaped(string)); + + Parser::Idle(messages) }, - ["\"", header, "\\n\""] => { - if self.parsing_header { - if let Some(plural_formula) = parse_line(header, "Plural-Forms: ", ";") { - self.messages.plural_form = PluralForm::from_formula(plural_formula); - } - } + + // A plural ID means this is the start of a plural entry + ["msgid_plural \"", plural_id, "\""] => Parser::PluralEntry { + id, + plural_id: MsgString::from_escaped(plural_id), + variants: BTreeMap::new(), + messages, }, - line => { - if let Some(plural_id) = self.current_plural_id.take() { - let id = self.current_id.take() - .ok_or_else(|| Error::UnexpectedLine(line.to_owned()))?; - let values = mem::replace(&mut self.variants, BTreeMap::new()) - .into_iter() - .enumerate() - .map(|(index, (variant_id, value))| { - if index == variant_id { - Ok(value) - } else { - Err(Error::IncompletePluralEntry(id.clone())) - } - }) - .collect::<Result<Vec<_>, Error>>()?; + other => return Err(Error::UnexpectedLine(other.to_owned())), + }; + + Ok(next_state) + } + + fn parse_plural_entry( + line: &str, + id: MsgString, + plural_id: MsgString, + mut variants: BTreeMap<usize, MsgString>, + mut messages: Messages, + ) -> Result<Parser, Error> { + let next_state = match_str! { (line) + // Ignore comment lines + ["#", ..] 
=> Parser::PluralEntry { id, plural_id, variants, messages }, + + // A message string for a plural variant + ["msgstr[", index_and_string, "\""] => { + let (index, message) = extract_plural_variant(index_and_string)?; - self.messages.add_plural(id, plural_id, values); + variants.insert(index, message); + + Parser::PluralEntry { + id, + plural_id, + variants, + messages, } + }, + + // An empty line marks the end of the plural entry + [""] => { + let variants = collect_variants(&id, variants)?; - self.current_id = None; - self.current_plural_id = None; - self.variants.clear(); - self.parsing_header = false; + messages.add_plural(id, plural_id, variants); + + Parser::Idle(messages) }, - } - Ok(()) + other => return Err(Error::UnexpectedLine(other.to_owned())), + }; + + Ok(next_state) } +} - /// Finish parsing and obtain the parsed [`Messages]. - pub fn finish(mut self) -> Result<Messages, Error> { - self.parse_line("")?; +/// Helper function to extract the plural variant index and message. +/// +/// The parser will try to parse a plural line of the form `msgstr[1] "%d tradukitaj mesaĝoj"`. +/// When matching the line to the expected template, it will remove the `msgstr[` prefix and the +/// `"` suffix. This function will then parse the rest of the string (`1] "%d tradukitaj mesaĝoj`) +/// by extracting the index (1), and then extracting the message string by skipping the separator +/// (`] "`). 
+fn extract_plural_variant(index_and_string: &str) -> Result<(usize, MsgString), Error> { + let recreate_line = || format!("msgstr[{}\"", index_and_string); + + let parts: Vec<_> = index_and_string.splitn(2, "] \"").collect(); + + if parts.len() != 2 { + return Err(Error::InvalidPluralVariant(recreate_line())); + } + + let index_string = parts[0]; + let message_string = parts[1]; + + let index = index_string + .parse() + .map_err(|_| Error::InvalidPluralIndex(recreate_line()))?; + + let variant_message = MsgString::from_escaped(message_string); + + Ok((index, variant_message)) +} - Ok(self.messages) +/// Helper function to collect parsed variants. +/// +/// This will return only the variant messages in index order. The function will return an error if +/// any variant index is missing. +fn collect_variants( + id: &MsgString, + variant_map: BTreeMap<usize, MsgString>, +) -> Result<Vec<MsgString>, Error> { + let index_count = variant_map.len(); + + for index in 0..index_count { + if !variant_map.contains_key(&index) { + return Err(Error::IncompletePluralEntry(id.clone())); + } } + + Ok(variant_map + .into_iter() + .map(|(_, variant)| variant) + .collect()) } /// Parsing errors. @@ -148,6 +389,14 @@ pub enum Error { #[display(fmt = "Unexpected line parsing gettext messages: {}", _0)] UnexpectedLine(#[error(not(source))] String), + /// Input uses an unrecognized plural formula. + #[display(fmt = "Input uses an unrecognized formula for the plural form: {}", _0)] + UnrecognizedPluralFormula(#[error(not(source))] String), + + /// Input ended with an incomplete entry. + #[display(fmt = "Input ended with an incomplete gettext entry with ID: {}", _0)] + IncompleteEntry(#[error(not(source))] MsgString), + /// Plural entry definition is missing a plural variant. #[display(fmt = "Plural entry is missing a plural variant: {}", _0)] IncompletePluralEntry(#[error(not(source))] MsgString), |
