summaryrefslogtreecommitdiffhomepage
path: root/android
diff options
context:
space:
mode:
authorJanito Vaqueiro Ferreira Filho <janito@mullvad.net>2021-05-26 19:57:15 +0000
committerJanito Vaqueiro Ferreira Filho <janito@mullvad.net>2021-05-28 11:54:59 +0000
commita4e9b174b438b0b96a35576b89817d66205059d1 (patch)
tree48be552b0f4b972de554bfc0f39c7675f26156f0 /android
parent15aa1e81984a94895969daf27082c849222b4367 (diff)
downloadmullvadvpn-a4e9b174b438b0b96a35576b89817d66205059d1.tar.xz
mullvadvpn-a4e9b174b438b0b96a35576b89817d66205059d1.zip
Refactor parser into a clearer state machine
Make it more robust, maintainable and readable.
Diffstat (limited to 'android')
-rw-r--r--android/translations-converter/src/gettext/parser.rs395
1 files changed, 322 insertions, 73 deletions
diff --git a/android/translations-converter/src/gettext/parser.rs b/android/translations-converter/src/gettext/parser.rs
index e32143a368..fe9cac1469 100644
--- a/android/translations-converter/src/gettext/parser.rs
+++ b/android/translations-converter/src/gettext/parser.rs
@@ -1,4 +1,4 @@
-use super::{messages::Messages, msg_string::MsgString, parse_line, PluralForm};
+use super::{Messages, MsgString, PluralForm};
use derive_more::{Display, Error};
use std::{collections::BTreeMap, mem};
@@ -38,12 +38,50 @@ use std::{collections::BTreeMap, mem};
/// msgstr[1] "%d tradukitaj mesaĝoj"
/// ```
#[derive(Debug)]
-pub struct Parser {
- parsing_header: bool,
- messages: Messages,
- current_id: Option<MsgString>,
- current_plural_id: Option<MsgString>,
- variants: BTreeMap<usize, MsgString>,
+pub enum Parser {
+ /// Initial state.
+ ///
+ /// No useful information has been extracted yet.
+ Start,
+
+ /// Possible start of the file header.
+ ///
+ /// Found an empty message ID. If the next line is an empty message string, the header of the
+ /// file has been found.
+ HeaderStart,
+
+ /// Inside the file header, looking for the `Plural-Forms` header line.
+ Header,
+
+ /// Skipping to the end of the header.
+ ///
+ /// The plural form has already been extracted, so the remaining header lines are ignored.
+ HeaderEnd(Messages),
+
+ /// Waiting for the next message entry.
+ ///
+ /// Parser has completed parsing either at least one valid entry or the file header.
+ Idle(Messages),
+
+ /// New message entry.
+ ///
+ /// Parsed a new message ID and is waiting for its message string or plural ID.
+ NewEntry { id: MsgString, messages: Messages },
+
+ /// Detected that the entry is for a plural.
+ ///
+ /// Found a plural ID and is collecting the indexed plural variants.
+ PluralEntry {
+ id: MsgString,
+ plural_id: MsgString,
+ variants: BTreeMap<usize, MsgString>,
+ messages: Messages,
+ },
+
+ /// Internal transition state.
+ ///
+ /// Placeholder left in the parser while a line is being parsed.
+ Parsing,
}
impl Parser {
@@ -52,93 +90,296 @@ impl Parser {
/// Parsing can then be done by feeding lines to the instance using [`Parser::parse_line`] and
/// finishing with a call to [`Parser::finish`] to obtain the parsed result.
pub fn new() -> Self {
- Parser {
- parsing_header: false,
- messages: Messages::default(),
- current_id: None,
- current_plural_id: None,
- variants: BTreeMap::new(),
- }
+ Parser::Start
}
/// Parse an input line.
+ ///
+ /// The line is handled according to the current state, which is then replaced by the resulting
+ /// state. If an error is returned the parser is left in the internal `Parsing` state, and any
+ /// further call to this method or to [`Parser::finish`] will panic.
pub fn parse_line(&mut self, line: &str) -> Result<(), Error> {
- match_str! { (line.trim())
- ["msgid \"", msg_id, "\""] => {
- self.current_id = Some(MsgString::from_escaped(msg_id));
+ // Move the state out of `self`, leaving a placeholder until the new state is known
+ let current = mem::replace(self, Parser::Parsing);
+
+ let next = match current {
+ Parser::Parsing => unreachable!("Parser should never stop on the Parsing state"),
+ Parser::Start => Self::parse_start(line)?,
+ Parser::HeaderStart => Self::parse_header_start(line)?,
+ Parser::Header => Self::parse_header(line)?,
+ Parser::HeaderEnd(messages) => Self::parse_header_end(line, messages)?,
+ Parser::Idle(messages) => Self::parse_idle(line, messages)?,
+ Parser::NewEntry { id, messages } => Self::parse_new_entry(line, id, messages)?,
+ Parser::PluralEntry { id, plural_id, variants, messages } => {
+ Self::parse_plural_entry(line, id, plural_id, variants, messages)?
+ }
+ };
+
+ *self = next;
+
+ Ok(())
+ }
+
+ /// Finish parsing and obtain the parsed [`Messages`].
+ ///
+ /// Consumes the parser. Returns an error if the input ended in the middle of an entry: after a
+ /// message ID that never received a message string, or inside a plural entry that is missing
+ /// a variant index.
+ pub fn finish(self) -> Result<Messages, Error> {
+ match self {
+ // Nothing was collected: the input was empty or only had headers that were ignored
+ Parser::Start | Parser::Header => Ok(Messages::default()),
+
+ // Either only the header (with its plural form) or complete entries were collected
+ Parser::HeaderEnd(messages) | Parser::Idle(messages) => Ok(messages),
+
+ // A single empty msgid was parsed, without a msgstr for that entry (or header)
+ Parser::HeaderStart => Err(Error::IncompleteEntry(MsgString::empty())),
+
+ // Input ends after a message ID that has no message string
+ Parser::NewEntry { id, .. } => Err(Error::IncompleteEntry(id)),
+
+ // Input ends inside a plural entry, which is accepted if no variant index is missing
+ Parser::PluralEntry { id, plural_id, variants, mut messages } => {
+ let ordered_variants = collect_variants(&id, variants)?;
+ messages.add_plural(id, plural_id, ordered_variants);
+ Ok(messages)
+ }
+
+ Parser::Parsing => unreachable!("Parser should never stop on the Parsing state"),
+ }
+ }
+
+ /// Handle a line while nothing has been parsed yet.
+ ///
+ /// Decides whether the file begins with a header, a message context or a regular entry.
+ fn parse_start(line: &str) -> Result<Parser, Error> {
+ let transition = match_str! { (line)
+ // Blank lines and comments carry no information
+ [""] | ["#", ..] => Parser::Start,
+
+ // Headers don't have context, so skip it and get ready to parse entries
+ ["msgctxt ", ..] => Parser::Idle(Messages::default()),
+
+ // An empty message ID may indicate the start of the header
+ ["msgid \"\""] => Parser::HeaderStart,
+
+ // A non-empty message ID means the file went directly to its first entry
+ ["msgid \"", first_id, "\""] => Parser::NewEntry {
+ id: MsgString::from_escaped(first_id),
+ messages: Messages::default()
},
- ["msgstr \"", translation, "\""] => {
- if let Some(id) = self.current_id.take() {
- self.parsing_header = id.is_empty() && translation.is_empty();
- self.messages.add(id, MsgString::from_escaped(translation));
- }
- self.current_id = None;
- self.current_plural_id = None;
+ unexpected => return Err(Error::UnexpectedLine(unexpected.to_owned())),
+ };
+
+ Ok(transition)
+ }
+
+ /// Handle the line that follows an empty message ID.
+ ///
+ /// An empty message string confirms the file header, while a non-empty message string or a
+ /// plural ID means the empty ID belongs to a regular entry.
+ fn parse_header_start(line: &str) -> Result<Parser, Error> {
+ let transition = match_str! { (line)
+ // Comments don't change what is expected next
+ ["#", ..] => Parser::HeaderStart,
+
+ // `msgid ""` followed by `msgstr ""` is the start of the header
+ ["msgstr \"\""] => Parser::Header,
+
+ // Otherwise the message string is the translation of an entry with an empty ID
+ ["msgstr \"", translation, "\""] => Parser::Idle(
+ Messages::starting_with(MsgString::empty(), MsgString::from_escaped(translation))
+ ),
+
+ // The empty ID is the singular ID of a plural entry
+ ["msgid_plural \"", plural, "\""] => Parser::PluralEntry {
+ id: MsgString::empty(),
+ plural_id: MsgString::from_escaped(plural),
+ variants: BTreeMap::new(),
+ messages: Messages::default(),
+ },
+
+ unexpected => return Err(Error::UnexpectedLine(unexpected.to_owned())),
+ };
+
+ Ok(transition)
+ }
+
+ /// Handle a line inside the file header.
+ ///
+ /// Looks for the `Plural-Forms` header line and skips every other header line. Returns
+ /// [`Error::UnrecognizedPluralFormula`] if the plural formula is not recognized, and
+ /// [`Error::UnexpectedLine`] for a line that is not a comment, a header line or an empty line.
+ fn parse_header(line: &str) -> Result<Parser, Error> {
+ let next_state = match_str! { (line)
+ // Ignore comment lines, staying inside the header so that the following header lines
+ // are still accepted
+ ["#", ..] => Parser::Header,
+
+ // An empty line marks the end of the header
+ [""] => Parser::Idle(Messages::default()),
+
+ // The Plural-Forms header is the only header that's currently used, so after finding
+ // it the parser can skip to the end of the headers
+ ["\"Plural-Forms: ", plural_formula, ";\\n\""] => {
+ let plural_form = PluralForm::from_formula(plural_formula)
+ .ok_or_else(|| Error::UnrecognizedPluralFormula(plural_formula.to_owned()))?;
+
+ Parser::HeaderEnd(Messages::with_plural_form(plural_form))
},
- ["msgid_plural \"", plural_id, "\""] => {
- self.current_plural_id = Some(MsgString::from_escaped(plural_id));
- self.parsing_header = false;
+
+ // Skip other headers
+ ["\"", .., "\\n\""] => Parser::Header,
+
+ other => return Err(Error::UnexpectedLine(other.to_owned())),
+ };
+
+ Ok(next_state)
+ }
+
+ /// Skip the remaining header lines once the plural form has been extracted.
+ ///
+ /// `messages` is carried along unchanged until an empty line ends the header.
+ fn parse_header_end(line: &str, messages: Messages) -> Result<Parser, Error> {
+ let transition = match_str! { (line)
+ // Comments inside the header are skipped
+ ["#", ..] => Parser::HeaderEnd(messages),
+
+ // The remaining header lines are not used by the parser
+ ["\"", .., "\\n\""] => Parser::HeaderEnd(messages),
+
+ // The header is finished by an empty line
+ [""] => Parser::Idle(messages),
+
+ unexpected => return Err(Error::UnexpectedLine(unexpected.to_owned())),
+ };
+
+ Ok(transition)
+ }
+
+ /// Handle a line while waiting for the next message entry.
+ ///
+ /// Only a message ID starts a new entry; blank, comment and context lines are skipped.
+ fn parse_idle(line: &str, messages: Messages) -> Result<Parser, Error> {
+ let transition = match_str! { (line)
+ // Nothing to extract from empty lines, comment lines and message context lines
+ [""] | ["#", ..] | ["msgctxt ", ..] => Parser::Idle(messages),
+
+ // A message ID opens a new entry
+ ["msgid \"", entry_id, "\""] => Parser::NewEntry {
+ id: MsgString::from_escaped(entry_id),
+ messages,
},
- ["msgstr[", plural_translation, "\""] => {
- let variant_id_end = plural_translation
- .chars()
- .position(|character| character == ']')
- .ok_or_else(|| Error::InvalidPluralVariant(plural_translation.to_owned()))?;
- let variant_id: usize = plural_translation[..variant_id_end]
- .parse()
- .map_err(|_| {
- Error::InvalidPluralIndex(plural_translation[..variant_id_end].to_owned())
- })?;
- let variant_msg = parse_line(&plural_translation[variant_id_end..], "] \"", "")
- .ok_or_else(|| Error::InvalidPluralVariant(plural_translation.to_owned()))?;
- self.variants.insert(variant_id, MsgString::from_escaped(variant_msg));
- self.parsing_header = false;
+ unexpected => return Err(Error::UnexpectedLine(unexpected.to_owned())),
+ };
+
+ Ok(transition)
+ }
+
+ /// Handle the line that follows a message ID.
+ ///
+ /// A message string completes a singular entry, which is added to `messages`, while a plural
+ /// ID turns the entry into a plural entry that still needs its variants.
+ fn parse_new_entry(line: &str, id: MsgString, mut messages: Messages) -> Result<Parser, Error> {
+ let transition = match_str! { (line)
+ // Comments don't change what is expected next
+ ["#", ..] => Parser::NewEntry { id, messages },
+
+ // The message string completes an entry that has no plural variants
+ ["msgstr \"", translation, "\""] => {
+ let translation = MsgString::from_escaped(translation);
+ messages.add(id, translation);
+
+ Parser::Idle(messages)
},
- ["\"", header, "\\n\""] => {
- if self.parsing_header {
- if let Some(plural_formula) = parse_line(header, "Plural-Forms: ", ";") {
- self.messages.plural_form = PluralForm::from_formula(plural_formula);
- }
- }
+
+ // The plural ID starts a plural entry, with no variants collected so far
+ ["msgid_plural \"", plural, "\""] => Parser::PluralEntry {
+ id,
+ plural_id: MsgString::from_escaped(plural),
+ variants: BTreeMap::new(),
+ messages,
},
- line => {
- if let Some(plural_id) = self.current_plural_id.take() {
- let id = self.current_id.take()
- .ok_or_else(|| Error::UnexpectedLine(line.to_owned()))?;
- let values = mem::replace(&mut self.variants, BTreeMap::new())
- .into_iter()
- .enumerate()
- .map(|(index, (variant_id, value))| {
- if index == variant_id {
- Ok(value)
- } else {
- Err(Error::IncompletePluralEntry(id.clone()))
- }
- })
- .collect::<Result<Vec<_>, Error>>()?;
+ unexpected => return Err(Error::UnexpectedLine(unexpected.to_owned())),
+ };
+
+ Ok(transition)
+ }
+
+ /// Handle a line inside a plural entry.
+ ///
+ /// Each `msgstr[N]` line stores one variant. An empty line ends the entry, which is added to
+ /// `messages` only if no variant index is missing.
+ fn parse_plural_entry(
+ line: &str,
+ id: MsgString,
+ plural_id: MsgString,
+ mut variants: BTreeMap<usize, MsgString>,
+ mut messages: Messages,
+ ) -> Result<Parser, Error> {
+ let transition = match_str! { (line)
+ // Comments don't change what is expected next
+ ["#", ..] => Parser::PluralEntry { id, plural_id, variants, messages },
+
+ // One more plural variant, stored under its index
+ ["msgstr[", index_and_string, "\""] => {
+ let (variant_index, variant_message) = extract_plural_variant(index_and_string)?;
- self.messages.add_plural(id, plural_id, values);
+ variants.insert(variant_index, variant_message);
+
+ Parser::PluralEntry {
+ id,
+ plural_id,
+ variants,
+ messages,
}
+ },
+
+ // The plural entry is finished by an empty line
+ [""] => {
+ let ordered_variants = collect_variants(&id, variants)?;
- self.current_id = None;
- self.current_plural_id = None;
- self.variants.clear();
- self.parsing_header = false;
+ messages.add_plural(id, plural_id, ordered_variants);
+
+ Parser::Idle(messages)
},
- }
- Ok(())
+ unexpected => return Err(Error::UnexpectedLine(unexpected.to_owned())),
+ };
+
+ Ok(transition)
}
+}
- /// Finish parsing and obtain the parsed [`Messages].
- pub fn finish(mut self) -> Result<Messages, Error> {
- self.parse_line("")?;
+/// Helper function to split a plural variant into its index and its message.
+///
+/// For a plural line such as `msgstr[1] "%d tradukitaj mesaĝoj"`, the parser strips the `msgstr[`
+/// prefix and the `"` suffix before calling this function. The remaining input
+/// (`1] "%d tradukitaj mesaĝoj`) is split at the first `] "` separator: the text before it is
+/// parsed as the index (1) and the text after it is the message string.
+///
+/// Returns [`Error::InvalidPluralVariant`] if the separator is missing and
+/// [`Error::InvalidPluralIndex`] if the index is not a valid number. Both errors carry the
+/// reconstructed input line.
+fn extract_plural_variant(index_and_string: &str) -> Result<(usize, MsgString), Error> {
+ // The prefix and suffix were stripped by the caller, so put them back for error reporting
+ let original_line = || format!("msgstr[{}\"", index_and_string);
+
+ let mut pieces = index_and_string.splitn(2, "] \"");
+
+ let (raw_index, raw_message) = match (pieces.next(), pieces.next()) {
+ (Some(raw_index), Some(raw_message)) => (raw_index, raw_message),
+ _ => return Err(Error::InvalidPluralVariant(original_line())),
+ };
+
+ let index = raw_index
+ .parse::<usize>()
+ .map_err(|_| Error::InvalidPluralIndex(original_line()))?;
+
+ Ok((index, MsgString::from_escaped(raw_message)))
+}
- Ok(self.messages)
+/// Helper function to collect the parsed plural variants.
+///
+/// Returns only the variant messages, ordered by index. Fails with
+/// [`Error::IncompletePluralEntry`] if any index in `0..variant_map.len()` is missing.
+fn collect_variants(
+ id: &MsgString,
+ variant_map: BTreeMap<usize, MsgString>,
+) -> Result<Vec<MsgString>, Error> {
+ // With N unique keys, having every index in 0..N present means there are no gaps
+ let has_missing_index = (0..variant_map.len()).any(|index| !variant_map.contains_key(&index));
+
+ if has_missing_index {
+ return Err(Error::IncompletePluralEntry(id.clone()));
}
+
+ let ordered_messages: Vec<MsgString> =
+ variant_map.into_iter().map(|(_, message)| message).collect();
+
+ Ok(ordered_messages)
}
/// Parsing errors.
@@ -148,6 +389,14 @@ pub enum Error {
#[display(fmt = "Unexpected line parsing gettext messages: {}", _0)]
UnexpectedLine(#[error(not(source))] String),
+ /// Input uses an unrecognized plural formula.
+ #[display(fmt = "Input uses an unrecognized formula for the plural form: {}", _0)]
+ UnrecognizedPluralFormula(#[error(not(source))] String),
+
+ /// Input ended with an incomplete entry.
+ #[display(fmt = "Input ended with an incomplete gettext entry with ID: {}", _0)]
+ IncompleteEntry(#[error(not(source))] MsgString),
+
/// Plural entry definition is missing a plural variant.
#[display(fmt = "Plural entry is missing a plural variant: {}", _0)]
IncompletePluralEntry(#[error(not(source))] MsgString),