diff options
| author | HampusM <hampus@hampusmat.com> | 2023-04-15 18:26:29 +0200 | 
|---|---|---|
| committer | HampusM <hampus@hampusmat.com> | 2023-05-09 19:51:02 +0200 | 
| commit | e762babd9e69400ccd178ba8946168640093eb63 (patch) | |
| tree | e07a56940f0c4a3551c87afad80bb949969335c7 | |
| parent | da509c366972ac6d423f95732cd3d319a2265841 (diff) | |
feat: add deserialization
| -rw-r--r-- | Cargo.toml | 3 | ||||
| -rw-r--r-- | src/attribute.rs | 170 | ||||
| -rw-r--r-- | src/deserializer/buffered.rs | 212 | ||||
| -rw-r--r-- | src/deserializer/mod.rs | 179 | ||||
| -rw-r--r-- | src/event.rs | 54 | ||||
| -rw-r--r-- | src/lib.rs | 54 | ||||
| -rw-r--r-- | src/tagged.rs | 62 | 
7 files changed, 734 insertions, 0 deletions
| @@ -5,3 +5,6 @@ edition = "2021"  license = "MIT OR Apache-2.0"  [dependencies] +quick-xml = "0.27.1" +thiserror = "1.0.38" + diff --git a/src/attribute.rs b/src/attribute.rs new file mode 100644 index 0000000..8fb4778 --- /dev/null +++ b/src/attribute.rs @@ -0,0 +1,170 @@ +//! Attribute. + +use quick_xml::events::attributes::{ +    AttrError, +    Attribute as QuickXMLAttribute, +    Attributes, +}; + +/// Represents an XML attribute. +#[derive(Debug, Clone, PartialEq)] +pub struct Attribute<'a> +{ +    inner: QuickXMLAttribute<'a>, +} + +impl<'a> Attribute<'a> +{ +    /// Attribute key. +    #[must_use] +    pub fn key(&self) -> &[u8] +    { +        self.inner.key.as_ref() +    } + +    /// Attribute value. +    #[must_use] +    pub fn value(&self) -> &[u8] +    { +        &self.inner.value +    } +} + +// Crate-local functions +impl<'a> Attribute<'a> +{ +    pub(crate) fn from_inner(inner: QuickXMLAttribute<'a>) -> Self +    { +        Self { inner } +    } +} + +/// Errors that can be raised when parsing [`Attribute`]s. +/// +/// Recovery position in examples shows the position from which parsing of the +/// next attribute will be attempted. +#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum Error +{ +    /// Attribute key was not followed by `=`, position relative to the start of +    /// the owning tag is provided. +    /// +    /// Example of input that raises this error: +    /// ```xml +    /// <tag key another="attribute"/> +    /// <!--     ^~~ error position, recovery position (8) --> +    /// ``` +    #[error("Position {0}: attribute key must be directly followed by `=` or space")] +    ExpectedEq(usize), + +    /// Attribute value was not found after `=`, position relative to the start +    /// of the owning tag is provided. 
+    /// +    /// Example of input that raises this error: +    /// ```xml +    /// <tag key = /> +    /// <!--       ^~~ error position, recovery position (10) --> +    /// ``` +    /// +    /// This error can be returned only for the last attribute in the list, +    /// because otherwise any content after `=` will be treated as a value. +    /// The XML +    /// ```xml +    /// <tag key = another-key = "value"/> +    /// <!--                   ^ ^- recovery position (24) --> +    /// <!--                   '~~ error position (22) --> +    /// ``` +    /// +    /// will be treated as `Attribute { key = b"key", value = b"another-key" }` +    /// and either an [`Attribute`] is returned or [`Error::UnquotedValue`] is raised, +    /// depending on the parsing mode. +    #[error("Position {0}: `=` must be followed by an attribute value")] +    ExpectedValue(usize), + +    /// Attribute value is not quoted, position relative to the start of the +    /// owning tag is provided. +    /// +    /// Example of input that raises this error: +    /// ```xml +    /// <tag key = value /> +    /// <!--       ^    ^~~ recovery position (15) --> +    /// <!--       '~~ error position (10) --> +    /// ``` +    #[error("Position {0}: attribute value must be enclosed in `\"` or `'`")] +    UnquotedValue(usize), + +    /// Attribute value was not finished with a matching quote, position relative +    /// to the start of the owning tag and a quote is provided. That position is always +    /// the last character in the tag content. +    /// +    /// Example of input that raises this error: +    /// ```xml +    /// <tag key = "value  /> +    /// <tag key = 'value  /> +    /// <!--               ^~~ error position, recovery position (18) --> +    /// ``` +    /// +    /// This error can be returned only for the last attribute in the list, +    /// because all input was consumed during scanning for a quote. 
+    #[error("Position {0}: missing closing quote `{1}` in attribute value")] +    ExpectedQuote(usize, u8), + +    /// An attribute with the same name was already encountered. Two parameters +    /// define (1) the error position relative to the start of the owning tag +    /// for a new attribute and (2) the start position of a previously encountered +    /// attribute with the same name. +    /// +    /// Example of input that raises this error: +    /// ```xml +    /// <tag key = 'value'  key="value2" attr3='value3' /> +    /// <!-- ^              ^            ^~~ recovery position (32) --> +    /// <!-- |              '~~ error position (19) --> +    /// <!-- '~~ previous position (4) --> +    /// ``` +    #[error("Position {0}: duplicated attribute, previous declaration at position {1}")] +    Duplicated(usize, usize), +} + +impl From<AttrError> for Error +{ +    fn from(attr_err: AttrError) -> Self +    { +        match attr_err { +            AttrError::ExpectedEq(pos) => Self::ExpectedEq(pos), +            AttrError::ExpectedValue(pos) => Self::ExpectedValue(pos), +            AttrError::UnquotedValue(pos) => Self::UnquotedValue(pos), +            AttrError::ExpectedQuote(pos, quote) => Self::ExpectedQuote(pos, quote), +            AttrError::Duplicated(pos, same_attr_pos) => { +                Self::Duplicated(pos, same_attr_pos) +            } +        } +    } +} + +/// Iterates through [`Attribute`]s. 
+#[derive(Debug)] +pub struct Iter<'a> +{ +    attrs: Attributes<'a>, +} + +impl<'a> Iter<'a> +{ +    pub(crate) fn new(attrs: Attributes<'a>) -> Self +    { +        Self { attrs } +    } +} + +impl<'a> Iterator for Iter<'a> +{ +    type Item = Result<Attribute<'a>, Error>; + +    fn next(&mut self) -> Option<Self::Item> +    { +        let attr = self.attrs.next()?; + +        Some(attr.map(Attribute::from_inner).map_err(Into::into)) +    } +} diff --git a/src/deserializer/buffered.rs b/src/deserializer/buffered.rs new file mode 100644 index 0000000..7a6058b --- /dev/null +++ b/src/deserializer/buffered.rs @@ -0,0 +1,212 @@ +//! Buffered XML deserializer. +use std::convert::Infallible; +use std::io::BufRead; + +use quick_xml::events::Event; +use quick_xml::Reader; + +use crate::deserializer::{Deserializer, Error, IgnoreEnd}; +use crate::event::EventExt; +use crate::tagged::TagStart; +use crate::DeserializeTagged; + +/// XML deserializer using a source which has an internal buffer. +pub struct Buffered<Source: BufRead> +{ +    reader: Reader<Source>, +    leftover_event: Option<Event<'static>>, +    buf: Vec<u8>, +} + +impl<Source> Buffered<Source> +where +    Source: BufRead, +{ +    /// Returns a new [`Buffered`]. 
+    pub fn new(source: Source) -> Self +    { +        let mut reader = Reader::from_reader(source); + +        reader.trim_text(true); +        reader.expand_empty_elements(true); + +        Self { +            reader, +            leftover_event: None, +            buf: Vec::new(), +        } +    } +} + +impl<Source> Deserializer for Buffered<Source> +where +    Source: BufRead, +{ +    fn de_tag<De: DeserializeTagged>( +        &mut self, +        tag_name: &str, +        ignore_end: IgnoreEnd, +    ) -> Result<De, Error<De::Error>> +    { +        self.de_tag_with(tag_name, ignore_end, De::deserialize) +    } + +    fn de_tag_with<Output, Err, DeserializeFn>( +        &mut self, +        tag_name: &str, +        ignore_end: IgnoreEnd, +        deserialize: DeserializeFn, +    ) -> Result<Output, Error<Err>> +    where +        Err: std::error::Error + Send + Sync + 'static, +        DeserializeFn: FnOnce(&TagStart, &mut Self) -> Result<Output, Err>, +    { +        let deserialized = match self.read_event()? { +            Event::Start(start) if start.name().as_ref() == tag_name.as_bytes() => { +                deserialize(&TagStart::from_inner(start), self) +                    .map_err(Error::DeserializeFailed)? 
+            } +            event => { +                self.leftover_event = Some(event.clone().into_owned()); + +                return Err(Error::UnexpectedEvent { +                    expected_event_name: format!("start({tag_name})"), +                    found_event: event.describe().unwrap(), +                }); +            } +        }; + +        if let IgnoreEnd::No = ignore_end { +            self.read_end_event(tag_name.as_bytes()) +                .map_err(Error::into_with_de_error)?; +        } + +        Ok(deserialized) +    } + +    fn de_tag_list<De: DeserializeTagged>( +        &mut self, +        tag_name: Option<&str>, +    ) -> Result<Vec<De>, Error<De::Error>> +    { +        let mut deserialized_items = Vec::new(); + +        loop { +            let start = match self.read_event()? { +                Event::Start(start) +                    if tag_name.map_or_else( +                        || true, +                        |expected_tag_name| { +                            start.name().as_ref() == expected_tag_name.as_bytes() +                        }, +                    ) => +                { +                    TagStart::from_inner(start) +                } +                Event::Comment(_) => { +                    continue; +                } +                event => { +                    self.leftover_event = Some(event.into_owned()); +                    break; +                } +            }; + +            let deserialized = +                De::deserialize(&start, self).map_err(Error::DeserializeFailed)?; + +            self.read_end_event(start.name()) +                .map_err(Error::into_with_de_error)?; + +            deserialized_items.push(deserialized); +        } + +        Ok(deserialized_items) +    } + +    fn de_text(&mut self) -> Result<String, Error<Infallible>> +    { +        let text = match self.read_event::<Infallible>()? 
{ +            Event::Text(text) => Ok(text), +            event => { +                self.leftover_event = Some(event.clone().into_owned()); + +                Err(Error::<Infallible>::UnexpectedEvent { +                    expected_event_name: "text".to_string(), +                    found_event: event.describe().unwrap(), +                }) +            } +        }? +        .unescape() +        .map_err(|err| Error::<Infallible>::XMLError(err.into()))?; + +        Ok(text.to_string()) +    } + +    fn skip_to_tag_start(&mut self, tag_name: &str) -> Result<(), Error<Infallible>> +    { +        loop { +            match self.read_event::<Infallible>()? { +                Event::Start(start) if start.name().as_ref() == tag_name.as_bytes() => { +                    self.leftover_event = Some(Event::Start(start).into_owned()); + +                    break; +                } +                _ => {} +            } +        } + +        Ok(()) +    } + +    fn skip_to_tag_end(&mut self, tag_name: &str) -> Result<(), Error<Infallible>> +    { +        loop { +            match self.read_event::<Infallible>()? 
{ +                Event::End(end) if end.name().as_ref() == tag_name.as_bytes() => { +                    self.leftover_event = Some(Event::End(end).into_owned()); + +                    return Ok(()); +                } +                _ => {} +            } +        } +    } +} + +impl<Source> Buffered<Source> +where +    Source: BufRead, +{ +    fn read_end_event(&mut self, tag_name: &[u8]) -> Result<(), Error<Infallible>> +    { +        let event = self.read_event::<Infallible>()?; + +        if matches!(&event, Event::End(end) if end.name().as_ref() == tag_name) { +            return Ok(()); +        } + +        Err(Error::UnexpectedEvent { +            expected_event_name: "end".to_string(), +            found_event: event.describe().unwrap(), +        }) +    } + +    fn read_event<DeError>(&mut self) -> Result<Event<'static>, Error<DeError>> +    { +        let event = if let Some(leftover_event) = self.leftover_event.take() { +            leftover_event +        } else { +            self.reader +                .read_event_into(&mut self.buf) +                .map_err(|err| Error::<DeError>::XMLError(err.into()))? +                .into_owned() +        }; + +        if let Event::Eof = &event { +            return Err(Error::UnexpectedEndOfFile); +        } + +        Ok(event) +    } +} diff --git a/src/deserializer/mod.rs b/src/deserializer/mod.rs new file mode 100644 index 0000000..bd0c0e4 --- /dev/null +++ b/src/deserializer/mod.rs @@ -0,0 +1,179 @@ +//! Deserializer. +use std::convert::Infallible; + +use crate::tagged::TagStart; +use crate::DeserializeTagged; + +pub mod buffered; + +/// XML deserializer. +pub trait Deserializer +{ +    /// Deserializes a tagged element. +    /// +    /// # Errors +    /// Returns `Err` if deserialization fails. 
+    fn de_tag<De: DeserializeTagged>( +        &mut self, +        tag_name: &str, +        ignore_end: IgnoreEnd, +    ) -> Result<De, Error<De::Error>>; + +    /// Deserializes a tagged element using the given function. +    /// +    /// # Errors +    /// Returns `Err` if deserialization fails. +    fn de_tag_with<Output, Err, DeserializeFn>( +        &mut self, +        tag_name: &str, +        ignore_end: IgnoreEnd, +        deserialize: DeserializeFn, +    ) -> Result<Output, Error<Err>> +    where +        Err: std::error::Error + Send + Sync + 'static, +        DeserializeFn: FnOnce(&TagStart, &mut Self) -> Result<Output, Err>; + +    /// Deserializes a list of tagged elements. +    /// +    /// # Errors +    /// Returns `Err` if deserialization fails. +    fn de_tag_list<De: DeserializeTagged>( +        &mut self, +        tag_name: Option<&str>, +    ) -> Result<Vec<De>, Error<De::Error>>; + +    /// Deserializes a text element. +    /// +    /// # Errors +    /// Returns `Err` if deserialization fails. +    fn de_text(&mut self) -> Result<String, Error<Infallible>>; + +    /// Skips past all elements until a tagged element with the name `tag_name` is +    /// reached. +    /// +    /// # Errors +    /// Returns `Err` if unsuccessful. +    fn skip_to_tag_start(&mut self, tag_name: &str) -> Result<(), Error<Infallible>>; + +    /// Skips past all elements until the end of a tagged element with the name `tag_name` +    /// is reached. +    /// +    /// # Errors +    /// Returns `Err` if unsuccessful. +    fn skip_to_tag_end(&mut self, tag_name: &str) -> Result<(), Error<Infallible>>; +} + +/// Whether or not to skip the end tag of a tagged element. +/// +/// **Should be `No`**. +#[derive(Debug, Default)] +pub enum IgnoreEnd +{ +    /// Skip the end tag. +    /// +    /// **Will cause problems in most cases and should be used very carefully**. +    Yes, + +    /// Don't skip the end tag. +    #[default] +    No, +} + +/// [`Deserializer`] error. 
+#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum Error<DeError> +{ +    /// A XML error occurred. +    #[error("A XML error occurred")] +    XMLError(#[source] XMLError), + +    /// Failed to deserialize. +    #[error("Failed to deserialize")] +    DeserializeFailed(#[from] DeError), + +    /// Unexpected event. +    #[error("Expected {expected_event_name} event. Found {found_event}")] +    UnexpectedEvent +    { +        /// The name of the expected event. +        expected_event_name: String, + +        /// The found event. +        found_event: String, +    }, + +    /// Unexpected end of file. +    #[error("Unexpected end of file")] +    UnexpectedEndOfFile, +} + +impl<DeError> Error<DeError> +{ +    /// Returns `Self` with `DeError` as [`Infallible`]. +    /// +    /// # Panics +    /// Will panic if `Self` is the `DeserializeFailed` variant. +    pub fn into_never_de_err(self) -> Error<Infallible> +    { +        match self { +            Self::XMLError(xml_err) => Error::XMLError(xml_err), +            Self::DeserializeFailed(_) => { +                panic!("is a deserialization error"); +            } +            Self::UnexpectedEvent { +                expected_event_name, +                found_event, +            } => Error::UnexpectedEvent { +                expected_event_name, +                found_event, +            }, +            Self::UnexpectedEndOfFile => Error::UnexpectedEndOfFile, +        } +    } +} + +impl Error<Infallible> +{ +    fn into_with_de_error<DeError>(self) -> Error<DeError> +    { +        match self { +            Self::XMLError(xml_err) => Error::XMLError(xml_err), +            Self::DeserializeFailed(_) => { +                unreachable!(); +            } +            Self::UnexpectedEvent { +                expected_event_name, +                found_event, +            } => Error::UnexpectedEvent { +                expected_event_name, +                found_event, +            }, +            
Self::UnexpectedEndOfFile => Error::UnexpectedEndOfFile, +        } +    } +} + +impl From<Error<Error<Infallible>>> for Error<Infallible> +{ +    fn from(err: Error<Error<Infallible>>) -> Self +    { +        match err { +            Error::XMLError(xml_err) => Self::XMLError(xml_err), +            Error::DeserializeFailed(de_err) => de_err, +            Error::UnexpectedEvent { +                expected_event_name, +                found_event, +            } => Self::UnexpectedEvent { +                expected_event_name, +                found_event, +            }, +            Error::UnexpectedEndOfFile => Self::UnexpectedEndOfFile, +        } +    } +} + +/// XML error. +#[derive(Debug, thiserror::Error)] +#[error(transparent)] +pub struct XMLError(#[from] quick_xml::Error); diff --git a/src/event.rs b/src/event.rs new file mode 100644 index 0000000..ae0624d --- /dev/null +++ b/src/event.rs @@ -0,0 +1,54 @@ +use std::str::Utf8Error; + +use quick_xml::events::Event; + +#[allow(clippy::module_name_repetitions)] +pub trait EventExt +{ +    fn describe(&self) -> Result<String, Utf8Error>; +} + +impl<'a> EventExt for Event<'a> +{ +    fn describe(&self) -> Result<String, Utf8Error> +    { +        Ok(match self { +            Event::Start(start) => { +                format!( +                    "tag start with name \"{}\"", +                    std::str::from_utf8(start.name().as_ref())? +                ) +            } +            Event::End(end) => { +                format!( +                    "tag end with name \"{}\"", +                    std::str::from_utf8(end.name().as_ref())? +                ) +            } +            Event::Empty(start) => { +                format!( +                    "empty tag with name \"{}\"", +                    std::str::from_utf8(start.name().as_ref())? +                ) +            } +            Event::Text(text) => { +                format!("text \"{}\"", std::str::from_utf8(text)?) 
+            } +            Event::Comment(comment) => { +                format!("comment \"{}\"", std::str::from_utf8(comment)?) +            } +            Event::CData(cdata) => { +                format!("cdata \"{}\"", std::str::from_utf8(cdata)?) +            } +            Event::Decl(_) => "XML declaration".to_string(), +            Event::PI(processing_instruction) => { +                format!( +                    "processing instruction \"{}\"", +                    std::str::from_utf8(processing_instruction)? +                ) +            } +            Event::DocType(_) => "doctype".to_string(), +            Event::Eof => "end of file".to_string(), +        }) +    } +} @@ -1 +1,55 @@ +//! XML is awful. +#![deny(clippy::all, clippy::pedantic, unsafe_code, missing_docs)] +use crate::deserializer::{Deserializer, Error as DeserializerError}; +use crate::tagged::TagStart; +pub mod attribute; +pub mod deserializer; +pub mod tagged; + +mod event; + +/// Trait implemented by types that want to be deserializable from tagged XML elements. +pub trait DeserializeTagged: Sized +{ +    /// Error type. +    type Error: std::error::Error + Send + Sync + 'static; + +    /// Deserializes into a new `Self`. +    /// +    /// # Errors +    /// Whether and when an error is returned is decided by the type implementing this trait. +    fn deserialize<TDeserializer: Deserializer>( +        start: &TagStart, +        deserializer: &mut TDeserializer, +    ) -> Result<Self, Self::Error>; +} + +/// Result extension. +pub trait ResultExt<Value, DeError> +{ +    /// Returns `Ok(None)` if `Err` is `DeserializerError::UnexpectedEvent`. 
+    fn try_event(self) -> Result<Option<Value>, DeserializerError<DeError>>; +} + +impl<Value, DeError> ResultExt<Value, DeError> +    for Result<Value, DeserializerError<DeError>> +{ +    fn try_event(self) -> Result<Option<Value>, DeserializerError<DeError>> +    { +        self.map_or_else( +            |err| { +                if let DeserializerError::UnexpectedEvent { +                    expected_event_name: _, +                    found_event: _, +                } = err +                { +                    return Ok(None); +                } + +                Err(err) +            }, +            |value| Ok(Some(value)), +        ) +    } +} diff --git a/src/tagged.rs b/src/tagged.rs new file mode 100644 index 0000000..19ae03b --- /dev/null +++ b/src/tagged.rs @@ -0,0 +1,62 @@ +//! Tagged element. + +use std::borrow::Cow; +use std::str::Utf8Error; + +use quick_xml::events::BytesStart; + +use crate::attribute::Iter as AttributeIter; + +/// The start tag of a tagged element. +/// +/// The `<xyz foo="bar">` in `<xyz foo="bar">Hello</xyz>` +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct TagStart<'a> +{ +    inner: BytesStart<'a>, +} + +impl<'a> TagStart<'a> +{ +    /// Returns a new `TagStart`. +    pub fn new(name: impl Into<Cow<'a, str>>) -> Self +    { +        Self { +            inner: BytesStart::new(name), +        } +    } + +    /// Returns the tag name. +    #[must_use] +    pub fn name(&self) -> &[u8] +    { +        let name_length = self.inner.name().as_ref().len(); + +        &self.inner.as_ref()[..name_length] +    } + +    /// Returns the tag name in UTF-8. +    /// +    /// # Errors +    /// Returns `Err` if the name is not valid UTF-8. +    pub fn name_utf8(&self) -> Result<&str, Utf8Error> +    { +        std::str::from_utf8(self.name()) +    } + +    /// Returns the tag attributes. 
+    #[must_use] +    pub fn attributes(&'a self) -> AttributeIter<'a> +    { +        AttributeIter::new(self.inner.attributes()) +    } +} + +// Crate-local functions +impl<'a> TagStart<'a> +{ +    pub(crate) fn from_inner(inner: BytesStart<'a>) -> Self +    { +        Self { inner } +    } +} | 
