From e762babd9e69400ccd178ba8946168640093eb63 Mon Sep 17 00:00:00 2001 From: HampusM Date: Sat, 15 Apr 2023 18:26:29 +0200 Subject: feat: add deserialization --- src/attribute.rs | 170 ++++++++++++++++++++++++++++++++++ src/deserializer/buffered.rs | 212 +++++++++++++++++++++++++++++++++++++++++++ src/deserializer/mod.rs | 179 ++++++++++++++++++++++++++++++++++++ src/event.rs | 54 +++++++++++ src/lib.rs | 54 +++++++++++ src/tagged.rs | 62 +++++++++++++ 6 files changed, 731 insertions(+) create mode 100644 src/attribute.rs create mode 100644 src/deserializer/buffered.rs create mode 100644 src/deserializer/mod.rs create mode 100644 src/event.rs create mode 100644 src/tagged.rs (limited to 'src') diff --git a/src/attribute.rs b/src/attribute.rs new file mode 100644 index 0000000..8fb4778 --- /dev/null +++ b/src/attribute.rs @@ -0,0 +1,170 @@ +//! Attribute. + +use quick_xml::events::attributes::{ + AttrError, + Attribute as QuickXMLAttribute, + Attributes, +}; + +/// Represent a XML attribute. +#[derive(Debug, Clone, PartialEq)] +pub struct Attribute<'a> +{ + inner: QuickXMLAttribute<'a>, +} + +impl<'a> Attribute<'a> +{ + /// Attribute key. + #[must_use] + pub fn key(&self) -> &[u8] + { + self.inner.key.as_ref() + } + + /// Attribute value. + #[must_use] + pub fn value(&self) -> &[u8] + { + &self.inner.value + } +} + +// Crate-local functions +impl<'a> Attribute<'a> +{ + pub(crate) fn from_inner(inner: QuickXMLAttribute<'a>) -> Self + { + Self { inner } + } +} + +/// Errors that can be raised when parsing [`Attribute`]s. +/// +/// Recovery position in examples shows the position from which parsing of the +/// next attribute will be attempted. +#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum Error +{ + /// Attribute key was not followed by `=`, position relative to the start of + /// the owning tag is provided. 
+ /// + /// Example of input that raises this error: + /// ```xml + /// + /// + /// ``` + #[error("Position {0}: attribute key must be directly followed by `=` or space")] + ExpectedEq(usize), + + /// Attribute value was not found after `=`, position relative to the start + /// of the owning tag is provided. + /// + /// Example of input that raises this error: + /// ```xml + /// + /// + /// ``` + /// + /// This error can be returned only for the last attribute in the list, + /// because otherwise any content after `=` will be treated as a value. + /// The XML + /// ```xml + /// + /// + /// + /// ``` + /// + /// will be treated as `Attribute { key = b"key", value = b"another-key" }` + /// and either an [`Attribute`] is returned or [`Error::UnquotedValue`] is raised, + /// depending on the parsing mode. + #[error("Position {0}: `=` must be followed by an attribute value")] + ExpectedValue(usize), + + /// Attribute value is not quoted, position relative to the start of the + /// owning tag is provided. + /// + /// Example of input that raises this error: + /// ```xml + /// + /// + /// + /// ``` + #[error("Position {0}: attribute value must be enclosed in `\"` or `'`")] + UnquotedValue(usize), + + /// Attribute value was not finished with a matching quote, position relative + /// to the start of the owning tag and a quote is provided. That position is always + /// the last character in the tag content. 
+ /// + /// Example of input that raises this error: + /// ```xml + /// + /// + /// + /// + /// ``` + #[error("Position {0}: duplicated attribute, previous declaration at position {1}")] + Duplicated(usize, usize), +} + +impl From for Error +{ + fn from(attr_err: AttrError) -> Self + { + match attr_err { + AttrError::ExpectedEq(pos) => Self::ExpectedEq(pos), + AttrError::ExpectedValue(pos) => Self::ExpectedValue(pos), + AttrError::UnquotedValue(pos) => Self::UnquotedValue(pos), + AttrError::ExpectedQuote(pos, quote) => Self::ExpectedQuote(pos, quote), + AttrError::Duplicated(pos, same_attr_pos) => { + Self::Duplicated(pos, same_attr_pos) + } + } + } +} + +/// Iterates through [`Attribute`]s. +#[derive(Debug)] +pub struct Iter<'a> +{ + attrs: Attributes<'a>, +} + +impl<'a> Iter<'a> +{ + pub(crate) fn new(attrs: Attributes<'a>) -> Self + { + Self { attrs } + } +} + +impl<'a> Iterator for Iter<'a> +{ + type Item = Result, Error>; + + fn next(&mut self) -> Option + { + let attr = self.attrs.next()?; + + Some(attr.map(Attribute::from_inner).map_err(Into::into)) + } +} diff --git a/src/deserializer/buffered.rs b/src/deserializer/buffered.rs new file mode 100644 index 0000000..7a6058b --- /dev/null +++ b/src/deserializer/buffered.rs @@ -0,0 +1,212 @@ +//! Buffered XML deserializer. +use std::convert::Infallible; +use std::io::BufRead; + +use quick_xml::events::Event; +use quick_xml::Reader; + +use crate::deserializer::{Deserializer, Error, IgnoreEnd}; +use crate::event::EventExt; +use crate::tagged::TagStart; +use crate::DeserializeTagged; + +/// XML deserializer using a source which has an internal buffer. +pub struct Buffered +{ + reader: Reader, + leftover_event: Option>, + buf: Vec, +} + +impl Buffered +where + Source: BufRead, +{ + /// Returns a new [`Buffered`]. 
+ pub fn new(source: Source) -> Self + { + let mut reader = Reader::from_reader(source); + + reader.trim_text(true); + reader.expand_empty_elements(true); + + Self { + reader, + leftover_event: None, + buf: Vec::new(), + } + } +} + +impl Deserializer for Buffered +where + Source: BufRead, +{ + fn de_tag( + &mut self, + tag_name: &str, + ignore_end: IgnoreEnd, + ) -> Result> + { + self.de_tag_with(tag_name, ignore_end, De::deserialize) + } + + fn de_tag_with( + &mut self, + tag_name: &str, + ignore_end: IgnoreEnd, + deserialize: DeserializeFn, + ) -> Result> + where + Err: std::error::Error + Send + Sync + 'static, + DeserializeFn: FnOnce(&TagStart, &mut Self) -> Result, + { + let deserialized = match self.read_event()? { + Event::Start(start) if start.name().as_ref() == tag_name.as_bytes() => { + deserialize(&TagStart::from_inner(start), self) + .map_err(Error::DeserializeFailed)? + } + event => { + self.leftover_event = Some(event.clone().into_owned()); + + return Err(Error::UnexpectedEvent { + expected_event_name: format!("start({tag_name})"), + found_event: event.describe().unwrap(), + }); + } + }; + + if let IgnoreEnd::No = ignore_end { + self.read_end_event(tag_name.as_bytes()) + .map_err(Error::into_with_de_error)?; + } + + Ok(deserialized) + } + + fn de_tag_list( + &mut self, + tag_name: Option<&str>, + ) -> Result, Error> + { + let mut deserialized_items = Vec::new(); + + loop { + let start = match self.read_event()? 
{ + Event::Start(start) + if tag_name.map_or_else( + || true, + |expected_tag_name| { + start.name().as_ref() == expected_tag_name.as_bytes() + }, + ) => + { + TagStart::from_inner(start) + } + Event::Comment(_) => { + continue; + } + event => { + self.leftover_event = Some(event.into_owned()); + break; + } + }; + + let deserialized = + De::deserialize(&start, self).map_err(Error::DeserializeFailed)?; + + self.read_end_event(start.name()) + .map_err(Error::into_with_de_error)?; + + deserialized_items.push(deserialized); + } + + Ok(deserialized_items) + } + + fn de_text(&mut self) -> Result> + { + let text = match self.read_event::()? { + Event::Text(text) => Ok(text), + event => { + self.leftover_event = Some(event.clone().into_owned()); + + Err(Error::::UnexpectedEvent { + expected_event_name: "text".to_string(), + found_event: event.describe().unwrap(), + }) + } + }? + .unescape() + .map_err(|err| Error::::XMLError(err.into()))?; + + Ok(text.to_string()) + } + + fn skip_to_tag_start(&mut self, tag_name: &str) -> Result<(), Error> + { + loop { + match self.read_event::()? { + Event::Start(start) if start.name().as_ref() == tag_name.as_bytes() => { + self.leftover_event = Some(Event::Start(start).into_owned()); + + break; + } + _ => {} + } + } + + Ok(()) + } + + fn skip_to_tag_end(&mut self, tag_name: &str) -> Result<(), Error> + { + loop { + match self.read_event::()? 
{ + Event::End(end) if end.name().as_ref() == tag_name.as_bytes() => { + self.leftover_event = Some(Event::End(end).into_owned()); + + return Ok(()); + } + _ => {} + } + } + } +} + +impl Buffered +where + Source: BufRead, +{ + fn read_end_event(&mut self, tag_name: &[u8]) -> Result<(), Error> + { + let event = self.read_event::()?; + + if matches!(&event, Event::End(end) if end.name().as_ref() == tag_name) { + return Ok(()); + } + + Err(Error::UnexpectedEvent { + expected_event_name: "end".to_string(), + found_event: event.describe().unwrap(), + }) + } + + fn read_event(&mut self) -> Result, Error> + { + let event = if let Some(leftover_event) = self.leftover_event.take() { + leftover_event + } else { + self.reader + .read_event_into(&mut self.buf) + .map_err(|err| Error::::XMLError(err.into()))? + .into_owned() + }; + + if let Event::Eof = &event { + return Err(Error::UnexpectedEndOfFile); + } + + Ok(event) + } +} diff --git a/src/deserializer/mod.rs b/src/deserializer/mod.rs new file mode 100644 index 0000000..bd0c0e4 --- /dev/null +++ b/src/deserializer/mod.rs @@ -0,0 +1,179 @@ +//! Deserializer. +use std::convert::Infallible; + +use crate::tagged::TagStart; +use crate::DeserializeTagged; + +pub mod buffered; + +/// XML deserializer. +pub trait Deserializer +{ + /// Deserializes a tagged element. + /// + /// # Errors + /// Returns `Err` if deserialization fails. + fn de_tag( + &mut self, + tag_name: &str, + ignore_end: IgnoreEnd, + ) -> Result>; + + /// Deserializes a tagged element using the given function. + /// + /// # Errors + /// Returns `Err` if deserialization fails. + fn de_tag_with( + &mut self, + tag_name: &str, + ignore_end: IgnoreEnd, + deserialize: DeserializeFn, + ) -> Result> + where + Err: std::error::Error + Send + Sync + 'static, + DeserializeFn: FnOnce(&TagStart, &mut Self) -> Result; + + /// Deserializes a list of tagged elements. + /// + /// # Errors + /// Returns `Err` if deserialization fails. 
+ fn de_tag_list( + &mut self, + tag_name: Option<&str>, + ) -> Result, Error>; + + /// Deserializes a text element. + /// + /// # Errors + /// Returns `Err` if deserialization fails. + fn de_text(&mut self) -> Result>; + + /// Skips past all elements until a tagged element with the name `tag_name` is + /// reached. + /// + /// # Errors + /// Returns `Err` if unsuccessful. + fn skip_to_tag_start(&mut self, tag_name: &str) -> Result<(), Error>; + + /// Skips past all elements until the end of a tagged element with the name `tag_name` + /// is reached. + /// + /// # Errors + /// Returns `Err` if unsuccessful. + fn skip_to_tag_end(&mut self, tag_name: &str) -> Result<(), Error>; +} + +/// Whether or not to skip the end tag of a tagged element. +/// +/// **Should be `No`**. +#[derive(Debug, Default)] +pub enum IgnoreEnd +{ + /// Skip the end tag. + /// + /// **Will cause problems in most cases and should be used very carefully**. + Yes, + + /// Don't skip the end tag. + #[default] + No, +} + +/// [`Deserializer`] error. +#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum Error +{ + /// A XML error occurred. + #[error("A XML error occurred")] + XMLError(#[source] XMLError), + + /// Failed to deserialize. + #[error("Failed to deserialize")] + DeserializeFailed(#[from] DeError), + + /// Unexpected event. + #[error("Expected {expected_event_name} event. Found {found_event}")] + UnexpectedEvent + { + /// The name of the expected event. + expected_event_name: String, + + /// The found event. + found_event: String, + }, + + /// Unexpected end of file. + #[error("Unexpected end of file")] + UnexpectedEndOfFile, +} + +impl Error +{ + /// Returns `Self` with `DeError` as [`Infallible`]. + /// + /// # Panics + /// Will panic if `Self` is the `DeserializeFailed` variant. 
+ pub fn into_never_de_err(self) -> Error + { + match self { + Self::XMLError(xml_err) => Error::XMLError(xml_err), + Self::DeserializeFailed(_) => { + panic!("is a deserialization error"); + } + Self::UnexpectedEvent { + expected_event_name, + found_event, + } => Error::UnexpectedEvent { + expected_event_name, + found_event, + }, + Self::UnexpectedEndOfFile => Error::UnexpectedEndOfFile, + } + } +} + +impl Error +{ + fn into_with_de_error(self) -> Error + { + match self { + Self::XMLError(xml_err) => Error::XMLError(xml_err), + Self::DeserializeFailed(_) => { + unreachable!(); + } + Self::UnexpectedEvent { + expected_event_name, + found_event, + } => Error::UnexpectedEvent { + expected_event_name, + found_event, + }, + Self::UnexpectedEndOfFile => Error::UnexpectedEndOfFile, + } + } +} + +impl From>> for Error +{ + fn from(err: Error>) -> Self + { + match err { + Error::XMLError(xml_err) => Self::XMLError(xml_err), + Error::DeserializeFailed(de_err) => de_err, + Error::UnexpectedEvent { + expected_event_name, + found_event, + } => Self::UnexpectedEvent { + expected_event_name, + found_event, + }, + Error::UnexpectedEndOfFile => Self::UnexpectedEndOfFile, + } + } +} + +/// XML error. +#[derive(Debug, thiserror::Error)] +#[error(transparent)] +pub struct XMLError(#[from] quick_xml::Error); diff --git a/src/event.rs b/src/event.rs new file mode 100644 index 0000000..ae0624d --- /dev/null +++ b/src/event.rs @@ -0,0 +1,54 @@ +use std::str::Utf8Error; + +use quick_xml::events::Event; + +#[allow(clippy::module_name_repetitions)] +pub trait EventExt +{ + fn describe(&self) -> Result; +} + +impl<'a> EventExt for Event<'a> +{ + fn describe(&self) -> Result + { + Ok(match self { + Event::Start(start) => { + format!( + "tag start with name \"{}\"", + std::str::from_utf8(start.name().as_ref())? + ) + } + Event::End(end) => { + format!( + "tag end with name \"{}\"", + std::str::from_utf8(end.name().as_ref())? 
+ ) + } + Event::Empty(start) => { + format!( + "empty tag with name \"{}\"", + std::str::from_utf8(start.name().as_ref())? + ) + } + Event::Text(text) => { + format!("text \"{}\"", std::str::from_utf8(text)?) + } + Event::Comment(comment) => { + format!("comment \"{}\"", std::str::from_utf8(comment)?) + } + Event::CData(cdata) => { + format!("cdata \"{}\"", std::str::from_utf8(cdata)?) + } + Event::Decl(_) => "XML declaration".to_string(), + Event::PI(processing_instruction) => { + format!( + "processing instruction \"{}\"", + std::str::from_utf8(processing_instruction)? + ) + } + Event::DocType(_) => "doctype".to_string(), + Event::Eof => "end of file".to_string(), + }) + } +} diff --git a/src/lib.rs b/src/lib.rs index 8b13789..e5086bc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1 +1,55 @@ +//! XML is awful. +#![deny(clippy::all, clippy::pedantic, unsafe_code, missing_docs)] +use crate::deserializer::{Deserializer, Error as DeserializerError}; +use crate::tagged::TagStart; +pub mod attribute; +pub mod deserializer; +pub mod tagged; + +mod event; + +/// Trait implemented by types that want to be deserializable from tagged XML elements. +pub trait DeserializeTagged: Sized +{ + /// Error type. + type Error: std::error::Error + Send + Sync + 'static; + + /// Deserializes into a new `Self`. + /// + /// # Errors + /// When or if a error is returned is decided by the type implementing this trait. + fn deserialize( + start: &TagStart, + deserializer: &mut TDeserializer, + ) -> Result; +} + +/// Result extension. +pub trait ResultExt +{ + /// Returns `Ok(None)` if `Err` is `DeserializerError::UnexpectedEvent`. 
+ fn try_event(self) -> Result, DeserializerError>; +} + +impl ResultExt + for Result> +{ + fn try_event(self) -> Result, DeserializerError> + { + self.map_or_else( + |err| { + if let DeserializerError::UnexpectedEvent { + expected_event_name: _, + found_event: _, + } = err + { + return Ok(None); + } + + Err(err) + }, + |value| Ok(Some(value)), + ) + } +} diff --git a/src/tagged.rs b/src/tagged.rs new file mode 100644 index 0000000..19ae03b --- /dev/null +++ b/src/tagged.rs @@ -0,0 +1,62 @@ +//! Tagged element. + +use std::borrow::Cow; +use std::str::Utf8Error; + +use quick_xml::events::BytesStart; + +use crate::attribute::Iter as AttributeIter; + +/// The start tag of a tagged element. +/// +/// The `` in `Hello` +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct TagStart<'a> +{ + inner: BytesStart<'a>, +} + +impl<'a> TagStart<'a> +{ + /// Returns a new `TagStart`. + pub fn new(name: impl Into>) -> Self + { + Self { + inner: BytesStart::new(name), + } + } + + /// Returns the tag name. + #[must_use] + pub fn name(&self) -> &[u8] + { + let name_length = self.inner.name().as_ref().len(); + + &self.inner.as_ref()[..name_length] + } + + /// Returns the tag name in UTF-8. + /// + /// # Errors + /// Returns `Err` if the name is not valid UTF-8. + pub fn name_utf8(&self) -> Result<&str, Utf8Error> + { + std::str::from_utf8(self.name()) + } + + /// Returns the tag attributes. + #[must_use] + pub fn attributes(&'a self) -> AttributeIter<'a> + { + AttributeIter::new(self.inner.attributes()) + } +} + +// Crate-local functions +impl<'a> TagStart<'a> +{ + pub(crate) fn from_inner(inner: BytesStart<'a>) -> Self + { + Self { inner } + } +} -- cgit v1.2.3-18-g5258