summary refs log tree commit diff
path: root/src/encoding.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/encoding.rs')
-rw-r--r-- src/encoding.rs 43
1 files changed, 36 insertions, 7 deletions
diff --git a/src/encoding.rs b/src/encoding.rs
index 08ffb39..ccd2031 100644
--- a/src/encoding.rs
+++ b/src/encoding.rs
@@ -2,7 +2,13 @@
 use crate::types::*;
 use smol::prelude::*;
 
-use smol::io::BoxedReader;
+
+#[derive(Debug)]
+pub struct Decode {
+  pub c: char,
+  pub skipped_bytes: usize,
+  pub found_bytes: usize,
+}
 
 
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
@@ -31,9 +37,12 @@ pub fn get_utf8_byte_type(b: u8) -> UTF8ByteType {
 }
 
 
-pub async fn read_utf8_char(input: &mut BoxedReader) -> Result<char> {
+pub async fn read_utf8_char(input: &mut (impl AsyncRead + Unpin))
+    -> Result<Decode>
+{
   let mut buf = vec![0; 4];
   let mut unread_byte: Option<u8> = None;
+  let mut skipped_bytes = 0;
 
   loop {
     if let Some(byte) = unread_byte {
@@ -43,56 +52,76 @@ pub async fn read_utf8_char(input: &mut BoxedReader) -> Result<char> {
       input.read_exact(&mut buf[0 .. 1]).await?;
     }
 
-    match get_utf8_byte_type(buf[0]) {
-      UTF8ByteType::Single => { },
+    let found_bytes = match get_utf8_byte_type(buf[0]) {
+      UTF8ByteType::Single => {
+        1
+      },
+
       UTF8ByteType::Introducer(2) => {
         input.read_exact(&mut buf[1 .. 2]).await?;
         if get_utf8_byte_type(buf[1]) != UTF8ByteType::Continuation {
           unread_byte = Some(buf[1]);
+          skipped_bytes += 1;
           continue;
         }
+
+        2
       },
+
       UTF8ByteType::Introducer(3) => {
         input.read_exact(&mut buf[1 .. 2]).await?;
         if get_utf8_byte_type(buf[1]) != UTF8ByteType::Continuation {
           unread_byte = Some(buf[1]);
+          skipped_bytes += 1;
           continue;
         }
 
         input.read_exact(&mut buf[2 .. 3]).await?;
         if get_utf8_byte_type(buf[2]) != UTF8ByteType::Continuation {
           unread_byte = Some(buf[2]);
+          skipped_bytes += 2;
           continue;
         }
+
+        3
       },
+
       UTF8ByteType::Introducer(4) => {
         input.read_exact(&mut buf[1 .. 2]).await?;
         if get_utf8_byte_type(buf[1]) != UTF8ByteType::Continuation {
           unread_byte = Some(buf[1]);
+          skipped_bytes += 1;
           continue;
         }
 
         input.read_exact(&mut buf[2 .. 3]).await?;
         if get_utf8_byte_type(buf[2]) != UTF8ByteType::Continuation {
           unread_byte = Some(buf[2]);
+          skipped_bytes += 2;
           continue;
         }
 
         input.read_exact(&mut buf[3 .. 4]).await?;
         if get_utf8_byte_type(buf[3]) != UTF8ByteType::Continuation {
           unread_byte = Some(buf[3]);
+          skipped_bytes += 3;
           continue;
         }
+
+        4
       },
 
       /* If it's not the start of a valid character, ignore it. */
-      _ => continue,
-    }
+      _ => {
+        skipped_bytes += 1;
+        continue;
+      }
+    };
 
     if let Ok(string) = std::str::from_utf8(&buf)
        && let Some(c) = string.chars().next()
     {
-      return Ok(c);
+      return Ok(Decode { c, skipped_bytes, found_bytes });
     }
   }
 }