summary refs log tree commit diff
path: root/src/encoding.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/encoding.rs')
-rw-r--r-- src/encoding.rs 72
1 files changed, 71 insertions, 1 deletions
diff --git a/src/encoding.rs b/src/encoding.rs
index 7d5326e..08ffb39 100644
--- a/src/encoding.rs
+++ b/src/encoding.rs
@@ -1,7 +1,11 @@
 #![forbid(unsafe_code)]
+use crate::types::*;
+use smol::prelude::*;
 
+use smol::io::BoxedReader;
 
-#[derive(Clone, Copy, Debug)]
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub enum UTF8ByteType {
   Single,
   Introducer(u8),
@@ -26,3 +30,69 @@ pub fn get_utf8_byte_type(b: u8) -> UTF8ByteType {
   }
 }
 
+
+/// Reads the next well-formed UTF-8 character from `input`, one byte at a time.
+///
+/// Bytes that cannot start a character, sequences cut short by a
+/// non-continuation byte, and sequences that fail UTF-8 validation are
+/// silently skipped. A byte that interrupts a multi-byte sequence is
+/// re-examined as the possible start of the next character, not dropped.
+///
+/// # Errors
+///
+/// Returns any error produced by `read_exact` on `input`; no character is
+/// returned in that case, even if part of a sequence was already read.
+pub async fn read_utf8_char(input: &mut BoxedReader) -> Result<char> {
+  let mut buf = [0u8; 4];
+  let mut unread_byte: Option<u8> = None;
+
+  'next_char: loop {
+    /* Reuse the byte that cut the previous sequence short, if there is one. */
+    if let Some(byte) = unread_byte.take() {
+      buf[0] = byte;
+    } else {
+      input.read_exact(&mut buf[0 .. 1]).await?;
+    }
+
+    /* Total length in bytes of the sequence introduced by buf[0]. */
+    let len = match get_utf8_byte_type(buf[0]) {
+      UTF8ByteType::Single => 1,
+      UTF8ByteType::Introducer(n @ 2 ..= 4) => usize::from(n),
+
+      /* If it's not the start of a valid character, ignore it. */
+      _ => continue,
+    };
+
+    for i in 1 .. len {
+      input.read_exact(&mut buf[i .. i + 1]).await?;
+      if get_utf8_byte_type(buf[i]) != UTF8ByteType::Continuation {
+        unread_byte = Some(buf[i]);
+        continue 'next_char;
+      }
+    }
+
+    /* Decode only the bytes of this sequence: the rest of `buf` may hold
+     * stale bytes from an abandoned sequence, which would make validation
+     * of the whole buffer fail and drop a perfectly valid character. */
+    if let Ok(string) = std::str::from_utf8(&buf[.. len])
+       && let Some(c) = string.chars().next()
+    {
+      return Ok(c);
+    }
+  }
+}