summary refs log tree commit diff
path: root/src/encoding.rs
diff options
context:
space:
mode:
authorIrene Knapp <ireneista@irenes.space>2026-03-27 08:52:52 -0700
committerIrene Knapp <ireneista@irenes.space>2026-03-27 08:52:52 -0700
commit916bce453c48f10d42eb3744aa4c62d8ca2c4c69 (patch)
tree6984f689fb0a57d9e5c65e6c3ddb48e4d6aac778 /src/encoding.rs
parent3f3d62639b3160bd9ea7dc2c5ec6a53b3e9e11bc (diff)
deal with broken UTF8 even better
now it should self-synchronize properly if there's something really weird happening

also, that code is all refactored into encoding.rs

Force-Push: yes
Change-Id: I8bd9682448fc309b7aa6c0513e9b94cb5a4ace11
Diffstat (limited to 'src/encoding.rs')
-rw-r--r--src/encoding.rs72
1 files changed, 71 insertions, 1 deletions
diff --git a/src/encoding.rs b/src/encoding.rs
index 7d5326e..08ffb39 100644
--- a/src/encoding.rs
+++ b/src/encoding.rs
@@ -1,7 +1,11 @@
 #![forbid(unsafe_code)]
+use crate::types::*;
+use smol::prelude::*;
 
+use smol::io::BoxedReader;
 
-#[derive(Clone, Copy, Debug)]
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub enum UTF8ByteType {
   Single,
   Introducer(u8),
@@ -26,3 +30,69 @@ pub fn get_utf8_byte_type(b: u8) -> UTF8ByteType {
   }
 }
 
+
+/// Reads the next character from `input`, decoding it as UTF-8.
+///
+/// Malformed input is skipped instead of being reported: a byte that cannot
+/// start a character is dropped, and when an expected continuation byte is
+/// missing, the byte found in its place is treated as the possible start of
+/// the next character. Sequences that `std::str::from_utf8` rejects are
+/// dropped as well.
+///
+/// # Errors
+///
+/// Any error from `read_exact` on `input` is propagated to the caller.
+pub async fn read_utf8_char(input: &mut BoxedReader) -> Result<char> {
+  let mut buf = [0u8; 4];
+  let mut unread_byte: Option<u8> = None;
+
+  'resync: loop {
+    // Resume from the byte that ended a broken sequence, if there is one;
+    // otherwise pull a fresh byte from the input.
+    if let Some(byte) = unread_byte.take() {
+      buf[0] = byte;
+    } else {
+      input.read_exact(&mut buf[0 .. 1]).await?;
+    }
+
+    // Total length in bytes of the sequence that buf[0] announces.
+    let len = match get_utf8_byte_type(buf[0]) {
+      UTF8ByteType::Single => 1,
+      UTF8ByteType::Introducer(n @ 2 ..= 4) => usize::from(n),
+
+      /* If it's not the start of a valid character, ignore it. */
+      _ => continue,
+    };
+
+    for i in 1 .. len {
+      input.read_exact(&mut buf[i .. i + 1]).await?;
+      if get_utf8_byte_type(buf[i]) != UTF8ByteType::Continuation {
+        // Not part of this sequence; it may begin the next character, so
+        // re-examine it instead of discarding it.
+        unread_byte = Some(buf[i]);
+        continue 'resync;
+      }
+    }
+
+    // Validate only the bytes of this sequence. The tail of `buf` can still
+    // hold bytes from an earlier, abandoned sequence; decoding the whole
+    // buffer would reject (and so silently drop) a valid character that
+    // follows broken input.
+    if let Ok(string) = std::str::from_utf8(&buf[.. len])
+       && let Some(c) = string.chars().next()
+    {
+      return Ok(c);
+    }
+  }
+}