fully implement the first-nonblank-column thing for H and L

surprisingly intricate, but although it looks messy now, this approach will clean up nicely

Force-Push: yes
Change-Id: Ic9c90982787a58110ec0a189844742a1e6c2216f
author: Irene Knapp <ireneista@irenes.space> 2026-03-27 16:59:00 -0700
committer: Irene Knapp <ireneista@irenes.space> 2026-03-27 16:59:00 -0700
commit: a80f9a1b97e1be194cb91a3b78717b0824d3bce8 (patch)
tree: a7481653125ce36b7d15a80ee763e87642ae053b /src/encoding.rs
parent: 8d0a78e708dd46aec40d3a06459c86d9c10f1e3b (diff)
1 files changed, 36 insertions, 7 deletions
diff --git a/src/encoding.rs b/src/encoding.rs
index 08ffb39..ccd2031 100644
--- a/src/encoding.rs
+++ b/src/encoding.rs
@@ -2,7 +2,13 @@
 use crate::types::*;
 use smol::prelude::*;
 
-use smol::io::BoxedReader;
+
+#[derive(Debug)]
+pub struct Decode {
+  pub c: char,
+  pub skipped_bytes: usize,
+  pub found_bytes: usize,
+}
 
 
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
@@ -31,9 +37,12 @@ pub fn get_utf8_byte_type(b: u8) -> UTF8ByteType {
 }
 
 
-pub async fn read_utf8_char(input: &mut BoxedReader) -> Result<char> {
+pub async fn read_utf8_char(input: &mut (impl AsyncRead + Unpin))
+    -> Result<Decode>
+{
   let mut buf = vec![0; 4];
   let mut unread_byte: Option<u8> = None;
+  let mut skipped_bytes = 0;
 
   loop {
     if let Some(byte) = unread_byte {
@@ -43,56 +52,76 @@ pub async fn read_utf8_char(input: &mut BoxedReader) -> Result<char> {
       input.read_exact(&mut buf[0 .. 1]).await?;
     }
 
-    match get_utf8_byte_type(buf[0]) {
-      UTF8ByteType::Single => { },
+    let found_bytes = match get_utf8_byte_type(buf[0]) {
+      UTF8ByteType::Single => {
+        1
+      },
+
       UTF8ByteType::Introducer(2) => {
         input.read_exact(&mut buf[1 .. 2]).await?;
         if get_utf8_byte_type(buf[1]) != UTF8ByteType::Continuation {
           unread_byte = Some(buf[1]);
+          skipped_bytes += 1;
           continue;
         }
+
+        2
       },
+
       UTF8ByteType::Introducer(3) => {
         input.read_exact(&mut buf[1 .. 2]).await?;
         if get_utf8_byte_type(buf[1]) != UTF8ByteType::Continuation {
           unread_byte = Some(buf[1]);
+          skipped_bytes += 1;
           continue;
         }
 
         input.read_exact(&mut buf[2 .. 3]).await?;
         if get_utf8_byte_type(buf[2]) != UTF8ByteType::Continuation {
           unread_byte = Some(buf[2]);
+          skipped_bytes += 2;
           continue;
         }
+
+        3
       },
+
       UTF8ByteType::Introducer(4) => {
         input.read_exact(&mut buf[1 .. 2]).await?;
         if get_utf8_byte_type(buf[1]) != UTF8ByteType::Continuation {
           unread_byte = Some(buf[1]);
+          skipped_bytes += 1;
           continue;
         }
 
         input.read_exact(&mut buf[2 .. 3]).await?;
         if get_utf8_byte_type(buf[2]) != UTF8ByteType::Continuation {
           unread_byte = Some(buf[2]);
+          skipped_bytes += 2;
           continue;
         }
 
         input.read_exact(&mut buf[3 .. 4]).await?;
         if get_utf8_byte_type(buf[3]) != UTF8ByteType::Continuation {
           unread_byte = Some(buf[3]);
+          skipped_bytes += 3;
           continue;
         }
+
+        4
       },
 
       /* If it's not the start of a valid character, ignore it. */
-      _ => continue,
-    }
+      _ => {
+        skipped_bytes += 1;
+        continue;
+      }
+    };
 
     if let Ok(string) = std::str::from_utf8(&buf)
        && let Some(c) = string.chars().next()
     {
-      return Ok(c);
+      return Ok(Decode { c, skipped_bytes, found_bytes });
     }
   }
 }
author	Irene Knapp <ireneista@irenes.space>	2026-03-27 16:59:00 -0700
committer	Irene Knapp <ireneista@irenes.space>	2026-03-27 16:59:00 -0700
commit	a80f9a1b97e1be194cb91a3b78717b0824d3bce8 (patch)
tree	a7481653125ce36b7d15a80ee763e87642ae053b /src/encoding.rs
parent	8d0a78e708dd46aec40d3a06459c86d9c10f1e3b (diff)