diff options
| author | Irene Knapp <ireneista@irenes.space> | 2026-03-27 16:59:00 -0700 |
|---|---|---|
| committer | Irene Knapp <ireneista@irenes.space> | 2026-03-27 16:59:00 -0700 |
| commit | a80f9a1b97e1be194cb91a3b78717b0824d3bce8 (patch) | |
| tree | a7481653125ce36b7d15a80ee763e87642ae053b /src/encoding.rs | |
| parent | 8d0a78e708dd46aec40d3a06459c86d9c10f1e3b (diff) | |
fully implement the first-nonblank-column thing for H and L
surprisingly intricate; although it looks messy now, this approach will clean up nicely

Force-Push: yes
Change-Id: Ic9c90982787a58110ec0a189844742a1e6c2216f
Diffstat (limited to 'src/encoding.rs')
| -rw-r--r-- | src/encoding.rs | 43 |
1 files changed, 36 insertions, 7 deletions
diff --git a/src/encoding.rs b/src/encoding.rs index 08ffb39..ccd2031 100644 --- a/src/encoding.rs +++ b/src/encoding.rs @@ -2,7 +2,13 @@ use crate::types::*; use smol::prelude::*; -use smol::io::BoxedReader; + +#[derive(Debug)] +pub struct Decode { + pub c: char, + pub skipped_bytes: usize, + pub found_bytes: usize, +} #[derive(Clone, Copy, Debug, PartialEq, Eq)] @@ -31,9 +37,12 @@ pub fn get_utf8_byte_type(b: u8) -> UTF8ByteType { } -pub async fn read_utf8_char(input: &mut BoxedReader) -> Result<char> { +pub async fn read_utf8_char(input: &mut (impl AsyncRead + Unpin)) + -> Result<Decode> +{ let mut buf = vec![0; 4]; let mut unread_byte: Option<u8> = None; + let mut skipped_bytes = 0; loop { if let Some(byte) = unread_byte { @@ -43,56 +52,76 @@ pub async fn read_utf8_char(input: &mut BoxedReader) -> Result<char> { input.read_exact(&mut buf[0 .. 1]).await?; } - match get_utf8_byte_type(buf[0]) { - UTF8ByteType::Single => { }, + let found_bytes = match get_utf8_byte_type(buf[0]) { + UTF8ByteType::Single => { + 1 + }, + UTF8ByteType::Introducer(2) => { input.read_exact(&mut buf[1 .. 2]).await?; if get_utf8_byte_type(buf[1]) != UTF8ByteType::Continuation { unread_byte = Some(buf[1]); + skipped_bytes += 1; continue; } + + 2 }, + UTF8ByteType::Introducer(3) => { input.read_exact(&mut buf[1 .. 2]).await?; if get_utf8_byte_type(buf[1]) != UTF8ByteType::Continuation { unread_byte = Some(buf[1]); + skipped_bytes += 1; continue; } input.read_exact(&mut buf[2 .. 3]).await?; if get_utf8_byte_type(buf[2]) != UTF8ByteType::Continuation { unread_byte = Some(buf[2]); + skipped_bytes += 2; continue; } + + 3 }, + UTF8ByteType::Introducer(4) => { input.read_exact(&mut buf[1 .. 2]).await?; if get_utf8_byte_type(buf[1]) != UTF8ByteType::Continuation { unread_byte = Some(buf[1]); + skipped_bytes += 1; continue; } input.read_exact(&mut buf[2 .. 
3]).await?; if get_utf8_byte_type(buf[2]) != UTF8ByteType::Continuation { unread_byte = Some(buf[2]); + skipped_bytes += 2; continue; } input.read_exact(&mut buf[3 .. 4]).await?; if get_utf8_byte_type(buf[3]) != UTF8ByteType::Continuation { unread_byte = Some(buf[3]); + skipped_bytes += 3; continue; } + + 4 }, /* If it's not the start of a valid character, ignore it. */ - _ => continue, - } + _ => { + skipped_bytes += 1; + continue; + } + }; if let Ok(string) = std::str::from_utf8(&buf) && let Some(c) = string.chars().next() { - return Ok(c); + return Ok(Decode { c, skipped_bytes, found_bytes }); } } } |