diff options
| author | Irene Knapp <ireneista@irenes.space> | 2026-03-27 08:52:52 -0700 |
|---|---|---|
| committer | Irene Knapp <ireneista@irenes.space> | 2026-03-27 08:52:52 -0700 |
| commit | 916bce453c48f10d42eb3744aa4c62d8ca2c4c69 (patch) | |
| tree | 6984f689fb0a57d9e5c65e6c3ddb48e4d6aac778 /src/encoding.rs | |
| parent | 3f3d62639b3160bd9ea7dc2c5ec6a53b3e9e11bc (diff) | |
deal with broken UTF8 even better
now it should self-synchronize properly if there's something really weird happening

also, that code is all refactored into encoding.rs

Force-Push: yes
Change-Id: I8bd9682448fc309b7aa6c0513e9b94cb5a4ace11
Diffstat (limited to 'src/encoding.rs')
| -rw-r--r-- | src/encoding.rs | 72 |
1 files changed, 71 insertions, 1 deletions
diff --git a/src/encoding.rs b/src/encoding.rs index 7d5326e..08ffb39 100644 --- a/src/encoding.rs +++ b/src/encoding.rs @@ -1,7 +1,11 @@ #![forbid(unsafe_code)] +use crate::types::*; +use smol::prelude::*; +use smol::io::BoxedReader; -#[derive(Clone, Copy, Debug)] + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum UTF8ByteType { Single, Introducer(u8), @@ -26,3 +30,69 @@ pub fn get_utf8_byte_type(b: u8) -> UTF8ByteType { } } + +pub async fn read_utf8_char(input: &mut BoxedReader) -> Result<char> { + let mut buf = vec![0; 4]; + let mut unread_byte: Option<u8> = None; + + loop { + if let Some(byte) = unread_byte { + buf[0] = byte; + unread_byte = None; + } else { + input.read_exact(&mut buf[0 .. 1]).await?; + } + + match get_utf8_byte_type(buf[0]) { + UTF8ByteType::Single => { }, + UTF8ByteType::Introducer(2) => { + input.read_exact(&mut buf[1 .. 2]).await?; + if get_utf8_byte_type(buf[1]) != UTF8ByteType::Continuation { + unread_byte = Some(buf[1]); + continue; + } + }, + UTF8ByteType::Introducer(3) => { + input.read_exact(&mut buf[1 .. 2]).await?; + if get_utf8_byte_type(buf[1]) != UTF8ByteType::Continuation { + unread_byte = Some(buf[1]); + continue; + } + + input.read_exact(&mut buf[2 .. 3]).await?; + if get_utf8_byte_type(buf[2]) != UTF8ByteType::Continuation { + unread_byte = Some(buf[2]); + continue; + } + }, + UTF8ByteType::Introducer(4) => { + input.read_exact(&mut buf[1 .. 2]).await?; + if get_utf8_byte_type(buf[1]) != UTF8ByteType::Continuation { + unread_byte = Some(buf[1]); + continue; + } + + input.read_exact(&mut buf[2 .. 3]).await?; + if get_utf8_byte_type(buf[2]) != UTF8ByteType::Continuation { + unread_byte = Some(buf[2]); + continue; + } + + input.read_exact(&mut buf[3 .. 4]).await?; + if get_utf8_byte_type(buf[3]) != UTF8ByteType::Continuation { + unread_byte = Some(buf[3]); + continue; + } + }, + + /* If it's not the start of a valid character, ignore it. 
*/ + _ => continue, + } + + if let Ok(string) = std::str::from_utf8(&buf) + && let Some(c) = string.chars().next() + { + return Ok(c); + } + } +} |