#![forbid(unsafe_code)] use crate::types::*; use smol::prelude::*; use smol::io::BoxedReader; #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum UTF8ByteType { Single, Introducer(u8), Continuation, Invalid, } pub fn get_utf8_byte_type(b: u8) -> UTF8ByteType { if b & 0x80 == 0 { UTF8ByteType::Single } else if b & 0xC0 == 0x80 { UTF8ByteType::Continuation } else if b & 0xE0 == 0xC0 { UTF8ByteType::Introducer(2) } else if b & 0xF0 == 0xE0 { UTF8ByteType::Introducer(3) } else if b & 0xF8 == 0xF0 { UTF8ByteType::Introducer(4) } else { UTF8ByteType::Invalid } } pub async fn read_utf8_char(input: &mut BoxedReader) -> Result { let mut buf = vec![0; 4]; let mut unread_byte: Option = None; loop { if let Some(byte) = unread_byte { buf[0] = byte; unread_byte = None; } else { input.read_exact(&mut buf[0 .. 1]).await?; } match get_utf8_byte_type(buf[0]) { UTF8ByteType::Single => { }, UTF8ByteType::Introducer(2) => { input.read_exact(&mut buf[1 .. 2]).await?; if get_utf8_byte_type(buf[1]) != UTF8ByteType::Continuation { unread_byte = Some(buf[1]); continue; } }, UTF8ByteType::Introducer(3) => { input.read_exact(&mut buf[1 .. 2]).await?; if get_utf8_byte_type(buf[1]) != UTF8ByteType::Continuation { unread_byte = Some(buf[1]); continue; } input.read_exact(&mut buf[2 .. 3]).await?; if get_utf8_byte_type(buf[2]) != UTF8ByteType::Continuation { unread_byte = Some(buf[2]); continue; } }, UTF8ByteType::Introducer(4) => { input.read_exact(&mut buf[1 .. 2]).await?; if get_utf8_byte_type(buf[1]) != UTF8ByteType::Continuation { unread_byte = Some(buf[1]); continue; } input.read_exact(&mut buf[2 .. 3]).await?; if get_utf8_byte_type(buf[2]) != UTF8ByteType::Continuation { unread_byte = Some(buf[2]); continue; } input.read_exact(&mut buf[3 .. 4]).await?; if get_utf8_byte_type(buf[3]) != UTF8ByteType::Continuation { unread_byte = Some(buf[3]); continue; } }, /* If it's not the start of a valid character, ignore it. */ _ => continue, } if let Ok(string) = std::str::from_utf8(&buf) && let Some(c) = string.chars().next() { return Ok(c); } } }