formatting_nostd/
utf8.rs

/// U+FFFD REPLACEMENT CHARACTER as a one-character string slice.
///
/// Emitted by the lossy decoding helpers in place of bytes that are not
/// valid UTF-8. Written as an escape sequence so the source stays pure ASCII
/// and the literal cannot be confused with (or damaged into) an actual
/// encoding error in this file.
const UNICODE_REPLACEMENT_CHAR_STR: &str = "\u{FFFD}";
2
/// Splits `buf` immediately after its first UTF-8 encoded character.
///
/// Returns that character (as a `&str`) together with the bytes that follow
/// it, or `None` if `buf` does not begin with a valid UTF-8 character.
///
/// # Panics
///
/// Panics if `buf` is empty.
///
/// # Examples
///
/// ```
/// # use formatting_nostd::utf8::split_at_first_char;
/// assert_eq!(
///     split_at_first_char(&[b'1', b'2', b'3'][..]),
///     Some(("1", &[b'2', b'3'][..]))
/// );
/// assert_eq!(split_at_first_char(&[0x80, b'2', b'3'][..]), None);
/// ```
pub fn split_at_first_char(buf: &[u8]) -> Option<(&str, &[u8])> {
    assert!(!buf.is_empty());

    // A UTF-8 character is encoded in 1 to 4 bytes, so only prefixes of those
    // lengths (capped by what is available) need to be considered.
    let longest_prefix = buf.len().min(4);

    // Shortest prefix first: the first one that validates is exactly one
    // character, because every shorter prefix of it is an incomplete encoding.
    (1..=longest_prefix).find_map(|end| {
        let (head, tail) = buf.split_at(end);
        core::str::from_utf8(head).ok().map(|first| (first, tail))
    })
}
26
#[cfg(test)]
#[test]
fn test_split_at_first_char() {
    // Each entry pairs an input buffer with the expected result.
    let cases: &[(&[u8], Option<(&str, &[u8])>)] = &[
        // Valid 1-byte first character, buffers of several lengths.
        (&[b'1', b'2', b'3'][..], Some(("1", &[b'2', b'3'][..]))),
        (&[b'1', b'2'][..], Some(("1", &[b'2'][..]))),
        (&[b'1'][..], Some(("1", &[][..]))),
        // Invalid first byte, buffers of several lengths.
        (&[0x80, b'2', b'3'][..], None),
        (&[0x80, b'2'][..], None),
        (&[0x80][..], None),
        // First characters encoded in 2, 3 and 4 bytes.
        (&[0xc2, 0xa1, 0][..], Some(("¡", &[0][..]))),
        (&[0xe0, 0xa4, 0xb9, 0][..], Some(("ह", &[0][..]))),
        (&[0xf0, 0x90, 0x8d, 0x88, 0][..], Some(("𐍈", &[0][..]))),
    ];

    for &(input, expected) in cases {
        assert_eq!(split_at_first_char(input), expected, "input: {:?}", input);
    }
}
60
61/// Split buffer after the first valid utf8 character.
62///
63/// If `buf` doesn't start with a valid utf8 character,
64/// returns the unicode replacement character and the buffer
65/// after skipping invalid bytes.
66///
67/// Panics if `buf` is empty.
68///
69/// ```
70/// # use formatting_nostd::utf8::split_at_first_char_lossy;
71/// assert_eq!(split_at_first_char_lossy(
72///   &['1' as u8, '2' as u8, '3' as u8][..]),
73///   ("1", &['2' as u8, '3' as u8][..]));
74/// assert_eq!(split_at_first_char_lossy(
75///   &[0x80, 0x80, '2' as u8, '3' as u8][..]),
76///   ("�", &['2' as u8, '3' as u8][..]));
77/// ```
78pub fn split_at_first_char_lossy(mut buf: &[u8]) -> (&str, &[u8]) {
79    assert!(!buf.is_empty());
80    let mut invalid_seq = false;
81    loop {
82        let res = split_at_first_char(buf);
83        if let Some((first_char, therest)) = res {
84            return if invalid_seq {
85                // We're at the end of an invalid sequence.
86                (UNICODE_REPLACEMENT_CHAR_STR, buf)
87            } else {
88                (first_char, therest)
89            };
90        }
91        // We're in an invalid sequence.
92        invalid_seq = true;
93
94        // Move forward one byte
95        buf = &buf[1..];
96
97        if buf.is_empty() {
98            return (UNICODE_REPLACEMENT_CHAR_STR, buf);
99        }
100    }
101}
102
#[cfg(test)]
#[test]
fn test_split_at_first_char_lossy() {
    // This relies on the implementation knowledge that the lossy variant is
    // built on `split_at_first_char`, so only the cases where the two differ
    // are exercised here.
    let expected_rest: &[u8] = &[2, 3];

    // Each entry pairs an input buffer with the expected leading piece; the
    // expected remainder is the same for all of them.
    let cases: &[(&[u8], &str)] = &[
        (&[b'1', 2, 3][..], "1"),
        (&[0x80, 2, 3][..], UNICODE_REPLACEMENT_CHAR_STR),
        (&[0x80, 0x80, 2, 3][..], UNICODE_REPLACEMENT_CHAR_STR),
    ];

    for &(input, expected_head) in cases {
        assert_eq!(
            split_at_first_char_lossy(input),
            (expected_head, expected_rest),
            "input: {:?}",
            input
        );
    }
}
121
/// Iterator that lossily decodes a byte slice as UTF-8, one piece at a time.
///
/// Created by [`decode_lossy`]. Each item is either a single valid UTF-8
/// character or the Unicode replacement character standing in for a run of
/// bytes that could not be decoded.
#[derive(Debug, Clone)]
pub struct DecodeLossyIterator<'a> {
    /// Bytes that have not been decoded yet.
    bytes: &'a [u8],
}
125
126impl<'a> core::iter::Iterator for DecodeLossyIterator<'a> {
127    type Item = &'a str;
128
129    fn next(&mut self) -> Option<Self::Item> {
130        if self.bytes.is_empty() {
131            return None;
132        }
133        let (item, next_bytes) = split_at_first_char_lossy(self.bytes);
134        self.bytes = next_bytes;
135        Some(item)
136    }
137}
138
139pub fn decode_lossy(bytes: &[u8]) -> DecodeLossyIterator {
140    DecodeLossyIterator { bytes }
141}
142
#[cfg(test)]
#[test]
fn test_lossy_decode_iterator() {
    // Pure ASCII input: one item per character.
    let ascii: Vec<&str> = decode_lossy("123".as_bytes()).collect();
    assert_eq!(ascii, ["1", "2", "3"]);

    // Runs of invalid bytes, whether leading or trailing, each become a
    // single replacement character.
    let input: &[u8] = &[0x80, 0x80, b'x', 0x80];
    let mixed: Vec<&str> = decode_lossy(input).collect();
    assert_eq!(
        mixed,
        [
            UNICODE_REPLACEMENT_CHAR_STR,
            "x",
            UNICODE_REPLACEMENT_CHAR_STR
        ]
    );
}