ucs2/
lib.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4// UNSAFETY: Defining and implementing from_slice_unchecked.
5#![expect(unsafe_code)]
6
7//! Wrappers around possibly misaligned `[u8]` buffers containing UCS-2 LE data.
8
9use std::fmt;
10use thiserror::Error;
11
12/// Errors which may occur while parsing UCS-2
13#[derive(Debug, Error)]
14pub enum Ucs2ParseError {
15    /// buffer's length was not a multiple of 2
16    #[error("buffer's length was not a multiple of 2")]
17    NotMultiple2,
18    /// buffer did not contain a null terminator
19    #[error("buffer did not contain a null terminator")]
20    MissingNullTerm,
21}
22
/// Owned counterpart of [`Ucs2LeSlice`]: a `Vec<u8>` holding a valid
/// null-terminated UCS-2 LE string.
///
/// **This type is not FFI compatible with `*const u16`!**
///
/// The backing storage is a `[u8]` rather than a `[u16]`, so the data is
/// **not** guaranteed to be `u16` aligned.
///
/// DEVNOTE: `Ucs2LeSlice` should stay backed by a `[u8]`, but `Ucs2LeVec`
/// should likely be switched over to a `Vec<u16>` in order to get proper `u16`
/// alignment. In that case, a bit of (trivially safe) `unsafe` code could
/// reinterpret the `Vec<u16>` as a `&[u8]` to keep the
/// `Deref<Target = Ucs2LeSlice>` impl, so there wouldn't be any major
/// ergonomic hit.
#[derive(Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub struct Ucs2LeVec(Vec<u8>);
37
38impl Ucs2LeVec {
39    /// Validate that the provided `Vec<u8>` is a valid null-terminated UCS-2 LE
40    /// string, truncating the slice to the position of the first null u16.
41    pub fn from_vec_with_nul(mut buf: Vec<u8>) -> Result<Ucs2LeVec, Ucs2ParseError> {
42        let slice = Ucs2LeSlice::from_slice_with_nul(&buf)?;
43        // SAFETY: `from_slice_with_nul` performs the truncation on a slice-view
44        // of the buf, so using that slice to truncate the buffer is ok.
45        buf.truncate(slice.0.len());
46        Ok(Ucs2LeVec(buf))
47    }
48
49    /// Consume self, returning the underlying raw `Vec<u8>`
50    pub fn into_inner(self) -> Vec<u8> {
51        self.0
52    }
53}
54
55impl Default for Ucs2LeVec {
56    fn default() -> Ucs2LeVec {
57        let s: &Ucs2LeSlice = Default::default();
58        s.to_ucs2_le_vec()
59    }
60}
61
62impl fmt::Debug for Ucs2LeVec {
63    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
64        fmt::Debug::fmt(&self.as_ref(), f)
65    }
66}
67
68impl fmt::Display for Ucs2LeVec {
69    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
70        fmt::Display::fmt(&self.as_ref(), f)
71    }
72}
73
74impl AsRef<Ucs2LeSlice> for Ucs2LeVec {
75    fn as_ref(&self) -> &Ucs2LeSlice {
76        // SAFETY: Ucs2LeVec can only contain valid UCS-2 data
77        unsafe { Ucs2LeSlice::from_slice_unchecked(&self.0) }
78    }
79}
80
81impl std::ops::Deref for Ucs2LeVec {
82    type Target = Ucs2LeSlice;
83
84    fn deref(&self) -> &Ucs2LeSlice {
85        self.as_ref()
86    }
87}
88
89impl std::borrow::Borrow<Ucs2LeSlice> for Ucs2LeVec {
90    fn borrow(&self) -> &Ucs2LeSlice {
91        self.as_ref()
92    }
93}
94
95impl<'a> From<&'a Ucs2LeSlice> for std::borrow::Cow<'a, Ucs2LeSlice> {
96    fn from(val: &'a Ucs2LeSlice) -> Self {
97        std::borrow::Cow::Borrowed(val)
98    }
99}
100
101impl From<Ucs2LeVec> for std::borrow::Cow<'_, Ucs2LeSlice> {
102    fn from(val: Ucs2LeVec) -> Self {
103        std::borrow::Cow::Owned(val)
104    }
105}
106
107impl<'a> From<&'a str> for Ucs2LeVec {
108    fn from(s: &'a str) -> Ucs2LeVec {
109        let mut s = s
110            .encode_utf16()
111            .flat_map(|w| [w as u8, (w >> 8) as u8])
112            .collect::<Vec<u8>>();
113        s.push(0);
114        s.push(0);
115        // SAFETY: UTF-8 str has been converted into a valid null-terminated UCS-2 Le string
116        Ucs2LeVec(s)
117    }
118}
119
120impl From<String> for Ucs2LeVec {
121    fn from(s: String) -> Ucs2LeVec {
122        Ucs2LeVec::from(s.as_str())
123    }
124}
125
/// Wrapper around `[u8]` containing a valid null-terminated UCS-2 LE string.
///
/// **This type is not FFI compatible with `*const u16`!**
///
/// Because `Ucs2LeSlice` uses a `[u8]` as the backing data type (as opposed to
/// a `[u16]`), the data is **not** guaranteed to be `u16` aligned!
///
/// # Example
///
/// ```
/// # use ucs2::Ucs2LeSlice;
/// let raw = [b'O', 0, b'K', 0, 0, 0];
/// let s = Ucs2LeSlice::from_slice_with_nul(&raw).unwrap();
/// assert_eq!(s.as_bytes().len(), raw.len());
/// assert_eq!(s.to_string(), "OK");
/// ```
// `repr(transparent)` guarantees that this struct has the same layout as its
// single `[u8]` field. `from_slice_unchecked` reinterprets a `&[u8]` as a
// `&Ucs2LeSlice`, which is only well-defined with that guarantee; the default
// `repr(Rust)` makes no layout promises.
#[derive(PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(transparent)]
pub struct Ucs2LeSlice([u8]);
144
145impl<'a> Default for &'a Ucs2LeSlice {
146    fn default() -> &'a Ucs2LeSlice {
147        // SAFETY: &[0, 0] is a valid null-terminated UCS-2 LE string.
148        unsafe { Ucs2LeSlice::from_slice_unchecked(&[0, 0]) }
149    }
150}
151
152impl Ucs2LeSlice {
153    /// Validate that the provided `&[u8]` is a valid null-terminated UCS-2 LE
154    /// string, truncating the slice to the position of the first null u16.
155    pub fn from_slice_with_nul(buf: &[u8]) -> Result<&Ucs2LeSlice, Ucs2ParseError> {
156        if buf.len() % 2 != 0 {
157            return Err(Ucs2ParseError::NotMultiple2);
158        }
159
160        // Unlike UTF-8 or UTF-16, UCS-2 doesn't require any complex semantic
161        // validation, as all values from 0 to 0xFFFF are valid codepoints.
162
163        let mut buf_as_u16_iter = buf
164            .chunks_exact(2)
165            .map(|c| u16::from_le_bytes(c.try_into().unwrap()));
166
167        match buf_as_u16_iter.position(|c| c == 0) {
168            None => Err(Ucs2ParseError::MissingNullTerm),
169            // SAFETY: buf has been validated to contain valid data
170            Some(idx) => Ok(unsafe { Ucs2LeSlice::from_slice_unchecked(&buf[..(idx + 1) * 2]) }),
171        }
172    }
173
174    /// Create a `Ucs2LeSlice` from a raw `&[u8]` without performing any
175    /// validation.
176    ///
177    /// # Safety
178    ///
179    /// Callers must ensure that the buf has a length that is a multiple of 2,
180    /// contains valid UCS-2 codepoints, and terminates with a single null u16.
181    unsafe fn from_slice_unchecked(buf: &[u8]) -> &Ucs2LeSlice {
182        // SAFETY: caller has maintained invariants, and `Ucs2LeSlice` has the
183        // same representation as [u8]
184        unsafe { std::mem::transmute(buf) }
185    }
186
187    /// View the underlying data as raw bytes.
188    pub fn as_bytes(&self) -> &[u8] {
189        &self.0
190    }
191
192    /// View the underlying data as raw bytes, without the trailing null `u16`.
193    pub fn as_bytes_without_nul(&self) -> &[u8] {
194        self.0.strip_suffix(&[0, 0]).unwrap()
195    }
196
197    /// Copies `self` into a new [`Ucs2LeVec`].
198    pub fn to_ucs2_le_vec(&self) -> Ucs2LeVec {
199        Ucs2LeVec(self.0.to_vec())
200    }
201
202    fn to_string_inner(&self) -> String {
203        // TODO: this isn't strictly correct, since UCS-2 handles chars in the
204        // surragate range (0xD800–0xDFFF) differently from UTF-16.
205        //
206        // Properly converting UCS-2 to UTF-8/16 is a bit more subtle, and
207        // handling this properly will require a PR in its own right.
208        String::from_utf16_lossy(
209            &self
210                .0
211                .chunks_exact(2)
212                .map(|c| u16::from_le_bytes(c.try_into().unwrap()))
213                .take_while(|b| *b != 0)
214                .collect::<Vec<u16>>(),
215        )
216    }
217}
218
219impl ToOwned for Ucs2LeSlice {
220    type Owned = Ucs2LeVec;
221
222    fn to_owned(&self) -> Ucs2LeVec {
223        self.to_ucs2_le_vec()
224    }
225}
226
227impl fmt::Debug for Ucs2LeSlice {
228    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
229        fmt::Debug::fmt(&self.to_string_inner(), f)
230    }
231}
232
233impl fmt::Display for Ucs2LeSlice {
234    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
235        fmt::Display::fmt(&self.to_string_inner(), f)
236    }
237}
238
#[cfg(test)]
mod test {
    use super::*;

    /// Encode `s` as UTF-16 and serialize every code unit as two
    /// little-endian bytes. No terminator is appended.
    fn str_to_utf16_buf(s: &str) -> Vec<u8> {
        // TODO: This is _technically_ incorrect, and will result in wonky
        // behavior if the string contains a code point outside of the Basic
        // Multilingual Plane (i.e: 0x0000-0xffff)
        //
        // Wonky != Invalid, since technically, UCS-2 doesn't have any "invalid"
        // values...
        //
        // In any case, this is test code, and we aren't using any funky chars
        // here, so it's not _super_ important.
        //
        // Too bad UEFI doesn't support proper UTF-16... imagine if we could use
        // Emoji as nvram variable names 👀
        let mut bytes = Vec::new();
        for unit in s.encode_utf16() {
            bytes.extend_from_slice(&unit.to_le_bytes());
        }
        bytes
    }

    /// A well-formed buffer parses, keeps its full length, and ends in a null
    /// `u16`.
    #[test]
    fn smoke() {
        let s8 = "hello!\0";

        let raw = str_to_utf16_buf(s8);
        let parsed = Ucs2LeSlice::from_slice_with_nul(&raw).unwrap();

        assert_eq!(parsed.as_bytes().len(), s8.len() * 2);
        let last_unit = parsed.as_bytes().chunks_exact(2).last();
        assert_eq!(last_unit, Some(&[0u8, 0][..]));
    }

    /// Data after the first null `u16` is cut off.
    #[test]
    fn interior_middle_null() {
        let raw = str_to_utf16_buf("hello!\0extra");
        let raw_expected = str_to_utf16_buf("hello!\0");

        let parsed = Ucs2LeSlice::from_slice_with_nul(&raw).unwrap();
        let expected = Ucs2LeSlice::from_slice_with_nul(&raw_expected).unwrap();

        assert_eq!(parsed, expected);
    }

    /// A buffer holding only the terminator is a valid (empty) string.
    #[test]
    fn zero_len() {
        let raw = str_to_utf16_buf("\0");
        let parsed = Ucs2LeSlice::from_slice_with_nul(&raw).unwrap();

        assert_eq!(parsed.as_bytes().len(), 2);
        assert_eq!(parsed.as_bytes(), &[0u8, 0][..]);
    }

    /// An odd number of bytes is rejected.
    #[test]
    fn not_multiple_2() {
        let mut raw = str_to_utf16_buf("so close!\0");
        raw.push(0);

        let outcome = Ucs2LeSlice::from_slice_with_nul(&raw);
        assert!(matches!(outcome, Err(Ucs2ParseError::NotMultiple2)));
    }

    /// A buffer without any null `u16` is rejected.
    #[test]
    fn missing_null_term() {
        let raw = str_to_utf16_buf("so close!");

        let outcome = Ucs2LeSlice::from_slice_with_nul(&raw);
        assert!(matches!(outcome, Err(Ucs2ParseError::MissingNullTerm)));
    }
}
317}