8000 move CStr8, CStr16, and CString16 from uefi to ucs2 by phip1611 · Pull Request #23 · rust-osdev/ucs2-rs · GitHub
[go: up one dir, main page]

Skip to content

move CStr8, CStr16, and CString16 from uefi to ucs2 #23

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 7 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
lib: move code to encoding module
In a next step, we can add more modules.
  • Loading branch information
phip1611 committed Jul 30, 2024
commit cead90b6c4bf704541de8796dc7c0fd178f8b631
2 changes: 1 addition & 1 deletion src/macros.rs → src/encoding/macros.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::{ucs2_from_utf8_at_offset, Error};
use super::{ucs2_from_utf8_at_offset, Error};

/// Count the number of UCS-2 characters in a string. Return an error if
/// the string cannot be encoded in UCS-2.
Expand Down
223 changes: 223 additions & 0 deletions src/encoding/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
//! Low-level encoding and decoding facilities for UCS-2 strings.

mod macros;

/// These need to be public for the `ucs2_cstr!` macro, but are not
/// intended to be called directly.
#[doc(hidden)]
pub use macros::{str_num_ucs2_chars, str_to_ucs2};

use bit_field::BitField;
use core::fmt::{self, Display, Formatter};

/// Possible errors when encoding UCS-2 strings..
#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub enum Error {
/// Not enough space left in the output buffer.
BufferOverflow,
/// Input contained a character which cannot be represented in UCS-2.
MultiByte,
}

impl Display for Error {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
match self {
Self::BufferOverflow => f.write_str("output buffer is too small"),
Self::MultiByte => {
f.write_str("input contains a character which cannot be represented in UCS-2")
}
}
}
}

type Result<T> = core::result::Result<T, Error>;

/// Value returned by `ucs2_from_utf8_at_offset`.
struct Ucs2CharFromUtf8 {
/// UCS-2 character.
val: u16,
/// Number of bytes needed to encode the character in UTF-8.
num_bytes: u8,
}

/// Get a UCS-2 character from a UTF-8 byte slice at the given offset.
///
/// # Safety
///
/// The input `bytes` must be valid UTF-8.
const unsafe fn ucs2_from_utf8_at_offset(bytes: &[u8], offset: usize) -> Result<Ucs2CharFromUtf8> {
let len = bytes.len();
let ch;
let ch_len;

if bytes[offset] & 0b1000_0000 == 0b0000_0000 {
ch = bytes[offset] as u16;
ch_len = 1;
} else if bytes[offset] & 0b1110_0000 == 0b1100_0000 {
// 2 byte codepoint
if offset + 1 >= len {
// safe: len is the length of bytes,
// and bytes is a direct view into the
// buffer of input, which in order to be a valid
// utf-8 string _must_ contain `i + 1`.
unsafe { core::hint::unreachable_unchecked() }
}

let a = (bytes[offset] & 0b0001_1111) as u16;
let b = (bytes[offset + 1] & 0b0011_1111) as u16;
ch = a << 6 | b;
ch_len = 2;
} else if bytes[offset] & 0b1111_0000 == 0b1110_0000 {
// 3 byte codepoint
if offset + 2 >= len || offset + 1 >= len {
// safe: impossible utf-8 string.
unsafe { core::hint::unreachable_unchecked() }
}

let a = (bytes[offset] & 0b0000_1111) as u16;
let b = (bytes[offset + 1] & 0b0011_1111) as u16;
let c = (bytes[offset + 2] & 0b0011_1111) as u16;
ch = a << 12 | b << 6 | c;
ch_len = 3;
} else if bytes[offset] & 0b1111_0000 == 0b1111_0000 {
return Err(Error::MultiByte); // UTF-16
} else {
// safe: impossible utf-8 string.
unsafe { core::hint::unreachable_unchecked() }
}

Ok(Ucs2CharFromUtf8 {
val: ch,
num_bytes: ch_len,
})
}

/// Encodes an input UTF-8 string into a UCS-2 string.
///
/// The returned `usize` represents the length of the returned buffer,
/// measured in 2-byte characters.
pub fn encode(input: &str, buffer: &mut [u16]) -> Result<usize> {
let buffer_size = buffer.len();
let mut i = 0;

encode_with(input, |ch| {
if i >= buffer_size {
Err(Error::BufferOverflow)
} else {
buffer[i] = ch;
i += 1;
Ok(())
}
})?;

Ok(i)
}

/// Encode UTF-8 string to UCS-2 with a custom callback function.
///
/// `output` is a function which receives every encoded character.
pub fn encode_with<F>(input: &str, mut output: F) -> Result<()>
where
F: FnMut(u16) -> Result<()>,
{
let bytes = input.as_bytes();
let len = bytes.len();
let mut i = 0;

while i < len {
// SAFETY: `bytes` is valid UTF-8.
let ch = unsafe { ucs2_from_utf8_at_offset(bytes, i) }?;
i += usize::from(ch.num_bytes);
output(ch.val)?;
}
Ok(())
}

/// Decode UCS-2 string to UTF-8 with a custom callback function.
///
/// `output` is a function which receives every decoded character.
/// Due to the nature of UCS-2, the function can receive an UTF-8 character
/// of up to three bytes, for every input character.
pub fn decode_with<F>(input: &[u16], mut output: F) -> Result<usize>
where
F: FnMut(&[u8]) -> Result<()>,
{
let mut written = 0;

for ch in input.iter() {
/*
* We need to find how many bytes of UTF-8 this UCS-2 code-point needs. Because UCS-2 can only encode
* the Basic Multilingual Plane, a maximum of three bytes are needed.
*/
if (0x000..0x0080).contains(ch) {
output(&[*ch as u8])?;

written += 1;
} else if (0x0080..0x0800).contains(ch) {
let first = 0b1100_0000 + ch.get_bits(6..11) as u8;
let last = 0b1000_0000 + ch.get_bits(0..6) as u8;

output(&[first, last])?;

written += 2;
} else {
let first = 0b1110_0000 + ch.get_bits(12..16) as u8;
let mid = 0b1000_0000 + ch.get_bits(6..12) as u8;
let last = 0b1000_0000 + ch.get_bits(0..6) as u8;

output(&[first, mid, last])?;

written += 3;
}
}

Ok(written)
}

/// Decode an input UCS-2 string into a UTF-8 string.
///
/// The returned `usize` represents the length of the returned buffer,
/// in bytes. Due to the nature of UCS-2, the output buffer could end up with
/// three bytes for every character in the input buffer.
pub fn decode(input: &[u16], output: &mut [u8]) -> Result<usize> {
let buffer_size = output.len();
let mut i = 0;

decode_with(input, |bytes| {
if bytes.len() == 1 {
// Can be encoded in a single byte
if i >= buffer_size {
return Err(Error::BufferOverflow);
}

output[i] = bytes[0];

i += 1;
} else if bytes.len() == 2 {
// Can be encoded two bytes
if i + 1 >= buffer_size {
return Err(Error::BufferOverflow);
}

output[i] = bytes[0];
output[i + 1] = bytes[1];

i += 2;
} else if bytes.len() == 3 {
// Can be encoded three bytes
if i + 2 >= buffer_size {
return Err(Error::BufferOverflow);
}

output[i] = bytes[0];
output[i + 1] = bytes[1];
output[i + 2] = bytes[2];

i += 3;
} else {
unreachable!("More than three bytes per UCS-2 character.");
}

Ok(())
})
}
Loading
0