Skip to content

Commit 8d2beb5

Browse files
committed
Allow access to OsStr bytes
`OsStr` has historically kept its implementation details private out of concern for locking us into a specific encoding on Windows. This is an alternative to #95290 which proposed specifying the encoding on Windows. Instead, this only specifies that for cross-platform code, `OsStr`'s encoding is a superset of UTF-8 and defines rules for safely interacting with it. At minimum, this can greatly simplify the `os_str_bytes` crate and every arg parser that interacts with `OsStr` directly (which is most of those that support invalid UTF-8).
1 parent 70e04bd commit 8d2beb5

File tree

6 files changed

+91
-9
lines changed

6 files changed

+91
-9
lines changed

library/std/src/ffi/mod.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,14 @@
127127
//! trait, which provides a [`from_wide`] method to convert a native Windows
128128
//! string (without the terminating nul character) to an [`OsString`].
129129
//!
130+
//! ## On all platforms
131+
//!
132+
//! On all platforms, [`OsStr`] consists of a sequence of bytes that is encoded as a superset of
133+
//! UTF-8; see [`OsString`] for more details on its encoding on different platforms.
134+
//!
135+
//! For limited, inexpensive conversions from and to bytes, see [`OsStr::as_os_str_bytes`] and
136+
//! [`OsStr::from_os_str_bytes_unchecked`].
137+
//!
130138
//! [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value
131139
//! [Unicode code point]: https://www.unicode.org/glossary/#code_point
132140
//! [`env::set_var()`]: crate::env::set_var "env::set_var"

library/std/src/ffi/os_str.rs

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -667,6 +667,45 @@ impl OsStr {
667667
s.as_ref()
668668
}
669669

670+
/// Converts a slice of bytes to an OS string slice without checking that the string contains
671+
/// valid `OsStr`-encoded data.
672+
///
673+
/// See the [module's toplevel documentation about conversions][conversions] for safe,
674+
/// cross-platform [conversions] from/to native representations.
675+
///
676+
/// # Safety
677+
///
678+
/// `OsStr`'s encoding is an unspecified superset of UTF-8 and callers must
679+
/// pass in bytes that originated as a mixture of validated UTF-8 and bytes from
680+
/// [`OsStr::as_os_str_bytes`] from within the same Rust version built for the same target
681+
/// platform. The bytes from `OsStr::as_os_str_bytes` may be split either
682+
/// immediately before or immediately after some valid non-empty UTF-8 substring.
683+
///
684+
/// # Example
685+
///
686+
/// ```
687+
/// #![feature(os_str_bytes)]
688+
///
689+
/// use std::ffi::OsStr;
690+
///
691+
/// let os_str = OsStr::new("Mary had a little lamb");
692+
/// let bytes = os_str.as_os_str_bytes();
693+
/// let words = bytes.split(|b| *b == b' ');
694+
/// let words: Vec<&OsStr> = words.map(|word| {
695+
/// // SAFETY:
696+
/// // - Each `word` only contains content that originated from `OsStr::as_os_str_bytes`
697+
/// // - Only split with ASCII whitespace which is a non-empty UTF-8 substring
698+
/// unsafe { OsStr::from_os_str_bytes_unchecked(word) }
699+
/// }).collect();
700+
/// ```
701+
///
702+
/// [conversions]: super#conversions
703+
#[inline]
704+
#[unstable(feature = "os_str_bytes", issue = "111544")]
705+
pub fn from_os_str_bytes_unchecked(bytes: &[u8]) -> &Self {
706+
Self::from_inner(Slice::from_os_str_bytes_unchecked(bytes))
707+
}
708+
670709
#[inline]
671710
fn from_inner(inner: &Slice) -> &OsStr {
672711
// SAFETY: OsStr is just a wrapper of Slice,
@@ -837,13 +876,28 @@ impl OsStr {
837876
OsString { inner: Buf::from_box(boxed) }
838877
}
839878

879+
/// Converts an OS string slice to a byte slice. To convert the byte slice back into an OS
880+
/// string slice, use the [`OsStr::from_os_str_bytes_unchecked`] function.
881+
///
882+
/// Note: As the encoding is unspecified, any sub-slice of bytes that is not valid UTF-8 should
883+
/// be treated as opaque and only comparable within the same Rust version built for the same
884+
/// target platform. See [`OsString`] for more encoding details and [`std::ffi`] for
885+
/// platform-specific, specified conversions.
886+
///
887+
/// [`std::ffi`]: crate::ffi
888+
#[inline]
889+
#[unstable(feature = "os_str_bytes", issue = "111544")]
890+
pub fn as_os_str_bytes(&self) -> &[u8] {
891+
self.inner.as_os_str_bytes()
892+
}
893+
840894
/// Gets the underlying byte representation.
841895
///
842896
/// Note: it is *crucial* that this API is not externally public, to avoid
843897
/// revealing the internal, platform-specific encodings.
844898
#[inline]
845899
pub(crate) fn bytes(&self) -> &[u8] {
846-
unsafe { &*(&self.inner as *const _ as *const [u8]) }
900+
self.as_os_str_bytes()
847901
}
848902

849903
/// Converts this string to its ASCII lower case equivalent in-place.

library/std/src/sys/unix/os_str.rs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -193,13 +193,18 @@ impl Buf {
193193

194194
impl Slice {
195195
#[inline]
196-
fn from_u8_slice(s: &[u8]) -> &Slice {
196+
pub fn as_os_str_bytes(&self) -> &[u8] {
197+
&self.inner
198+
}
199+
200+
#[inline]
201+
pub unsafe fn from_os_str_bytes_unchecked(s: &[u8]) -> &Slice {
197202
unsafe { mem::transmute(s) }
198203
}
199204

200205
#[inline]
201206
pub fn from_str(s: &str) -> &Slice {
202-
Slice::from_u8_slice(s.as_bytes())
207+
unsafe { Slice::from_os_str_bytes_unchecked(s.as_bytes()) }
203208
}
204209

205210
pub fn to_str(&self) -> Option<&str> {

library/std/src/sys/unix/os_str/tests.rs

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ use super::*;
22

33
#[test]
44
fn slice_debug_output() {
5-
let input = Slice::from_u8_slice(b"\xF0hello,\tworld");
5+
let input = unsafe { Slice::from_os_str_bytes_unchecked(b"\xF0hello,\tworld") };
66
let expected = r#""\xF0hello,\tworld""#;
77
let output = format!("{input:?}");
88

@@ -11,8 +11,7 @@ fn slice_debug_output() {
1111

1212
#[test]
1313
fn display() {
14-
assert_eq!(
15-
"Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye",
16-
Slice::from_u8_slice(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string(),
17-
);
14+
assert_eq!("Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye", unsafe {
15+
Slice::from_os_str_bytes_unchecked(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string()
16+
},);
1817
}

library/std/src/sys/windows/os_str.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,16 @@ impl Buf {
151151
}
152152

153153
impl Slice {
154+
#[inline]
155+
pub fn as_os_str_bytes(&self) -> &[u8] {
156+
self.inner.as_bytes()
157+
}
158+
159+
#[inline]
160+
pub unsafe fn from_os_str_bytes_unchecked(s: &[u8]) -> &Slice {
161+
mem::transmute(Wtf8::from_bytes_unchecked(s))
162+
}
163+
154164
#[inline]
155165
pub fn from_str(s: &str) -> &Slice {
156166
unsafe { mem::transmute(Wtf8::from_str(s)) }

library/std/src/sys_common/wtf8.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -570,7 +570,7 @@ impl Wtf8 {
570570
/// Since the byte slice is not checked for valid WTF-8, this function is
571571
/// marked unsafe.
572572
#[inline]
573-
unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 {
573+
pub unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 {
574574
mem::transmute(value)
575575
}
576576

@@ -614,6 +614,12 @@ impl Wtf8 {
614614
Wtf8CodePoints { bytes: self.bytes.iter() }
615615
}
616616

617+
/// Access raw bytes of WTF-8 data
618+
#[inline]
619+
pub fn as_bytes(&self) -> &[u8] {
620+
&self.bytes
621+
}
622+
617623
/// Tries to convert the string to UTF-8 and return a `&str` slice.
618624
///
619625
/// Returns `None` if the string contains surrogates.

0 commit comments

Comments
 (0)