auto merge of #6029 : Kimundi/rust/ascii-encoding, r=thestinger

bors · bors · commit ee3789b4e434 · 2013-04-24T13:33:29.000-07:00
Replaced {str, char, u8}::is_ascii
Replaced str::to_lower and str::to_upper
diff --git a/doc/rust.md b/doc/rust.md
@@ -802,7 +802,7 @@ An example of `use` declarations:
 
 ~~~~
 use core::float::sin;
-use core::str::{slice, to_upper};
+use core::str::{slice, contains};
 use core::option::Some;
 
 fn main() {
@@ -813,8 +813,8 @@ fn main() {
     info!(Some(1.0));
 
     // Equivalent to
-    // 'info!(core::str::to_upper(core::str::slice("foo", 0, 1)));'
-    info!(to_upper(slice("foo", 0, 1)));
+    // 'info!(core::str::contains(core::str::slice("foo", 0, 1), "oo"));'
+    info!(contains(slice("foo", 0, 1), "oo"));
 }
 ~~~~
 
diff --git a/src/compiletest/errors.rs b/src/compiletest/errors.rs
@@ -50,7 +50,11 @@ fn parse_expected(line_num: uint, line: ~str) -> ~[ExpectedError] {
     while idx < len && line[idx] == (' ' as u8) { idx += 1u; }
     let start_kind = idx;
     while idx < len && line[idx] != (' ' as u8) { idx += 1u; }
-    let kind = str::to_lower(str::slice(line, start_kind, idx).to_owned());
+
+    // FIXME: #4318 Instead of to_ascii and to_str_ascii, could use
+    // to_ascii_consume and to_str_consume to avoid an unnecessary copy.
+    let kind = str::slice(line, start_kind, idx);
+    let kind = kind.to_ascii().to_lower().to_str_ascii();
 
     // Extract msg:
     while idx < len && line[idx] == (' ' as u8) { idx += 1u; }
diff --git a/src/libcore/char.rs b/src/libcore/char.rs
@@ -100,12 +100,6 @@ pub fn is_alphanumeric(c: char) -> bool {
         unicode::general_category::No(c);
 }
 
-/// Indicates whether the character is an ASCII character
-#[inline(always)]
-pub fn is_ascii(c: char) -> bool {
-   c - ('\x7F' & c) == '\x00'
-}
-
 /// Indicates whether the character is numeric (Nd, Nl, or No)
 #[inline(always)]
 pub fn is_digit(c: char) -> bool {
@@ -116,7 +110,7 @@ pub fn is_digit(c: char) -> bool {
 
 /**
  * Checks if a character parses as a numeric digit in the given radix.
- * Compared to `is_digit()`, this function only recognizes the ascii
+ * Compared to `is_digit()`, this function only recognizes the
  * characters `0-9`, `a-z` and `A-Z`.
  *
  * Returns `true` if `c` is a valid digit under `radix`, and `false`
@@ -163,7 +157,7 @@ pub fn to_digit(c: char, radix: uint) -> Option<uint> {
 }
 
 /**
- * Converts a number to the ascii character representing it.
+ * Converts a number to the character representing it.
  *
  * Returns `Some(char)` if `num` represents one digit under `radix`,
  * using one character of `0-9` or `a-z`, or `None` if it doesn't.
@@ -316,12 +310,6 @@ fn test_to_digit() {
     assert!(to_digit('$', 36u).is_none());
 }
 
-#[test]
-fn test_is_ascii() {
-   assert!(str::all(~"banana", is_ascii));
-   assert!(! str::all(~"ประเทศไทย中华Việt Nam", is_ascii));
-}
-
 #[test]
 fn test_is_digit() {
    assert!(is_digit('2'));
diff --git a/src/libcore/num/uint-template/u8.rs b/src/libcore/num/uint-template/u8.rs
@@ -10,16 +10,9 @@
 
 //! Operations and constants for `u8`
 
-pub use self::inst::is_ascii;
-
 mod inst {
     pub type T = u8;
     #[allow(non_camel_case_types)]
     pub type T_SIGNED = i8;
     pub static bits: uint = 8;
-
-    // Type-specific functions here. These must be reexported by the
-    // parent module so that they appear in core::u8 and not core::u8::u8;
-
-    pub fn is_ascii(x: T) -> bool { return 0 as T == x & 128 as T; }
 }
diff --git a/src/libcore/path.rs b/src/libcore/path.rs
@@ -19,6 +19,7 @@ use libc;
 use option::{None, Option, Some};
 use str;
 use to_str::ToStr;
+use ascii::{AsciiCast, AsciiStr};
 
 #[deriving(Clone, Eq)]
 pub struct WindowsPath {
@@ -753,7 +754,9 @@ impl GenericPath for WindowsPath {
     fn is_restricted(&self) -> bool {
         match self.filestem() {
             Some(stem) => {
-                match stem.to_lower() {
+                // FIXME: #4318 Instead of to_ascii and to_str_ascii, could use
+                // to_ascii_consume and to_str_consume to avoid an unnecessary copy.
+                match stem.to_ascii().to_lower().to_str_ascii() {
                     ~"con" | ~"aux" | ~"com1" | ~"com2" | ~"com3" | ~"com4" |
                     ~"lpt1" | ~"lpt2" | ~"lpt3" | ~"prn" | ~"nul" => true,
                     _ => false
@@ -809,7 +812,10 @@ impl GenericPath for WindowsPath {
             host: copy self.host,
             device: match self.device {
                 None => None,
-                Some(ref device) => Some(device.to_upper())
+
+                // FIXME: #4318 Instead of to_ascii and to_str_ascii, could use
+                // to_ascii_consume and to_str_consume to avoid an unnecessary copy.
+                Some(ref device) => Some(device.to_ascii().to_upper().to_str_ascii())
             },
             is_absolute: self.is_absolute,
             components: normalize(self.components)
diff --git a/src/libcore/str.rs b/src/libcore/str.rs
@@ -27,7 +27,6 @@ use option::{None, Option, Some};
 use iterator::Iterator;
 use ptr;
 use str;
-use u8;
 use uint;
 use vec;
 use to_str::ToStr;
@@ -787,22 +786,6 @@ pub fn each_split_within<'a>(ss: &'a str,
     }
 }
 
-/// Convert a string to lowercase. ASCII only
-pub fn to_lower(s: &str) -> ~str {
-    do map(s) |c| {
-        assert!(char::is_ascii(c));
-        (unsafe{libc::tolower(c as libc::c_char)}) as char
-    }
-}
-
-/// Convert a string to uppercase. ASCII only
-pub fn to_upper(s: &str) -> ~str {
-    do map(s) |c| {
-        assert!(char::is_ascii(c));
-        (unsafe{libc::toupper(c as libc::c_char)}) as char
-    }
-}
-
 /**
  * Replace all occurrences of one string with another
  *
@@ -1610,13 +1593,6 @@ pub fn ends_with<'a,'b>(haystack: &'a str, needle: &'b str) -> bool {
 Section: String properties
 */
 
-/// Determines if a string contains only ASCII characters
-pub fn is_ascii(s: &str) -> bool {
-    let mut i: uint = len(s);
-    while i > 0u { i -= 1u; if !u8::is_ascii(s[i]) { return false; } }
-    return true;
-}
-
 /// Returns true if the string has length 0
 pub fn is_empty(s: &str) -> bool { len(s) == 0u }
 
@@ -2403,8 +2379,6 @@ pub trait StrSlice<'self> {
     fn each_split_str<'a>(&self, sep: &'a str, it: &fn(&'self str) -> bool);
     fn starts_with<'a>(&self, needle: &'a str) -> bool;
     fn substr(&self, begin: uint, n: uint) -> &'self str;
-    fn to_lower(&self) -> ~str;
-    fn to_upper(&self) -> ~str;
     fn escape_default(&self) -> ~str;
     fn escape_unicode(&self) -> ~str;
     fn trim(&self) -> &'self str;
@@ -2565,12 +2539,6 @@ impl<'self> StrSlice<'self> for &'self str {
     fn substr(&self, begin: uint, n: uint) -> &'self str {
         substr(*self, begin, n)
     }
-    /// Convert a string to lowercase
-    #[inline]
-    fn to_lower(&self) -> ~str { to_lower(*self) }
-    /// Convert a string to uppercase
-    #[inline]
-    fn to_upper(&self) -> ~str { to_upper(*self) }
     /// Escape each char in `s` with char::escape_default.
     #[inline]
     fn escape_default(&self) -> ~str { escape_default(*self) }
@@ -3084,27 +3052,6 @@ mod tests {
         assert!(repeat(~"hi", 0) == ~"");
     }
 
-    #[test]
-    fn test_to_upper() {
-        // libc::toupper, and hence str::to_upper
-        // are culturally insensitive: they only work for ASCII
-        // (see Issue #1347)
-        let unicode = ~""; //"\u65e5\u672c"; // uncomment once non-ASCII works
-        let input = ~"abcDEF" + unicode + ~"xyz:.;";
-        let expected = ~"ABCDEF" + unicode + ~"XYZ:.;";
-        let actual = to_upper(input);
-        assert!(expected == actual);
-    }
-
-    #[test]
-    fn test_to_lower() {
-        // libc::tolower, and hence str::to_lower
-        // are culturally insensitive: they only work for ASCII
-        // (see Issue #1347)
-        assert!(~"" == to_lower(""));
-        assert!(~"ymca" == to_lower("YMCA"));
-    }
-
     #[test]
     fn test_unsafe_slice() {
         assert!("ab" == unsafe {raw::slice_bytes("abc", 0, 2)});
@@ -3337,13 +3284,6 @@ mod tests {
         assert!((!is_whitespace(~"   _   ")));
     }
 
-    #[test]
-    fn test_is_ascii() {
-        assert!((is_ascii(~"")));
-        assert!((is_ascii(~"a")));
-        assert!((!is_ascii(~"\u2009")));
-    }
-
     #[test]
     fn test_shift_byte() {
         let mut s = ~"ABC";
diff --git a/src/libcore/str/ascii.rs b/src/libcore/str/ascii.rs
@@ -199,6 +199,7 @@ impl ToStrConsume for ~[Ascii] {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use str;
 
     macro_rules! v2ascii (
         ( [$($e:expr),*]) => ( [$(Ascii{chr:$e}),*]);
@@ -221,6 +222,9 @@ mod tests {
         assert_eq!('['.to_ascii().to_lower().to_char(), '[');
         assert_eq!('`'.to_ascii().to_upper().to_char(), '`');
         assert_eq!('{'.to_ascii().to_upper().to_char(), '{');
+
+        assert!(str::all(~"banana", |c| c.is_ascii()));
+        assert!(! str::all(~"ประเทศไทย中华Việt Nam", |c| c.is_ascii()));
     }
 
     #[test]
@@ -234,6 +238,15 @@ mod tests {
 
         assert_eq!("abCDef&?#".to_ascii().to_lower().to_str_ascii(), ~"abcdef&?#");
         assert_eq!("abCDef&?#".to_ascii().to_upper().to_str_ascii(), ~"ABCDEF&?#");
+
+        assert_eq!("".to_ascii().to_lower().to_str_ascii(), ~"");
+        assert_eq!("YMCA".to_ascii().to_lower().to_str_ascii(), ~"ymca");
+        assert_eq!("abcDEFxyz:.;".to_ascii().to_upper().to_str_ascii(), ~"ABCDEFXYZ:.;");
+
+        assert!("".is_ascii());
+        assert!("a".is_ascii());
+        assert!(!"\u2009".is_ascii());
+
     }
 
     #[test]
diff --git a/src/libcore/unstable/extfmt.rs b/src/libcore/unstable/extfmt.rs
@@ -520,7 +520,13 @@ pub mod rt {
             match cv.ty {
               TyDefault => uint_to_str_prec(u, 10, prec),
               TyHexLower => uint_to_str_prec(u, 16, prec),
-              TyHexUpper => str::to_upper(uint_to_str_prec(u, 16, prec)),
+
+              // FIXME: #4318 Instead of to_ascii and to_str_ascii, could use
+              // to_ascii_consume and to_str_consume to avoid an unnecessary copy.
+              TyHexUpper => {
+                let s = uint_to_str_prec(u, 16, prec);
+                s.to_ascii().to_upper().to_str_ascii()
+              }
               TyBits => uint_to_str_prec(u, 2, prec),
               TyOctal => uint_to_str_prec(u, 8, prec)
             };
diff --git a/src/librustc/driver/driver.rs b/src/librustc/driver/driver.rs
@@ -546,7 +546,11 @@ pub fn build_session_options(binary: @~str,
     let lint_dict = lint::get_lint_dict();
     for lint_levels.each |level| {
         let level_name = lint::level_to_str(*level);
-        let level_short = level_name.substr(0,1).to_upper();
+
+        // FIXME: #4318 Instead of to_ascii and to_str_ascii, could use
+        // to_ascii_consume and to_str_consume to avoid an unnecessary copy.
+        let level_short = level_name.substr(0,1);
+        let level_short = level_short.to_ascii().to_upper().to_str_ascii();
         let flags = vec::append(getopts::opt_strs(matches, level_short),
                                 getopts::opt_strs(matches, level_name));
         for flags.each |lint_name| {
diff --git a/src/librustdoc/markdown_index_pass.rs b/src/librustdoc/markdown_index_pass.rs
@@ -157,7 +157,9 @@ pub fn pandoc_header_id(header: &str) -> ~str {
         let s = str::replace(s, ~" ", ~"-");
         return s;
     }
-    fn convert_to_lowercase(s: &str) -> ~str { str::to_lower(s) }
+    // FIXME: #4318 Instead of to_ascii and to_str_ascii, could use
+    // to_ascii_consume and to_str_consume to avoid an unnecessary copy.
+    fn convert_to_lowercase(s: &str) -> ~str { s.to_ascii().to_lower().to_str_ascii() }
     fn remove_up_to_first_letter(s: &str) -> ~str { s.to_str() }
     fn maybe_use_section_id(s: &str) -> ~str { s.to_str() }
 }
diff --git a/src/libstd/semver.rs b/src/libstd/semver.rs
@@ -220,7 +220,7 @@ fn parse_reader(rdr: @io::Reader) -> Version {
 
 
 pub fn parse(s: &str) -> Option<Version> {
-    if ! str::is_ascii(s) {
+    if !s.is_ascii() {
         return None;
     }
     let s = s.trim();
diff --git a/src/libstd/sort.rs b/src/libstd/sort.rs
@@ -885,8 +885,12 @@ mod tests {
         // tjc: funny that we have to use parens
         fn ile(x: &(&'static str), y: &(&'static str)) -> bool
         {
-            let x = x.to_lower();
-            let y = y.to_lower();
+            // FIXME: #4318 Instead of to_ascii and to_str_ascii, could use
+            // to_ascii_consume and to_str_consume to avoid an unnecessary copy.
+            // (Actually, could just remove the to_str_* call, but needs a deriving(Ord) on
+            // Ascii)
+            let x = x.to_ascii().to_lower().to_str_ascii();
+            let y = y.to_ascii().to_lower().to_str_ascii();
             x <= y
         }
 
diff --git a/src/test/bench/shootout-k-nucleotide-pipes.rs b/src/test/bench/shootout-k-nucleotide-pipes.rs
@@ -59,7 +59,10 @@ fn sort_and_fmt(mm: &HashMap<~[u8], uint>, total: uint) -> ~str {
    for pairs_sorted.each |kv| {
        let (k,v) = copy *kv;
        unsafe {
-           buffer += (fmt!("%s %0.3f\n", str::to_upper(str::raw::from_bytes(k)), v));
+           let b = str::raw::from_bytes(k);
+           // FIXME: #4318 Instead of to_ascii and to_str_ascii, could use
+           // to_ascii_consume and to_str_consume to avoid an unnecessary copy.
+           buffer += (fmt!("%s %0.3f\n", b.to_ascii().to_upper().to_str_ascii(), v));
        }
    }
 
@@ -68,7 +71,9 @@ fn sort_and_fmt(mm: &HashMap<~[u8], uint>, total: uint) -> ~str {
 
 // given a map, search for the frequency of a pattern
 fn find(mm: &HashMap<~[u8], uint>, key: ~str) -> uint {
-   match mm.find(&str::to_bytes(str::to_lower(key))) {
+   // FIXME: #4318 Instead of to_ascii and to_str_ascii, could use
+   // to_ascii_consume and to_str_consume to avoid an unnecessary copy.
+   match mm.find(&str::to_bytes(key.to_ascii().to_lower().to_str_ascii())) {
       option::None      => { return 0u; }
       option::Some(&num) => { return num; }
    }

Original file line number	Diff line number	Diff line change
`@@ -157,7 +157,9 @@ pub fn pandoc_header_id(header: &str) -> ~str {`
`157`	`157`	`let s = str::replace(s, ~" ", ~"-");`
`158`	`158`	`return s;`
`159`	`159`	`}`
`160`		`- fn convert_to_lowercase(s: &str) -> ~str { str::to_lower(s) }`
	`160`	`+ // FIXME: #4318 Instead of to_ascii and to_str_ascii, could use`
	`161`	`+ // to_ascii_consume and to_str_consume to not do a unnecessary copy.`
	`162`	`+ fn convert_to_lowercase(s: &str) -> ~str { s.to_ascii().to_lower().to_str_ascii() }`
`161`	`163`	`fn remove_up_to_first_letter(s: &str) -> ~str { s.to_str() }`
`162`	`164`	`fn maybe_use_section_id(s: &str) -> ~str { s.to_str() }`
`163`	`165`	`}`
Original file line number	Diff line number	Diff line change
`@@ -220,7 +220,7 @@ fn parse_reader(rdr: @io::Reader) -> Version {`
`220`	`220`
`221`	`221`
`222`	`222`	`pub fn parse(s: &str) -> Option<Version> {`
`223`		`- if ! str::is_ascii(s) {`
	`223`	`+ if !s.is_ascii() {`
`224`	`224`	`return None;`
`225`	`225`	`}`
`226`	`226`	`let s = s.trim();`
Original file line number	Diff line number	Diff line change
`@@ -885,8 +885,12 @@ mod tests {`
`885`	`885`	`// tjc: funny that we have to use parens`
`886`	`886`	`fn ile(x: &(&'static str), y: &(&'static str)) -> bool`
`887`	`887`	`{`
`888`		`- let x = x.to_lower();`
`889`		`- let y = y.to_lower();`
	`888`	`+ // FIXME: #4318 Instead of to_ascii and to_str_ascii, could use`
	`889`	`+ // to_ascii_consume and to_str_consume to not do a unnecessary copy.`
	`890`	`+ // (Actually, could just remove the to_str_* call, but needs an deriving(Ord) on`
	`891`	`+ // Ascii)`
	`892`	`+ let x = x.to_ascii().to_lower().to_str_ascii();`
	`893`	`+ let y = y.to_ascii().to_lower().to_str_ascii();`
`890`	`894`	`x <= y`
`891`	`895`	`}`
`892`	`896`