@@ -275,20 +275,69 @@ mod prim_bool {}
275
275
mod prim_never { }
276
276
277
277
#[ doc( primitive = "char" ) ]
278
+ #[ allow( rustdoc:: invalid_rust_codeblocks) ]
278
279
/// A character type.
279
280
///
280
281
/// The `char` type represents a single character. More specifically, since
281
282
/// 'character' isn't a well-defined concept in Unicode, `char` is a '[Unicode
282
- /// scalar value]', which is similar to, but not the same as, a '[Unicode code
283
- /// point]'.
284
- ///
285
- /// [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value
286
- /// [Unicode code point]: https://www.unicode.org/glossary/#code_point
283
+ /// scalar value]'.
287
284
///
288
285
/// This documentation describes a number of methods and trait implementations on the
289
286
/// `char` type. For technical reasons, there is additional, separate
290
287
/// documentation in [the `std::char` module](char/index.html) as well.
291
288
///
289
+ /// # Validity
290
+ ///
291
+ /// A `char` is a '[Unicode scalar value]', which is any '[Unicode code point]'
292
+ /// other than a [surrogate code point]. This has a fixed numerical definition:
293
+ /// code points are in the range 0 to 0x10FFFF, inclusive.
294
+ /// Surrogate code points, used by UTF-16, are in the range 0xD800 to 0xDFFF.
295
+ ///
296
+ /// No `char` may be constructed, whether as a literal or at runtime, that is not a
297
+ /// Unicode scalar value:
298
+ ///
299
+ /// ```compile_fail
300
+ /// // Each of these is a compiler error
301
+ /// ['\u{D800}', '\u{DFFF}', '\u{110000}'];
302
+ /// ```
303
+ ///
304
+ /// ```should_panic
305
+ /// // Panics; from_u32 returns None.
306
+ /// char::from_u32(0xDE01).unwrap();
307
+ /// ```
308
+ ///
309
+ /// ```no_run
310
+ /// // Undefined behavior
311
+ /// unsafe { char::from_u32_unchecked(0x110000) };
312
+ /// ```
313
+ ///
314
+ /// Unicode scalar values (USVs) are also the exact set of values that may be encoded in
315
+ /// UTF-8. Because `char` values are USVs and `str` values are valid UTF-8, it is safe to
316
+ /// store any `char` in a `str` or read any character from a `str` as a `char`.
317
+ ///
318
+ /// The gap in valid `char` values is understood by the compiler, so in the
319
+ /// example below the two ranges are understood to cover the whole range of
320
+ /// possible `char` values and there is no error for a [non-exhaustive match].
321
+ ///
322
+ /// ```
323
+ /// let c: char = 'a';
324
+ /// match c {
325
+ /// '\0' ..= '\u{D7FF}' => false,
326
+ /// '\u{E000}' ..= '\u{10FFFF}' => true,
327
+ /// };
328
+ /// ```
329
+ ///
330
+ /// All USVs are valid `char` values, but not all of them represent a real
331
+ /// character. Many USVs are not currently assigned to a character, but may be
332
+ /// in the future ("reserved"); some will never be a character
333
+ /// ("noncharacters"); and some may be given different meanings by different
334
+ /// users ("private use").
335
+ ///
336
+ /// [Unicode code point]: https://www.unicode.org/glossary/#code_point
337
+ /// [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value
338
+ /// [non-exhaustive match]: ../book/ch06-02-match.html#matches-are-exhaustive
339
+ /// [surrogate code point]: https://www.unicode.org/glossary/#surrogate_code_point
340
+ ///
292
341
/// # Representation
293
342
///
294
343
/// `char` is always four bytes in size. This is a different representation than
0 commit comments