3232#include " unicode/normalizer2.h"
3333#include " unicode/numfmt.h"
3434#include " unicode/numsys.h"
35- #include " unicode/regex.h"
36- #include " unicode/smpdtfmt.h"
3735#include " unicode/timezone.h"
38- #include " unicode/ucol.h"
39- #include " unicode/ures.h"
4036#include " unicode/ustring.h"
41- #include " unicode/uvernum.h"
42- #include " unicode/uversion.h"
4337
4438namespace v8 {
4539namespace internal {
@@ -573,124 +567,6 @@ MaybeHandle<Object> Intl::LegacyUnwrapReceiver(Isolate* isolate,
573567 return receiver;
574568}
575569
576- namespace {
577-
578- #if USE_CHROMIUM_ICU == 0 && U_ICU_VERSION_MAJOR_NUM < 63
579- // Define general regexp macros.
580- // Note "(?:" means the regexp group a non-capture group.
581- #define REGEX_ALPHA " [a-z]"
582- #define REGEX_DIGIT " [0-9]"
583- #define REGEX_ALPHANUM " (?:" REGEX_ALPHA " |" REGEX_DIGIT " )"
584-
585- void BuildLanguageTagRegexps (Isolate* isolate) {
586- // Define the language tag regexp macros.
587- // For info on BCP 47 see https://tools.ietf.org/html/bcp47 .
588- // Because language tags are case insensitive per BCP 47 2.1.1 and regexp's
589- // defined below will always be used after lowercasing the input, uppercase
590- // ranges in BCP 47 2.1 are dropped and grandfathered tags are all lowercased.
591- // clang-format off
592- #define BCP47_REGULAR \
593- " (?:art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|" \
594- " zh-min|zh-min-nan|zh-xiang)"
595- #define BCP47_IRREGULAR \
596- " (?:en-gb-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|" \
597- " i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|" \
598- " i-tsu|sgn-be-fr|sgn-be-nl|sgn-ch-de)"
599- #define BCP47_GRANDFATHERED " (?:" BCP47_IRREGULAR " |" BCP47_REGULAR " )"
600- #define BCP47_PRIVATE_USE " (?:x(?:-" REGEX_ALPHANUM " {1,8})+)"
601-
602- #define BCP47_SINGLETON " (?:" REGEX_DIGIT " |" " [a-wy-z])"
603-
604- #define BCP47_EXTENSION " (?:" BCP47_SINGLETON " (?:-" REGEX_ALPHANUM " {2,8})+)"
605- #define BCP47_VARIANT \
606- " (?:" REGEX_ALPHANUM " {5,8}" " |" " (?:" REGEX_DIGIT REGEX_ALPHANUM " {3}))"
607-
608- #define BCP47_REGION " (?:" REGEX_ALPHA " {2}" " |" REGEX_DIGIT " {3})"
609- #define BCP47_SCRIPT " (?:" REGEX_ALPHA " {4})"
610- #define BCP47_EXT_LANG " (?:" REGEX_ALPHA " {3}(?:-" REGEX_ALPHA " {3}){0,2})"
611- #define BCP47_LANGUAGE " (?:" REGEX_ALPHA " {2,3}(?:-" BCP47_EXT_LANG " )?" \
612- " |" REGEX_ALPHA " {4}" " |" REGEX_ALPHA " {5,8})"
613- #define BCP47_LANG_TAG \
614- BCP47_LANGUAGE \
615- " (?:-" BCP47_SCRIPT " )?" \
616- " (?:-" BCP47_REGION " )?" \
617- " (?:-" BCP47_VARIANT " )*" \
618- " (?:-" BCP47_EXTENSION " )*" \
619- " (?:-" BCP47_PRIVATE_USE " )?"
620- // clang-format on
621-
622- constexpr char kLanguageTagSingletonRegexp [] = " ^" BCP47_SINGLETON " $" ;
623- constexpr char kLanguageTagVariantRegexp [] = " ^" BCP47_VARIANT " $" ;
624- constexpr char kLanguageTagRegexp [] =
625- " ^(?:" BCP47_LANG_TAG " |" BCP47_PRIVATE_USE " |" BCP47_GRANDFATHERED " )$" ;
626-
627- UErrorCode status = U_ZERO_ERROR;
628- icu::RegexMatcher* language_singleton_regexp_matcher = new icu::RegexMatcher (
629- icu::UnicodeString (kLanguageTagSingletonRegexp , -1 , US_INV), 0 , status);
630- icu::RegexMatcher* language_tag_regexp_matcher = new icu::RegexMatcher (
631- icu::UnicodeString (kLanguageTagRegexp , -1 , US_INV), 0 , status);
632- icu::RegexMatcher* language_variant_regexp_matcher = new icu::RegexMatcher (
633- icu::UnicodeString (kLanguageTagVariantRegexp , -1 , US_INV), 0 , status);
634- CHECK (U_SUCCESS (status));
635-
636- isolate->set_language_tag_regexp_matchers (language_singleton_regexp_matcher,
637- language_tag_regexp_matcher,
638- language_variant_regexp_matcher);
639- // Undefine the language tag regexp macros.
640- #undef BCP47_EXTENSION
641- #undef BCP47_EXT_LANG
642- #undef BCP47_GRANDFATHERED
643- #undef BCP47_IRREGULAR
644- #undef BCP47_LANG_TAG
645- #undef BCP47_LANGUAGE
646- #undef BCP47_PRIVATE_USE
647- #undef BCP47_REGION
648- #undef BCP47_REGULAR
649- #undef BCP47_SCRIPT
650- #undef BCP47_SINGLETON
651- #undef BCP47_VARIANT
652- }
653-
654- // Undefine the general regexp macros.
655- #undef REGEX_ALPHA
656- #undef REGEX_DIGIT
657- #undef REGEX_ALPHANUM
658-
659- icu::RegexMatcher* GetLanguageSingletonRegexMatcher (Isolate* isolate) {
660- icu::RegexMatcher* language_singleton_regexp_matcher =
661- isolate->language_singleton_regexp_matcher ();
662- if (language_singleton_regexp_matcher == nullptr ) {
663- BuildLanguageTagRegexps (isolate);
664- language_singleton_regexp_matcher =
665- isolate->language_singleton_regexp_matcher ();
666- }
667- return language_singleton_regexp_matcher;
668- }
669-
670- icu::RegexMatcher* GetLanguageTagRegexMatcher (Isolate* isolate) {
671- icu::RegexMatcher* language_tag_regexp_matcher =
672- isolate->language_tag_regexp_matcher ();
673- if (language_tag_regexp_matcher == nullptr ) {
674- BuildLanguageTagRegexps (isolate);
675- language_tag_regexp_matcher = isolate->language_tag_regexp_matcher ();
676- }
677- return language_tag_regexp_matcher;
678- }
679-
680- icu::RegexMatcher* GetLanguageVariantRegexMatcher (Isolate* isolate) {
681- icu::RegexMatcher* language_variant_regexp_matcher =
682- isolate->language_variant_regexp_matcher ();
683- if (language_variant_regexp_matcher == nullptr ) {
684- BuildLanguageTagRegexps (isolate);
685- language_variant_regexp_matcher =
686- isolate->language_variant_regexp_matcher ();
687- }
688- return language_variant_regexp_matcher;
689- }
690- #endif // USE_CHROMIUM_ICU == 0 && U_ICU_VERSION_MAJOR_NUM < 63
691-
692- } // anonymous namespace
693-
694570Maybe<bool > Intl::GetStringOption (Isolate* isolate, Handle<JSReceiver> options,
695571 const char * property,
696572 std::vector<const char *> values,
@@ -776,111 +652,6 @@ char AsciiToLower(char c) {
776652 return c | (1 << 5 );
777653}
778654
779- #if USE_CHROMIUM_ICU == 0 && U_ICU_VERSION_MAJOR_NUM < 63
780- /* *
781- * Check the structural Validity of the language tag per ECMA 402 6.2.2:
782- * - Well-formed per RFC 5646 2.1
783- * - There are no duplicate variant subtags
784- * - There are no duplicate singleton (extension) subtags
785- *
786- * One extra-check is done (from RFC 5646 2.2.9): the tag is compared
787- * against the list of grandfathered tags. However, subtags for
788- * primary/extended language, script, region, variant are not checked
789- * against the IANA language subtag registry.
790- *
791- * ICU 62 or earlier is too permissible and lets invalid tags, like
792- * hant-cmn-cn, through.
793- *
794- * Returns false if the language tag is invalid.
795- */
796- bool IsStructurallyValidLanguageTag (Isolate* isolate,
797- const std::string& locale_in) {
798- if (!String::IsAscii (locale_in.c_str (),
799- static_cast <int >(locale_in.length ()))) {
800- return false ;
801- }
802- std::string locale (locale_in);
803- icu::RegexMatcher* language_tag_regexp_matcher =
804- GetLanguageTagRegexMatcher (isolate);
805-
806- // Check if it's well-formed, including grandfathered tags.
807- icu::UnicodeString locale_uni (locale.c_str (), -1 , US_INV);
808- // Note: icu::RegexMatcher::reset does not make a copy of the input string
809- // so cannot use a temp value; ie: cannot create it as a call parameter.
810- language_tag_regexp_matcher->reset (locale_uni);
811- UErrorCode status = U_ZERO_ERROR;
812- bool is_valid_lang_tag = language_tag_regexp_matcher->matches (status);
813- if (!is_valid_lang_tag || V8_UNLIKELY (U_FAILURE (status))) {
814- return false ;
815- }
816-
817- // Just return if it's a x- form. It's all private.
818- if (locale.find (" x-" ) == 0 ) {
819- return true ;
820- }
821-
822- // Check if there are any duplicate variants or singletons (extensions).
823-
824- // Remove private use section.
825- locale = locale.substr (0 , locale.find (" -x-" ));
826-
827- // Skip language since it can match variant regex, so we start from 1.
828- // We are matching i-klingon here, but that's ok, since i-klingon-klingon
829- // is not valid and would fail LANGUAGE_TAG_RE test.
830- size_t pos = 0 ;
831- std::vector<std::string> parts;
832- while ((pos = locale.find (' -' )) != std::string::npos) {
833- std::string token = locale.substr (0 , pos);
834- parts.push_back (token);
835- locale = locale.substr (pos + 1 );
836- }
837- if (locale.length () != 0 ) {
838- parts.push_back (locale);
839- }
840-
841- icu::RegexMatcher* language_variant_regexp_matcher =
842- GetLanguageVariantRegexMatcher (isolate);
843-
844- icu::RegexMatcher* language_singleton_regexp_matcher =
845- GetLanguageSingletonRegexMatcher (isolate);
846-
847- std::vector<std::string> variants;
848- std::vector<std::string> extensions;
849- for (auto it = parts.begin () + 1 ; it != parts.end (); it++) {
850- icu::UnicodeString part (it->data (), -1 , US_INV);
851- language_variant_regexp_matcher->reset (part);
852- bool is_language_variant = language_variant_regexp_matcher->matches (status);
853- if (V8_UNLIKELY (U_FAILURE (status))) {
854- return false ;
855- }
856- if (is_language_variant && extensions.size () == 0 ) {
857- if (std::find (variants.begin (), variants.end (), *it) == variants.end ()) {
858- variants.push_back (*it);
859- } else {
860- return false ;
861- }
862- }
863-
864- language_singleton_regexp_matcher->reset (part);
865- bool is_language_singleton =
866- language_singleton_regexp_matcher->matches (status);
867- if (V8_UNLIKELY (U_FAILURE (status))) {
868- return false ;
869- }
870- if (is_language_singleton) {
871- if (std::find (extensions.begin (), extensions.end (), *it) ==
872- extensions.end ()) {
873- extensions.push_back (*it);
874- } else {
875- return false ;
876- }
877- }
878- }
879-
880- return true ;
881- }
882- #endif // USE_CHROMIUM_ICU == 0 && U_ICU_VERSION_MAJOR_NUM < 63
883-
// Returns true iff |c| is an ASCII lowercase letter, i.e. lies in the
// inclusive range 'a'..'z'.
bool IsLowerAscii(char c) { return c >= 'a' && c <= 'z'; }
885656
886657bool IsTwoLetterLanguage (const std::string& locale) {
@@ -953,15 +724,6 @@ Maybe<std::string> Intl::CanonicalizeLanguageTag(Isolate* isolate,
953724 // the input before any more check.
954725 std::transform (locale.begin (), locale.end (), locale.begin (), AsciiToLower);
955726
956- #if USE_CHROMIUM_ICU == 0 && U_ICU_VERSION_MAJOR_NUM < 63
957- if (!IsStructurallyValidLanguageTag (isolate, locale)) {
958- THROW_NEW_ERROR_RETURN_VALUE (
959- isolate,
960- NewRangeError (MessageTemplate::kInvalidLanguageTag , locale_str),
961- Nothing<std::string>());
962- }
963- #endif
964-
965727 // ICU maps a few grandfathered tags to what looks like a regular language
966728 // tag even though IANA language tag registry does not have a preferred
967729 // entry map for them. Return them as they're with lowercasing.
@@ -986,9 +748,7 @@ Maybe<std::string> Intl::CanonicalizeLanguageTag(Isolate* isolate,
986748 uloc_forLanguageTag (locale.c_str (), icu_result, ULOC_FULLNAME_CAPACITY,
987749 &parsed_length, &error);
988750 if (U_FAILURE (error) ||
989- #if USE_CHROMIUM_ICU == 1 || U_ICU_VERSION_MAJOR_NUM >= 63
990751 static_cast <size_t >(parsed_length) < locale.length () ||
991- #endif
992752 error == U_STRING_NOT_TERMINATED_WARNING) {
993753 THROW_NEW_ERROR_RETURN_VALUE (
994754 isolate,
0 commit comments