Make WordPress Core

Changeset 58147

Timestamp:
05/14/2024 06:03:43 PM (2 months ago)
Author:
dmsnell
Message:

Normalize UTF-8 charset slug detection.

There are several existing places in Core that attempt to detect if a blog charset
is UTF-8. Each place attempts to perform the same check, except the logic is
spread throughout and there's no single method provided to make this
determination in a consistent way. The _canonical_charset() method exists,
but it is marked as private and is not intended for general use.

In this patch the new unicode module provides is_utf8_charset() as a method
taking an optional charset slug and indicating if it represents UTF-8,
examining all of the allowable variants of that slug. Associated code is
updated to use this new function, including _canonical_charset(). If no slug
is provided, it will look up the current get_option( 'blog_charset' ).

Finally, the test functions governing _canonical_charset() have been
rewritten as a single test with a data provider instead of as separate test
functions.

Developed in https://github.com/WordPress/wordpress-develop/pull/6535
Discussed in https://core.trac.wordpress.org/ticket/61182

Fixes #61182.
Props dmsnell, jonsurrell.

Location:
trunk
Files:
7 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-admin/options-reading.php

    r57797 r58147  
    6565settings_fields( 'reading' );
    6666
    67 if ( ! in_array( get_option( 'blog_charset' ), array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true ) ) {
     67if ( ! is_utf8_charset() ) {
    6868    add_settings_field( 'blog_charset', __( 'Encoding for pages and feeds' ), 'options_reading_blog_charset', 'reading', 'default', array( 'label_for' => 'blog_charset' ) );
    6969}
  • trunk/src/wp-admin/options.php

    r58140 r58147  
    161161$mail_options = array( 'mailserver_url', 'mailserver_port', 'mailserver_login', 'mailserver_pass' );
    162162
    163 if ( ! in_array( get_option( 'blog_charset' ), array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true ) ) {
     163if ( ! is_utf8_charset() ) {
    164164    $allowed_options['reading'][] = 'blog_charset';
    165165}
  • trunk/src/wp-includes/compat.php

    r57985 r58147  
    9292     * charset just use built-in substr().
    9393     */
    94     if ( ! in_array( $encoding, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true ) ) {
     94    if ( ! is_utf8_charset( $encoding ) ) {
    9595        return is_null( $length ) ? substr( $str, $start ) : substr( $str, $start, $length );
    9696    }
     
    177177     * just use built-in strlen().
    178178     */
    179     if ( ! in_array( $encoding, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true ) ) {
     179    if ( ! is_utf8_charset( $encoding ) ) {
    180180        return strlen( $str );
    181181    }
  • trunk/src/wp-includes/formatting.php

    r57910 r58147  
    961961    }
    962962
    963     // Store the site charset as a static to avoid multiple calls to wp_load_alloptions().
    964     if ( ! $charset ) {
    965         static $_charset = null;
    966         if ( ! isset( $_charset ) ) {
    967             $alloptions = wp_load_alloptions();
    968             $_charset   = isset( $alloptions['blog_charset'] ) ? $alloptions['blog_charset'] : '';
    969         }
    970         $charset = $_charset;
    971     }
    972 
    973     if ( in_array( $charset, array( 'utf8', 'utf-8', 'UTF8' ), true ) ) {
    974         $charset = 'UTF-8';
    975     }
     963    $charset = _canonical_charset( $charset ? $charset : get_option( 'blog_charset' ) );
    976964
    977965    $_quote_style = $quote_style;
     
    11151103    static $is_utf8 = null;
    11161104    if ( ! isset( $is_utf8 ) ) {
    1117         $is_utf8 = in_array( get_option( 'blog_charset' ), array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true );
     1105        $is_utf8 = is_utf8_charset();
    11181106    }
    11191107    if ( ! $is_utf8 ) {
  • trunk/src/wp-includes/functions.php

    r58130 r58147  
    74757475 * @see https://core.trac.wordpress.org/ticket/23688
    74767476 *
    7477  * @param string $charset A charset name.
     7477 * @param string $charset A charset name.
    74787478 * @return string The canonical form of the charset.
    74797479 */
    74807480function _canonical_charset( $charset ) {
    7481     if ( 'utf-8' === strtolower( $charset ) || 'utf8' === strtolower( $charset ) ) {
    7482 
     7481    if ( is_utf8_charset( $charset ) ) {
    74837482        return 'UTF-8';
    74847483    }
    74857484
    7486     if ( 'iso-8859-1' === strtolower( $charset ) || 'iso8859-1' === strtolower( $charset ) ) {
    7487 
     7485    /*
     7486     * Normalize the ISO-8859-1 family of languages.
     7487     *
     7488     * This is not required for htmlspecialchars(), as it properly recognizes all of
     7489     * the input character sets that here are transformed into "ISO-8859-1".
     7490     *
     7491     * @todo Should this entire check be removed since it's not required for the stated purpose?
     7492     * @todo Should WordPress transform other potential charset equivalents, such as "latin1"?
     7493     */
     7494    if (
     7495        ( 0 === strcasecmp( 'iso-8859-1', $charset ) ) ||
     7496        ( 0 === strcasecmp( 'iso8859-1', $charset ) )
     7497    ) {
    74887498        return 'ISO-8859-1';
    74897499    }
  • trunk/src/wp-settings.php

    r57748 r58147  
    107107
    108108// Load early WordPress files.
     109require ABSPATH . WPINC . '/unicode.php';
    109110require ABSPATH . WPINC . '/class-wp-list-util.php';
    110111require ABSPATH . WPINC . '/formatting.php';
  • trunk/tests/phpunit/tests/functions/canonicalCharset.php

    r56971 r58147  
    1111 */
    1212class Tests_Functions_CanonicalCharset extends WP_UnitTestCase {
    13 
    14     public function test_utf_8_lower() {
    15         $this->assertSame( 'UTF-8', _canonical_charset( 'utf-8' ) );
     13    /**
     14     * Ensures that charset variants for common encodings normalize to the expected form.
     15     *
     16     * @ticket 61182
     17     *
     18     * @dataProvider data_charset_normalizations
     19     *
     20     * @param string $given_charset      Potential charset provided by user.
     21     * @param string $normalized_charset Expected normalized form of charset.
     22     */
     23    public function test_properly_normalizes_charset_variants( $given_charset, $normalized_charset ) {
     24        $this->assertSame(
     25            $normalized_charset,
     26            _canonical_charset( $given_charset ),
     27            'Did not properly transform the provided charset into its normalized form.'
     28        );
    1629    }
    1730
    18     public function test_utf_8_upper() {
    19         $this->assertSame( 'UTF-8', _canonical_charset( 'UTF-8' ) );
    20     }
     31    /**
     32     * Data provider.
     33     *
     34     * @return array[].
     35     */
     36    public static function data_charset_normalizations() {
     37        return array(
     38            // UTF-8 family.
     39            array( 'UTF-8', 'UTF-8' ),
     40            array( 'Utf-8', 'UTF-8' ),
     41            array( 'Utf-8', 'UTF-8' ),
     42            array( 'UTF8', 'UTF-8' ),
    2143
    22     public function test_utf_8_mixxed() {
    23         $this->assertSame( 'UTF-8', _canonical_charset( 'Utf-8' ) );
    24     }
     44            // Almost UTF-8.
     45            array( 'UTF-8*', 'UTF-8*' ),
     46            array( 'UTF.8', 'UTF.8' ),
     47            array( 'UTF88', 'UTF88' ),
     48            array( 'UTF-7', 'UTF-7' ),
     49            array( 'X-UTF-8', 'X-UTF-8' ),
    2550
    26     public function test_utf_8() {
    27         $this->assertSame( 'UTF-8', _canonical_charset( 'UTF8' ) );
    28     }
     51            // ISO-8859-1 family.
     52            array( 'iso-8859-1', 'ISO-8859-1' ),
     53            array( 'ISO-8859-1', 'ISO-8859-1' ),
     54            array( 'Iso-8859-1', 'ISO-8859-1' ),
     55            array( 'ISO8859-1', 'ISO-8859-1' ),
    2956
    30     public function test_iso_lower() {
    31         $this->assertSame( 'ISO-8859-1', _canonical_charset( 'iso-8859-1' ) );
    32     }
    33 
    34     public function test_iso_upper() {
    35         $this->assertSame( 'ISO-8859-1', _canonical_charset( 'ISO-8859-1' ) );
    36     }
    37 
    38     public function test_iso_mixxed() {
    39         $this->assertSame( 'ISO-8859-1', _canonical_charset( 'Iso8859-1' ) );
    40     }
    41 
    42     public function test_iso() {
    43         $this->assertSame( 'ISO-8859-1', _canonical_charset( 'ISO8859-1' ) );
    44     }
    45 
    46     public function test_random() {
    47         $this->assertSame( 'random', _canonical_charset( 'random' ) );
    48     }
    49 
    50     public function test_empty() {
    51         $this->assertSame( '', _canonical_charset( '' ) );
     57            // Other charset slugs should not be adjusted.
     58            array( 'random', 'random' ),
     59            array( '', '' ),
     60        );
    5261    }
    5362
Note: See TracChangeset for help on using the changeset viewer.