Skip to content

Commit 3eecaeb

Browse files
committed
Improve detection of charset from HTML
HTML allows spaces around the attribute’s equal sign and does not require quoting when special characters are not used. While at it, let’s improve the regular expressions: - Use single quotes to avoid the need to escape everything twice. - Since we are not using solidus, use it for delimiters. This will make Sublime Text highlight the syntax inside the string. - Use named reference for the quote. - Make HTML 4 `content` attribute matching case-insensitive. Ideally, we would enable `PCRE_DUPNAMES` using the `PCRE_INFO_JCHANGED` (`J` modifier) to be able to use the same named capture group in multiple branches, but it does not work with `preg_match` before PHP 7.4.4 due to <https://bugs.php.net/bug.php?id=79257>.
1 parent bdcaf57 commit 3eecaeb

File tree

3 files changed

+246
-4
lines changed

3 files changed

+246
-4
lines changed

NEWS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
## unreleased
44
- Raise minimum PHP version to 7.2.5.
55
- Add support for fossar/transcoder 2.0.
6+
- Improve detection of charset from HTML.
67

78
## 0.2.3 – 2022-01-23
89
- Added support for guzzlehttp/psr7 2.0

src/ContentTypeExtractor.php

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,11 @@ class ContentTypeExtractor {
88
/**
99
* Regex pattern for HTML 4 meta tag – e.g. <meta http-equiv="content-type" content="text/html; charset=ISO-8859-1">.
1010
*/
11-
private const PATTERN_HTML4 = "#<meta[^>]+http-equiv=[\"']?content-type[\"']?[^>]*?>#i";
11+
private const PATTERN_HTML4 = '/<meta[^>]+http-equiv\s*=\s*(?P<quote>["\']?)content-type\g{quote}[^>]*?>/i';
1212
/**
1313
* Regex pattern for HTML 5 meta tag – e.g. <meta charset=iso-8859-1>.
1414
*/
15-
private const PATTERN_HTML5 = "#(?P<before><meta[^>]+?)charset=(?P<quote>[\"'])(?P<charset>[^\"' ]+?)\\2(?P<after>[^>]*?>)#i";
15+
private const PATTERN_HTML5 = '/(?P<before><meta[^>]+?)charset\s*=\s*(?:(?P<quote>["\'])(?P<charset1>[^"\' ]+?)\g{quote}|(?P<charset2>[^"\'=<>`\s]+))(?P<after>[^>]*?>)/iJ';
1616

1717
/**
1818
* Converts the given $content to the $targetEncoding.
@@ -69,7 +69,7 @@ public static function getContentTypeFromHtml(string $content, string $targetEnc
6969

7070
// find http-equiv
7171
if (preg_match(self::PATTERN_HTML4, $content, $match)) {
72-
$pattern = "#(?P<before>.*)content=(?P<quote>[\"'])(?P<content>.*?)\\2(?P<after>.*)#";
72+
$pattern = '/(?P<before>.*)content\s*=\s*(?P<quote>["\'])(?P<content>.*?)\g{quote}(?P<after>.*)/i';
7373
if (preg_match($pattern, $match[0], $innerMatch)) {
7474
$parsed = Utils::splitHttpHeaderWords($innerMatch['content']);
7575
if (\count($parsed) > 0) {
@@ -82,7 +82,7 @@ public static function getContentTypeFromHtml(string $content, string $targetEnc
8282
$replacements[$match[0]] = $newMeta;
8383
}
8484
} elseif (preg_match(self::PATTERN_HTML5, $content, $match)) {
85-
$bodyDeclaredEncoding = $match['charset'];
85+
$bodyDeclaredEncoding = $match['charset1'] . $match['charset2'];
8686
$newMeta = $match['before'] . "charset={$match['quote']}" . $targetEncoding . "{$match['quote']}" . $match['after'];
8787
$replacements[$match[0]] = $newMeta;
8888
}

tests/ContentTypeExtractorTest.php

Lines changed: 241 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,241 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace Fossar\GuzzleTranscoder\Tests;
6+
7+
use Fossar\GuzzleTranscoder\ContentTypeExtractor;
8+
use PHPUnit\Framework\TestCase;
9+
10+
class ContentTypeExtractorTest extends TestCase {
11+
/**
12+
* @dataProvider contentTypes
13+
*
14+
* @param array<string, string> $expectedContentReplacements
15+
*/
16+
public function testGetContentTypeFromHtml(string $html, string $expectedDeclaredEncoding, array $expectedContentReplacements): void {
17+
[$bodyDeclaredEncoding, $contentReplacements] = ContentTypeExtractor::getContentTypeFromHtml($html, 'placeholder-encoding');
18+
$this->assertSame($expectedContentReplacements, $contentReplacements);
19+
$this->assertSame($expectedDeclaredEncoding, $bodyDeclaredEncoding);
20+
}
21+
22+
/**
23+
* @return iterable<string, array{string, string, array<string, string>}>
24+
*/
25+
public function contentTypes(): iterable {
26+
// https://html.spec.whatwg.org/multipage/syntax.html#attributes-2
27+
yield 'HTML5, double quotes' => [
28+
<<<HTML
29+
<meta charset="iso-8859-1">
30+
HTML
31+
,
32+
'iso-8859-1',
33+
[
34+
'<meta charset="iso-8859-1">' => '<meta charset="placeholder-encoding">',
35+
],
36+
];
37+
38+
yield 'HTML5, single quotes' => [
39+
<<<HTML
40+
<meta charset='iso-8859-1'>
41+
HTML
42+
,
43+
'iso-8859-1',
44+
[
45+
"<meta charset='iso-8859-1'>" => "<meta charset='placeholder-encoding'>",
46+
],
47+
];
48+
49+
yield 'HTML5, unquoted' => [
50+
<<<HTML
51+
<meta charset=iso-8859-1>
52+
HTML
53+
,
54+
'iso-8859-1',
55+
[
56+
'<meta charset=iso-8859-1>' => '<meta charset=placeholder-encoding>',
57+
],
58+
];
59+
60+
yield 'HTML5, unquoted, spaces around' => [
61+
<<<HTML
62+
<meta charset = iso-8859-1>
63+
HTML
64+
,
65+
'iso-8859-1',
66+
[
67+
'<meta charset = iso-8859-1>' => '<meta charset=placeholder-encoding>',
68+
],
69+
];
70+
71+
yield 'HTML5, unquoted, extra attributes' => [
72+
<<<HTML
73+
<meta foo charset=iso-8859-1 bar baz="2">
74+
HTML
75+
,
76+
'iso-8859-1',
77+
[
78+
'<meta foo charset=iso-8859-1 bar baz="2">' => '<meta foo charset=placeholder-encoding bar baz="2">',
79+
],
80+
];
81+
82+
yield 'HTML5, random case' => [
83+
<<<HTML
84+
<MeTA chArSEt="ISo-8859-1">
85+
HTML
86+
,
87+
'ISo-8859-1',
88+
[
89+
'<MeTA chArSEt="ISo-8859-1">' => '<MeTA charset="placeholder-encoding">',
90+
],
91+
];
92+
93+
yield '(X)HTML5, unquoted' => [
94+
<<<HTML
95+
<meta charset=iso-8859-1 />
96+
HTML
97+
,
98+
'iso-8859-1',
99+
[
100+
'<meta charset=iso-8859-1 />' => '<meta charset=placeholder-encoding />',
101+
],
102+
];
103+
104+
yield '(X)HTML5, tight' => [
105+
<<<HTML
106+
<meta charset="iso-8859-1"/>
107+
HTML
108+
,
109+
'iso-8859-1',
110+
[
111+
'<meta charset="iso-8859-1"/>' => '<meta charset="placeholder-encoding"/>',
112+
],
113+
];
114+
115+
// If [a solidus in a start tag of a void element is] directly preceded by an unquoted attribute value, it becomes part of the attribute value rather than being discarded by the parser.
116+
// https://html.spec.whatwg.org/multipage/syntax.html#start-tags
117+
yield '(X)HTML5, unquoted, misplaced solidus' => [
118+
<<<HTML
119+
<meta charset=iso-8859-1/>
120+
HTML
121+
,
122+
'iso-8859-1/',
123+
[
124+
'<meta charset=iso-8859-1/>' => '<meta charset=placeholder-encoding>',
125+
],
126+
];
127+
128+
yield 'HTML4, double quotes' => [
129+
<<<HTML
130+
<meta http-equiv="content-type" content="text/html; charset=ISO-8859-1">
131+
HTML
132+
,
133+
'ISO-8859-1',
134+
[
135+
'<meta http-equiv="content-type" content="text/html; charset=ISO-8859-1">' => '<meta http-equiv="content-type" content="text/html; charset=placeholder-encoding">',
136+
],
137+
];
138+
139+
yield 'HTML4, double quotes, other way around' => [
140+
<<<HTML
141+
<meta content="text/html; charset=ISO-8859-1" http-equiv="content-type">
142+
HTML
143+
,
144+
'ISO-8859-1',
145+
[
146+
'<meta content="text/html; charset=ISO-8859-1" http-equiv="content-type">' => '<meta content="text/html; charset=placeholder-encoding" http-equiv="content-type">',
147+
],
148+
];
149+
150+
yield 'HTML4, double quotes, extra attributes, other way around' => [
151+
<<<HTML
152+
<meta foo="bar" content="text/html; charset=ISO-8859-1" test middle http-equiv="content-type" after='something'>
153+
HTML
154+
,
155+
'ISO-8859-1',
156+
[
157+
'<meta foo="bar" content="text/html; charset=ISO-8859-1" test middle http-equiv="content-type" after=\'something\'>' => '<meta foo="bar" content="text/html; charset=placeholder-encoding" test middle http-equiv="content-type" after=\'something\'>',
158+
],
159+
];
160+
161+
yield 'HTML4, single quotes' => [
162+
<<<HTML
163+
<meta http-equiv='content-type' content='text/html; charset=ISO-8859-1'>
164+
HTML
165+
,
166+
'ISO-8859-1',
167+
[
168+
"<meta http-equiv='content-type' content='text/html; charset=ISO-8859-1'>" => "<meta http-equiv='content-type' content='text/html; charset=placeholder-encoding'>",
169+
],
170+
];
171+
172+
yield 'HTML4, unquoted+single quotes' => [
173+
<<<HTML
174+
<meta http-equiv=content-type content='text/html; charset=ISO-8859-1'>
175+
HTML
176+
,
177+
'ISO-8859-1',
178+
[
179+
"<meta http-equiv=content-type content='text/html; charset=ISO-8859-1'>" => "<meta http-equiv=content-type content='text/html; charset=placeholder-encoding'>",
180+
],
181+
];
182+
183+
// https://httpwg.org/specs/rfc9110.html#field.content-type
184+
yield 'HTML4, internally quoted, extra parameters' => [
185+
<<<HTML
186+
<meta http-equiv=content-type content='text/html;foo;charset="ISO-8859-1";bar'>
187+
HTML
188+
,
189+
'ISO-8859-1',
190+
[
191+
"<meta http-equiv=content-type content='text/html;foo;charset=\"ISO-8859-1\";bar'>" => "<meta http-equiv=content-type content='text/html; foo; charset=placeholder-encoding; bar'>",
192+
],
193+
];
194+
195+
yield 'HTML4, single quotes+double quotes+spaces around' => [
196+
<<<HTML
197+
<meta http-equiv = "content-type" content = "text/html; charset=ISO-8859-1">
198+
HTML
199+
,
200+
'ISO-8859-1',
201+
[
202+
'<meta http-equiv = "content-type" content = "text/html; charset=ISO-8859-1">' => '<meta http-equiv = "content-type" content="text/html; charset=placeholder-encoding">',
203+
],
204+
];
205+
206+
yield 'HTML4, random case' => [
207+
<<<HTML
208+
<meTA HTTp-EQuIv="conTeNt-TYpe" CoNTeNt="text/Html; cHArSeT=ISO-8859-1">
209+
HTML
210+
,
211+
'ISO-8859-1',
212+
[
213+
'<meTA HTTp-EQuIv="conTeNt-TYpe" CoNTeNt="text/Html; cHArSeT=ISO-8859-1">' => '<meTA HTTp-EQuIv="conTeNt-TYpe" content="text/Html; cHArSeT=placeholder-encoding">',
214+
],
215+
];
216+
217+
yield '(X)HTML4' => [
218+
<<<HTML
219+
<meta http-equiv="content-type" content="text/html; charset=ISO-8859-1"/>
220+
HTML
221+
,
222+
'ISO-8859-1',
223+
[
224+
'<meta http-equiv="content-type" content="text/html; charset=ISO-8859-1"/>' => '<meta http-equiv="content-type" content="text/html; charset=placeholder-encoding"/>',
225+
],
226+
];
227+
228+
yield 'multiple declarations' => [
229+
<<<HTML
230+
<meta http-equiv="content-type" test content="text/html; charset=ISO-8859-1">
231+
<meta http-equiv="content-type" content="text/html; charset=ISO-8859-1">
232+
<meta charset=UTF-8>
233+
HTML
234+
,
235+
'ISO-8859-1',
236+
[
237+
'<meta http-equiv="content-type" test content="text/html; charset=ISO-8859-1">' => '<meta http-equiv="content-type" test content="text/html; charset=placeholder-encoding">',
238+
],
239+
];
240+
}
241+
}

0 commit comments

Comments
 (0)