|
| 1 | +<?php |
| 2 | + |
| 3 | +declare(strict_types=1); |
| 4 | + |
| 5 | +namespace Fossar\GuzzleTranscoder\Tests; |
| 6 | + |
| 7 | +use Fossar\GuzzleTranscoder\ContentTypeExtractor; |
| 8 | +use PHPUnit\Framework\TestCase; |
| 9 | + |
/**
 * Tests for ContentTypeExtractor::getContentTypeFromHtml().
 *
 * Each fixture is an HTML fragment containing one or more <meta> encoding
 * declarations; the test checks both the encoding that is reported and the
 * tag rewrites that are proposed for swapping in the placeholder encoding.
 */
class ContentTypeExtractorTest extends TestCase {
    /**
     * Runs the extractor on one fixture and compares both elements of its result.
     *
     * The extractor is called with the literal 'placeholder-encoding', which is
     * the value the expected replacement tags carry in place of the original
     * charset.
     *
     * @dataProvider contentTypes
     *
     * @param string $html HTML fragment handed to the extractor
     * @param string $expectedDeclaredEncoding expected first element of the result (encoding exactly as written in the fixture)
     * @param array<string, string> $expectedContentReplacements expected second element of the result (original tag => rewritten tag)
     */
    public function testGetContentTypeFromHtml(string $html, string $expectedDeclaredEncoding, array $expectedContentReplacements): void {
        [$bodyDeclaredEncoding, $contentReplacements] = ContentTypeExtractor::getContentTypeFromHtml($html, 'placeholder-encoding');
        $this->assertSame($expectedContentReplacements, $contentReplacements);
        $this->assertSame($expectedDeclaredEncoding, $bodyDeclaredEncoding);
    }

    /**
     * Supplies named fixtures as [html, expected declared encoding, expected replacements].
     *
     * Declared static because PHPUnit 10 deprecates non-static data providers
     * and PHPUnit 11 no longer supports them; static providers are also
     * accepted by earlier PHPUnit versions.
     *
     * The heredoc closing markers are deliberately kept at column 0 so that
     * the fixtures retain their leading indentation.
     *
     * @return iterable<string, array{string, string, array<string, string>}>
     */
    public static function contentTypes(): iterable {
        // https://html.spec.whatwg.org/multipage/syntax.html#attributes-2
        yield 'HTML5, double quotes' => [
            <<<HTML
                <meta charset="iso-8859-1">
HTML
            ,
            'iso-8859-1',
            [
                '<meta charset="iso-8859-1">' => '<meta charset="placeholder-encoding">',
            ],
        ];

        yield 'HTML5, single quotes' => [
            <<<HTML
                <meta charset='iso-8859-1'>
HTML
            ,
            'iso-8859-1',
            [
                "<meta charset='iso-8859-1'>" => "<meta charset='placeholder-encoding'>",
            ],
        ];

        yield 'HTML5, unquoted' => [
            <<<HTML
                <meta charset=iso-8859-1>
HTML
            ,
            'iso-8859-1',
            [
                '<meta charset=iso-8859-1>' => '<meta charset=placeholder-encoding>',
            ],
        ];

        // The rewritten attribute is expected without the spaces around `=`.
        yield 'HTML5, unquoted, spaces around' => [
            <<<HTML
                <meta charset = iso-8859-1>
HTML
            ,
            'iso-8859-1',
            [
                '<meta charset = iso-8859-1>' => '<meta charset=placeholder-encoding>',
            ],
        ];

        yield 'HTML5, unquoted, extra attributes' => [
            <<<HTML
                <meta foo charset=iso-8859-1 bar baz="2">
HTML
            ,
            'iso-8859-1',
            [
                '<meta foo charset=iso-8859-1 bar baz="2">' => '<meta foo charset=placeholder-encoding bar baz="2">',
            ],
        ];

        // The encoding is reported with its original case; the rewritten
        // attribute name is expected in lower case while the tag name is kept.
        yield 'HTML5, random case' => [
            <<<HTML
                <MeTA chArSEt="ISo-8859-1">
HTML
            ,
            'ISo-8859-1',
            [
                '<MeTA chArSEt="ISo-8859-1">' => '<MeTA charset="placeholder-encoding">',
            ],
        ];

        yield '(X)HTML5, unquoted' => [
            <<<HTML
                <meta charset=iso-8859-1 />
HTML
            ,
            'iso-8859-1',
            [
                '<meta charset=iso-8859-1 />' => '<meta charset=placeholder-encoding />',
            ],
        ];

        yield '(X)HTML5, tight' => [
            <<<HTML
                <meta charset="iso-8859-1"/>
HTML
            ,
            'iso-8859-1',
            [
                '<meta charset="iso-8859-1"/>' => '<meta charset="placeholder-encoding"/>',
            ],
        ];

        // If [a solidus in a start tag of a void element is] directly preceded by an unquoted attribute value, it becomes part of the attribute value rather than being discarded by the parser.
        // https://html.spec.whatwg.org/multipage/syntax.html#start-tags
        // Hence the reported encoding ends with `/` and the rewritten tag has no solidus.
        yield '(X)HTML5, unquoted, misplaced solidus' => [
            <<<HTML
                <meta charset=iso-8859-1/>
HTML
            ,
            'iso-8859-1/',
            [
                '<meta charset=iso-8859-1/>' => '<meta charset=placeholder-encoding>',
            ],
        ];

        yield 'HTML4, double quotes' => [
            <<<HTML
                <meta http-equiv="content-type" content="text/html; charset=ISO-8859-1">
HTML
            ,
            'ISO-8859-1',
            [
                '<meta http-equiv="content-type" content="text/html; charset=ISO-8859-1">' => '<meta http-equiv="content-type" content="text/html; charset=placeholder-encoding">',
            ],
        ];

        yield 'HTML4, double quotes, other way around' => [
            <<<HTML
                <meta content="text/html; charset=ISO-8859-1" http-equiv="content-type">
HTML
            ,
            'ISO-8859-1',
            [
                '<meta content="text/html; charset=ISO-8859-1" http-equiv="content-type">' => '<meta content="text/html; charset=placeholder-encoding" http-equiv="content-type">',
            ],
        ];

        yield 'HTML4, double quotes, extra attributes, other way around' => [
            <<<HTML
                <meta foo="bar" content="text/html; charset=ISO-8859-1" test middle http-equiv="content-type" after='something'>
HTML
            ,
            'ISO-8859-1',
            [
                '<meta foo="bar" content="text/html; charset=ISO-8859-1" test middle http-equiv="content-type" after=\'something\'>' => '<meta foo="bar" content="text/html; charset=placeholder-encoding" test middle http-equiv="content-type" after=\'something\'>',
            ],
        ];

        yield 'HTML4, single quotes' => [
            <<<HTML
                <meta http-equiv='content-type' content='text/html; charset=ISO-8859-1'>
HTML
            ,
            'ISO-8859-1',
            [
                "<meta http-equiv='content-type' content='text/html; charset=ISO-8859-1'>" => "<meta http-equiv='content-type' content='text/html; charset=placeholder-encoding'>",
            ],
        ];

        yield 'HTML4, unquoted+single quotes' => [
            <<<HTML
                <meta http-equiv=content-type content='text/html; charset=ISO-8859-1'>
HTML
            ,
            'ISO-8859-1',
            [
                "<meta http-equiv=content-type content='text/html; charset=ISO-8859-1'>" => "<meta http-equiv=content-type content='text/html; charset=placeholder-encoding'>",
            ],
        ];

        // https://httpwg.org/specs/rfc9110.html#field.content-type
        // The quoted charset value is reported without its quotes, and the
        // rewritten content value is expected with parameters joined by "; ".
        yield 'HTML4, internally quoted, extra parameters' => [
            <<<HTML
                <meta http-equiv=content-type content='text/html;foo;charset="ISO-8859-1";bar'>
HTML
            ,
            'ISO-8859-1',
            [
                "<meta http-equiv=content-type content='text/html;foo;charset=\"ISO-8859-1\";bar'>" => "<meta http-equiv=content-type content='text/html; foo; charset=placeholder-encoding; bar'>",
            ],
        ];

        // NOTE(review): the label mentions single quotes, but the fixture only
        // uses double quotes with spaces around `=` — consider renaming.
        // Only the rewritten `content` attribute is expected to lose its spaces.
        yield 'HTML4, single quotes+double quotes+spaces around' => [
            <<<HTML
                <meta http-equiv = "content-type" content = "text/html; charset=ISO-8859-1">
HTML
            ,
            'ISO-8859-1',
            [
                '<meta http-equiv = "content-type" content = "text/html; charset=ISO-8859-1">' => '<meta http-equiv = "content-type" content="text/html; charset=placeholder-encoding">',
            ],
        ];

        // Only the rewritten `content` attribute name is expected in lower
        // case; the rest of the tag keeps its original casing.
        yield 'HTML4, random case' => [
            <<<HTML
                <meTA HTTp-EQuIv="conTeNt-TYpe" CoNTeNt="text/Html; cHArSeT=ISO-8859-1">
HTML
            ,
            'ISO-8859-1',
            [
                '<meTA HTTp-EQuIv="conTeNt-TYpe" CoNTeNt="text/Html; cHArSeT=ISO-8859-1">' => '<meTA HTTp-EQuIv="conTeNt-TYpe" content="text/Html; cHArSeT=placeholder-encoding">',
            ],
        ];

        yield '(X)HTML4' => [
            <<<HTML
                <meta http-equiv="content-type" content="text/html; charset=ISO-8859-1"/>
HTML
            ,
            'ISO-8859-1',
            [
                '<meta http-equiv="content-type" content="text/html; charset=ISO-8859-1"/>' => '<meta http-equiv="content-type" content="text/html; charset=placeholder-encoding"/>',
            ],
        ];

        // With several declarations, only the first one is expected to be
        // reported and rewritten.
        yield 'multiple declarations' => [
            <<<HTML
                <meta http-equiv="content-type" test content="text/html; charset=ISO-8859-1">
                <meta http-equiv="content-type" content="text/html; charset=ISO-8859-1">
                <meta charset=UTF-8>
HTML
            ,
            'ISO-8859-1',
            [
                '<meta http-equiv="content-type" test content="text/html; charset=ISO-8859-1">' => '<meta http-equiv="content-type" test content="text/html; charset=placeholder-encoding">',
            ],
        ];
    }
}
0 commit comments