Merge pull request #84 from SoapBox/html-string

Nick Frasser · Nick Frasser · commit 92d61be78490 · 2015-12-14T22:35:52.000-05:00
New linkify-html interface
diff --git a/gulpfile.js b/gulpfile.js
@@ -21,7 +21,8 @@ wrap			= require('gulp-wrap');
 
 var paths = {
 	src: 'src/**/*.js',
-	lib: 'lib/**/*.js',
+	lib: ['lib/**/*.js'],
+	libTest: ['lib/*.js', 'lib/linkify/**/*.js'],
 	libCore: [
 		'lib/linkify/core/*.js',
 		'lib/linkify/utils/*.js',
@@ -99,38 +100,40 @@ gulp.task('build-core', ['babel'], function () {
 // Depends on build-core
 gulp.task('build-interfaces', ['babel-amd'], function () {
 
-	var stream, streams = [];
-
 	// Core linkify functionality as plugins
 	var interface, interfaces = [
 		'string',
 		'element',
-		['element', 'jquery'] // jQuery interface requires both element and jquery
+		['linkify-element.js', 'jquery'], // jQuery interface requires both element and jquery
+		[
+			'simple-html-tokenizer/*.js',
+			'simple-html-tokenizer.js',
+			'html'
+		]
 	];
 
-	var files = {js: null, amd: null};
-
 	// Globals browser interface
-	for (var i = 0; i < interfaces.length; i++) {
-		interface = interfaces[i];
+	var streams = [];
+
+	interfaces.forEach(function (interface) {
+
+		var files = {js: [], amd: []};
 
 		if (interface instanceof Array) {
-			// Interface has dependencies
-			files.js = [];
-			files.amd = [];
-			for (var j = 0; j < interface.length; j++) {
-				files.js.push('src/linkify-' + interface[j] + '.js');
-				files.amd.push('build/amd/linkify-' + interface[j] + '.js');
-			}
+			// Interface has other interface dependencies within this package
+			interface.forEach(function (i, idx) {
+				if (idx == interface.length - 1) { return; } // ignore last index
+				files.js.push('src/' + i);
+				files.amd.push('build/amd/' + i);
+			});
 
 			// The last dependency is the name of the interface
 			interface = interface.pop();
-
-		} else {
-			files.js = 'src/linkify-' + interface + '.js';
-			files.amd = 'build/amd/linkify-' + interface + '.js';
 		}
 
+		files.js.push('src/linkify-' + interface + '.js');
+		files.amd.push('build/amd/linkify-' + interface + '.js');
+
 		// Browser interface
 		stream = gulp.src(files.js)
 		.pipe(babel({
@@ -150,7 +153,7 @@ gulp.task('build-interfaces', ['babel-amd'], function () {
 		.pipe(gulp.dest('build'));
 
 		streams.push(stream);
-	}
+	});
 
 	return merge.apply(this, streams);
 });
@@ -230,7 +233,7 @@ gulp.task('mocha', ['build'], function () {
 */
 gulp.task('coverage', ['build'], function (cb) {
 	// IMPORTANT: return not required here (and will actually cause bugs!)
-	gulp.src(paths.lib)
+	gulp.src(paths.libTest)
 	.pipe(istanbul()) // Covering files
 	.pipe(istanbul.hookRequire()) // Force `require` to return covered files
 	.on('finish', function () {
diff --git a/html.js b/html.js
@@ -0,0 +1 @@
+module.exports = require('./lib/linkify-html');
diff --git a/package.json b/package.json
@@ -27,7 +27,8 @@
     "mention",
     "mentions"
   ],
-  "dependencies": {},
+  "dependencies": {
+  },
   "devDependencies": {
     "amd-optimize": "^0.6.1",
     "brfs": "^1.4.1",
@@ -58,7 +59,8 @@
     "lodash": "^3.10.1",
     "merge-stream": "^1.0.0",
     "mocha": "^2.3.3",
-    "phantomjs": "^1.9.18"
+    "phantomjs": "^1.9.18",
+    "simple-html-tokenizer": "https://github.com/nfrasser/simple-html-tokenizer.git#master"
   },
   "optionalDependencies": {
     "jquery": ">=1.9.0"
diff --git a/src/linkify-html.js b/src/linkify-html.js
@@ -0,0 +1,181 @@
+import HTML5Tokenizer from './simple-html-tokenizer';
+import * as linkify from './linkify';
+
+const StartTag = 'StartTag';
+const EndTag = 'EndTag';
+const Chars = 'Chars';
+const Comment = 'Comment';
+
+/**
+	Linkify the text content of an HTML string.
+
+	`tokens` and `token` in this section refer to tokens generated by the HTML
+	parser.
+
+	* `str` is the HTML source to process
+	* `opts` are linkify options; they are normalized before use
+
+	Returns the HTML string rebuilt from the parser tokens, with links found
+	in text tokens wrapped in link tags. Anchor elements already present in
+	`str` are copied through without being linkified.
+*/
+export default function linkifyHtml(str, opts={}) {
+	let tokens = HTML5Tokenizer.tokenize(str);
+	let linkifiedTokens = [];
+	let linkified = [];
+	var i;
+
+	opts = linkify.options.normalize(opts);
+
+	// Linkify the tokens given by the parser
+	for (i = 0; i < tokens.length; i++) {
+		let token = tokens[i];
+
+		if (token.type === StartTag && token.tagName.toUpperCase() === 'A') {
+			// Keep the opening tag itself, then copy everything up to and
+			// including the matching closing tag without linkifying it
+			linkifiedTokens.push(token);
+
+			let preskipLen = linkifiedTokens.length;
+			skipTagTokens('A', tokens, ++i, linkifiedTokens);
+
+			// `i` already points at the first skipped token and the loop's
+			// own i++ still runs after `continue`, so advance by one less
+			// than the number of skipped tokens to resume on the token that
+			// follows the closing tag
+			i += linkifiedTokens.length - preskipLen - 1;
+			continue;
+
+		} else if (token.type !== Chars) {
+			// Skip this token, it's not important
+			linkifiedTokens.push(token);
+			continue;
+		}
+
+		// Valid text token, linkify it!
+		let linkifiedChars = linkifyChars(token.chars, opts);
+		linkifiedTokens.push.apply(linkifiedTokens, linkifiedChars);
+	}
+
+	// Convert the tokens back into a string
+	for (i = 0; i < linkifiedTokens.length; i++) {
+		let token = linkifiedTokens[i];
+		switch (token.type) {
+		case StartTag: {
+			// Braces keep these declarations local to this case
+			let attrs = attrsToStrings(token.attributes);
+			let link = '<' + token.tagName;
+			if (attrs.length > 0) { link += ' ' + attrs.join(' '); }
+			link += '>';
+			linkified.push(link);
+			break;
+		}
+		case EndTag:
+			linkified.push(`</${token.tagName}>`);
+			break;
+		case Chars:
+			linkified.push(escapeText(token.chars));
+			break;
+		case Comment:
+			linkified.push(`<!--${escapeText(token.chars)}-->`);
+			break;
+		}
+	}
+
+	return linkified.join('');
+}
+
+/**
+	Converts a plain-text string into HTML parser-style tokens.
+
+	`tokens` and `token` in this section refer to tokens returned by
+	`linkify.tokenize`. `result` will contain HTML parser-style tokens.
+
+	* `str` is the text to scan for links
+	* `opts` are the linkify options, as passed by `linkifyHtml` after
+	  normalization
+
+	Returns an array of StartTag, Chars and EndTag tokens.
+*/
+function linkifyChars(str, opts) {
+	const options = linkify.options;
+	const tokens = linkify.tokenize(str);
+	const result = [];
+
+	for (let idx = 0; idx < tokens.length; idx++) {
+		const token = tokens[idx];
+
+		if (token.type === 'nl' && opts.nl2br) {
+			// Newline becomes a <br> start tag when nl2br is enabled
+			result.push({
+				type: StartTag,
+				tagName: 'br',
+				attributes: [],
+				selfClosing: true
+			});
+		} else if (!token.isLink) {
+			// Anything that is not a link is passed through as text
+			result.push({type: Chars, chars: token.toString()});
+		} else {
+			// Resolve every option for this particular link
+			const href = token.toHref(opts.defaultProtocol);
+			const formatted = options.resolve(opts.format, token.toString(), token.type);
+			const formattedHref = options.resolve(opts.formatHref, href, token.type);
+			const attributesHash = options.resolve(opts.attributes, href, token.type);
+			const tagName = options.resolve(opts.tagName, href, token.type);
+			const linkClass = options.resolve(opts.linkClass, href, token.type);
+			const target = options.resolve(opts.target, href, token.type);
+
+			// href and class always come first, then the optional target,
+			// then any extra attributes
+			const attributes = [['href', formattedHref], ['class', linkClass]];
+			if (target) {
+				attributes.push(['target', target]);
+			}
+			for (let name in attributesHash) {
+				attributes.push([name, attributesHash[name]]);
+			}
+
+			// Opening tag, link text, closing tag
+			result.push(
+				{type: StartTag, tagName: tagName, attributes: attributes, selfClosing: false},
+				{type: Chars, chars: formatted},
+				{type: EndTag, tagName: tagName}
+			);
+		}
+	}
+
+	return result;
+}
+
+/**
+	Copies tokens into `skippedTokens` until the closing tag that matches the
+	already-opened `tagName` has been copied, then returns `skippedTokens`.
+
+	* `tagName` is the upper-case name of the tag whose closing tag stops the
+	  skipping
+	* `tokens` is the array of tokens generated by HTML5Tokenizer which is
+	  being scanned
+	* `i` is the index immediately after the opening tag to skip
+	* `skippedTokens` is an array which skipped tokens are being pushed into
+
+	Caveats
+
+	* Assumes that i is the first token after the given opening tagName
+	* The closing tag will be skipped, but nothing after it
+	* Will track whether there is a nested tag of the same type
+*/
+function skipTagTokens(tagName, tokens, i, skippedTokens) {
+
+	// How many `tagName` elements are currently open; the caller has already
+	// consumed one opening tag
+	let depth = 1;
+
+	for (let idx = i; idx < tokens.length && depth > 0; idx++) {
+		const current = tokens[idx];
+		const isTag = current.type === StartTag || current.type === EndTag;
+
+		// Only tag tokens are checked for a name
+		if (isTag && current.tagName.toUpperCase() === tagName) {
+			// A nested opening tag deepens the count, a closing tag reduces it
+			depth += (current.type === StartTag) ? 1 : -1;
+		}
+
+		skippedTokens.push(current);
+	}
+
+	// Note that if depth > 0 here, the HTML is probably invalid
+	return skippedTokens;
+}
+
+/**
+	Prepares the text of a Chars or Comment token for output. Currently
+	returns `text` unchanged.
+*/
+function escapeText(text) {
+	// Not required, HTML tokenizer ensures this occurs properly
+	// NOTE(review): relies on the tokenizer's output being safe to emit as-is — confirm
+	return text;
+}
+
+/**
+	Escapes double quotes so `attr` can be placed inside a double-quoted HTML
+	attribute value.
+
+	* `attr` is the attribute value; non-string values (e.g. a numeric value
+	  supplied through the `attributes` option) are converted to a string
+	  first instead of throwing
+
+	Returns the escaped string.
+*/
+function escapeAttr(attr) {
+	return String(attr).replace(/"/g, '&quot;');
+}
+
+/**
+	Serializes a list of `[name, value]` attribute pairs into `name="value"`
+	strings, escaping each value with `escapeAttr`.
+
+	Returns an array with one string per attribute, in the same order.
+*/
+function attrsToStrings(attrs) {
+	return attrs.map(([name, value]) => `${name}="${escapeAttr(value)}"`);
+}
diff --git a/src/linkify-string.js b/src/linkify-string.js
@@ -4,14 +4,14 @@
 
 import {tokenize, options} from './linkify';
 
-function cleanText(text) {
+function escapeText(text) {
 	return text
 	.replace(/&/g, '&amp;')
 	.replace(/</g, '&lt;')
 	.replace(/>/g, '&gt;');
 }
 
-function cleanAttr(href) {
+function escapeAttr(href) {
 	return href.replace(/"/g, '&quot;');
 }
 
@@ -22,7 +22,7 @@ function attributesToString(attributes) {
 
 	for (let attr in attributes) {
 		let val = (attributes[attr] + '').replace(/"/g, '&quot;');
-		result.push(`${attr}="${cleanAttr(val)}"`);
+		result.push(`${attr}="${escapeAttr(val)}"`);
 	}
 	return result.join(' ');
 }
@@ -35,7 +35,7 @@ function linkifyStr(str, opts={}) {
 	tokens = tokenize(str),
 	result = [];
 
-	for (let i = 0; i < tokens.length; i++ ) {
+	for (let i = 0; i < tokens.length; i++) {
 		let token = tokens[i];
 		if (token.isLink) {
 
@@ -48,16 +48,16 @@ function linkifyStr(str, opts={}) {
 			linkClass		= options.resolve(opts.linkClass, href, token.type),
 			target			= options.resolve(opts.target, href, token.type);
 
-			let link = `<${tagName} href="${cleanAttr(formattedHref)}" class="${cleanAttr(linkClass)}"`;
+			let link = `<${tagName} href="${escapeAttr(formattedHref)}" class="${escapeAttr(linkClass)}"`;
 			if (target) {
-				link += ` target="${cleanAttr(target)}"`;
+				link += ` target="${escapeAttr(target)}"`;
 			}
 
 			if (attributesHash) {
 				link += ` ${attributesToString(attributesHash)}`;
 			}
 
-			link += `>${cleanText(formatted)}</${tagName}>`;
+			link += `>${escapeText(formatted)}</${tagName}>`;
 			result.push(link);
 
 		} else if (token.type === 'nl' && opts.nl2br) {
@@ -67,7 +67,7 @@ function linkifyStr(str, opts={}) {
 				result.push('<br>\n');
 			}
 		} else {
-			result.push(cleanText(token.toString()));
+			result.push(escapeText(token.toString()));
 		}
 	}
 
diff --git a/src/simple-html-tokenizer b/src/simple-html-tokenizer
@@ -0,0 +1 @@
+../node_modules/simple-html-tokenizer/lib/simple-html-tokenizer
diff --git a/src/simple-html-tokenizer.js b/src/simple-html-tokenizer.js
@@ -0,0 +1,15 @@
+import HTML5NamedCharRefs from './simple-html-tokenizer/html5-named-char-refs';
+import EntityParser from './simple-html-tokenizer/entity-parser';
+import EventedTokenizer from './simple-html-tokenizer/evented-tokenizer';
+import Tokenizer from './simple-html-tokenizer/tokenizer';
+import tokenize from './simple-html-tokenizer/tokenize';
+
+// Gathers the individually imported simple-html-tokenizer modules into the
+// single object that this file exports as its default
+var HTML5Tokenizer = {
+	HTML5NamedCharRefs,
+	EntityParser,
+	EventedTokenizer,
+	Tokenizer,
+	tokenize,
+};
+
+export default HTML5Tokenizer;
diff --git a/templates/linkify-html.amd.js b/templates/linkify-html.amd.js
@@ -0,0 +1 @@
+<%= contents %>
diff --git a/templates/linkify-html.js b/templates/linkify-html.js
@@ -0,0 +1,5 @@
+;(function (linkify) {
+"use strict";
+<%= contents %>
+window.linkifyHtml = linkifyHtml;
+})(window.linkify);
diff --git a/test/index.html b/test/index.html
diff --git a/test/spec/linkify-html-test.js b/test/spec/linkify-html-test.js

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+module.exports = require('./lib/linkify-html');`
Original file line number	Diff line number	Diff line change
`@@ -4,14 +4,14 @@`
`4`	`4`
`5`	`5`	`import {tokenize, options} from './linkify';`
`6`	`6`
`7`		`-function cleanText(text) {`
	`7`	`+function escapeText(text) {`
`8`	`8`	`return text`
`9`	`9`	`.replace(/&/g, '&amp;')`
`10`	`10`	`.replace(/</g, '&lt;')`
`11`	`11`	`.replace(/>/g, '&gt;');`
`12`	`12`	`}`
`13`	`13`
`14`		`-function cleanAttr(href) {`
	`14`	`+function escapeAttr(href) {`
`15`	`15`	`return href.replace(/"/g, '&quot;');`
`16`	`16`	`}`
`17`	`17`
`@@ -22,7 +22,7 @@ function attributesToString(attributes) {`
`22`	`22`
`23`	`23`	`for (let attr in attributes) {`
`24`	`24`	`let val = (attributes[attr] + '').replace(/"/g, '&quot;');`
`25`		- result.push(`${attr}="${cleanAttr(val)}"`);
	`25`	+ result.push(`${attr}="${escapeAttr(val)}"`);
`26`	`26`	`}`
`27`	`27`	`return result.join(' ');`
`28`	`28`	`}`
`@@ -35,7 +35,7 @@ function linkifyStr(str, opts={}) {`
`35`	`35`	`tokens = tokenize(str),`
`36`	`36`	`result = [];`
`37`	`37`
`38`		`- for (let i = 0; i < tokens.length; i++ ) {`
	`38`	`+ for (let i = 0; i < tokens.length; i++) {`
`39`	`39`	`let token = tokens[i];`
`40`	`40`	`if (token.isLink) {`
`41`	`41`
`@@ -48,16 +48,16 @@ function linkifyStr(str, opts={}) {`
`48`	`48`	`linkClass = options.resolve(opts.linkClass, href, token.type),`
`49`	`49`	`target = options.resolve(opts.target, href, token.type);`
`50`	`50`
`51`		- let link = `<${tagName} href="${cleanAttr(formattedHref)}" class="${cleanAttr(linkClass)}"`;
	`51`	+ let link = `<${tagName} href="${escapeAttr(formattedHref)}" class="${escapeAttr(linkClass)}"`;
`52`	`52`	`if (target) {`
`53`		- link += ` target="${cleanAttr(target)}"`;
	`53`	+ link += ` target="${escapeAttr(target)}"`;
`54`	`54`	`}`
`55`	`55`
`56`	`56`	`if (attributesHash) {`
`57`	`57`	link += ` ${attributesToString(attributesHash)}`;
`58`	`58`	`}`
`59`	`59`
`60`		- link += `>${cleanText(formatted)}</${tagName}>`;
	`60`	+ link += `>${escapeText(formatted)}</${tagName}>`;
`61`	`61`	`result.push(link);`
`62`	`62`
`63`	`63`	`} else if (token.type === 'nl' && opts.nl2br) {`
`@@ -67,7 +67,7 @@ function linkifyStr(str, opts={}) {`
`67`	`67`	`result.push('<br>\n');`
`68`	`68`	`}`
`69`	`69`	`} else {`
`70`		`- result.push(cleanText(token.toString()));`
	`70`	`+ result.push(escapeText(token.toString()));`
`71`	`71`	`}`
`72`	`72`	`}`
`73`	`73`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+../node_modules/simple-html-tokenizer/lib/simple-html-tokenizer`