While working on an Adobe AIR based source code editor I was looking for an easy way to remove comments from different kind of source codes. What first seemed like an easy regular expression turned out to be much more complex because comment pre- and suffixes can also occur in string literals or regular expressions and our old friend Internet Explorer allows HTML
conditional comments and JavaScript
conditional compilation. Great.
First I had a look at YUICompressor
s source code and Google Page Speed but then I decided to port a script for removing JavaScript
comments created by James Padolsey.
I just improved the recognition for regular expressions a little and since this code should work with all ECMAScript
based source codes I named my class ECMAScriptParser
instead of JavaScriptParser
:
package de.superclass.parser { /** * @author Markus Raab (superclass.de | blog.derRaab.com) */ public class ECMAScriptParser { /** * Removes inline and block comments from an ECMAScript source string. * * This is a port of James Padolsey with an slightly improved RegExp recognition. * @see: http://james.padolsey.com/javascript/removing-comments-in-javascript * * @param ecmaScript ECMAScript source string * @param removeCondComp Optional (default=false) - Whether to remove Internet Explorers JavasScript conditional compilation comments or not. * @return ECMAScript source string without comments */ public static function removeComments( ecmaScript : String, removeCondComp : Boolean = false ) : String { var modeSingleQuote : Boolean = false; var modeDoubleQuote : Boolean = false; var modeRegExp : Boolean = false; var modeBlockComment : Boolean = false; var modeLineComment : Boolean = false; var modeCondComp : Boolean = false; var vector : Vector.<String> = Vector.<String>( ( '__' + ecmaScript + '__' ).split( '' ) ); vector.fixed = true; var c : int = vector.length; for ( var i : int = 0; i < c; i++ ) { var string : String = vector[ i ]; if ( modeRegExp ) { if ( string === '/' && vector[ i - 1 ] !== '\\' ) { modeRegExp = false; } continue; } if ( modeSingleQuote ) { if ( string === "'" && vector[ i - 1 ] !== '\\' ) { modeSingleQuote = false; } continue; } if ( modeDoubleQuote ) { if ( string === '"' && vector[ i - 1 ] !== '\\' ) { modeDoubleQuote = false; } continue; } if ( modeBlockComment ) { if ( string === '*' && vector[ i + 1 ] === '/' ) { vector[ i + 1 ] = ''; modeBlockComment = false; } vector[ i ] = ''; continue; } if ( modeLineComment) { string = vector[ i + 1 ]; if ( string === '\n' || string === '\r' ) { modeLineComment = false; } vector[ i ] = ''; continue; } if ( modeCondComp ) { if ( vector[ i - 2 ] === '@' && vector[ i - 1 ] === '*' && string === '/' ) { modeCondComp = false; } continue; } if ( string === '"' ) { modeDoubleQuote = true; continue; } if ( string === "'" ) { modeSingleQuote = true; continue; } if ( string === '/' ) { if ( ! removeCondComp && vector[ i + 1 ] === '*' && vector[ i + 2 ] === '@' ) { modeCondComp = true; continue; } if ( vector[ i + 1 ] === '*' ) { vector[ i ] = ''; modeBlockComment = true; continue; } if ( vector[ i + 1 ] === '/' ) { vector[ i ] = ''; modeLineComment = true; continue; } for ( var k : int = i - 1; true; k-- ) { string = vector[ k ]; if ( string !== ' ' ) { if ( string === '=' ) { modeRegExp = true; } break; } } } } return vector.join( '' ).slice( 2, -2 ); } } }
Allright – CSS doesn’t allow inline or conditional comments so I removed this script functionality and created a CSSParser
class:
package de.superclass.parser { /** * @author Markus Raab (superclass.de | blog.derRaab.com) */ public class CSSParser { /** * Removes comments from a CSS string. * * This is a shrinked port of James Padolseys script. * @see: http://james.padolsey.com/javascript/removing-comments-in-javascript * * @param css CSS string * @return CSS string without comments */ public static function removeComments( css : String) : String { var modeSingleQuote : Boolean = false; var modeDoubleQuote : Boolean = false; var modeBlockComment : Boolean = false; var vector : Vector.<String> = Vector.<String>( ( '__' + css + '__' ).split( '' ) ); vector.fixed = true; var c : int = vector.length; for ( var i : int = 0; i < c; i++ ) { var string : String = vector[ i ]; if ( modeSingleQuote ) { if ( string === "'" && vector[ i - 1 ] !== '\\' ) { modeSingleQuote = false; } continue; } if ( modeDoubleQuote ) { if ( string === '"' && vector[ i - 1 ] !== '\\' ) { modeDoubleQuote = false; } continue; } if ( modeBlockComment ) { if ( string === '*' && vector[ i + 1 ] === '/' ) { vector[ i + 1 ] = ''; modeBlockComment = false; } vector[ i ] = ''; continue; } if ( string === '"' ) { modeDoubleQuote = true; continue; } if ( string === "'" ) { modeSingleQuote = true; continue; } if ( string === '/' ) { if ( vector[ i + 1 ] === '*' ) { vector[ i ] = ''; modeBlockComment = true; continue; } } } return vector.join( '' ).slice( 2, -2 ); } } }
Good. Lastly I extended the code to remove all comments within HTML
source code, which can also contain JavaScript
and of course CSS
. This is the HTMLParser
:
package de.superclass.parser { /** * @author Markus Raab (superclass.de | blog.derRaab.com) */ public class HTMLParser { /** * Removes all HTML, CSS and ECMAScript comments from a HTML string. * * This is an extended port of James Padolseys script with an improved RegExp recognition. * @see: http://james.padolsey.com/javascript/removing-comments-in-javascript * * @param html HTML string * @param removeHTMLCondComment Optional (default=false) - Whether to remove Internet Explorers HTML conditional comments or not. * @param removeJSCondComp Optional (default=false) - Whether to remove Internet Explorers JavasScript conditional compilation comments or not. * @return HTML string without comments */ public static function removeComments( html : String, removeHTMLCondComment : Boolean = false, removeJSCondComp : Boolean = false ) : String { var modeSingleQuote : Boolean = false; var modeDoubleQuote : Boolean = false; var modeRegExp : Boolean = false; var modeBlockComment : Boolean = false; var modeLineComment : Boolean = false; var modeJSCondComp : Boolean = false; var modeHTMLComment : Boolean = false; var modeHTMLCondComment : Boolean = false; var vector : Vector.<String> = Vector.<String>( ( '__' + html + '__' ).split( '' ) ); vector.fixed = true; var c : int = vector.length; for ( var i : int = 0; i < c; i++ ) { var string : String = vector[ i ]; if ( modeRegExp ) { if ( string === '/' && vector[ i - 1 ] !== '\\' ) { modeRegExp = false; } continue; } if ( modeSingleQuote ) { if ( string === "'" && vector[ i - 1 ] !== '\\' ) { modeSingleQuote = false; } continue; } if ( modeDoubleQuote ) { if ( string === '"' && vector[ i - 1 ] !== '\\') { modeDoubleQuote = false; } continue; } if ( modeBlockComment ) { if ( string === '*' && vector[ i + 1 ] === '/') { vector[ i + 1 ] = ''; modeBlockComment = false; } vector[ i ] = ''; continue; } if ( modeLineComment) { string = vector[ i + 1 ]; if ( string === '\n' || string === '\r' ) { modeLineComment = false; } vector[ i ] = ''; continue; } if ( modeJSCondComp ) { if ( vector[ i - 2 ] === '@' && vector[ i - 1 ] === '*' && string === '/' ) { modeJSCondComp = false; } continue; } if ( modeHTMLComment ) { // --> if ( string === '-' && vector[ i + 1 ] === '-' && vector[ i + 2 ] === '>' ) { vector[ i + 1 ] = ''; vector[ i + 2 ] = ''; modeHTMLComment = false; } vector[ i ] = ''; continue; } if ( modeHTMLCondComment ) { if ( string === 'i' && vector[ i + 1 ] === 'f' && vector[ i + 2 ] === ']' && vector[ i + 3 ] === '-' && vector[ i + 4 ] === '-' && vector[ i + 5 ] === '>' ) // if]--> { modeHTMLCondComment = false; i += 5; } continue; } if ( string === '<' && vector[ i + 1 ] === '!' && vector[ i + 2 ] === '-' && vector[ i + 3 ] === '-' ) // <!-- { if ( ! removeHTMLCondComment && vector[ i + 4 ] === '[' && vector[ i + 5 ] === 'i' && vector[ i + 6 ] === 'f' ) // <!--[if { modeHTMLCondComment = true; } else { vector[ i ] = ''; modeHTMLComment = true; } } if ( string === '"' ) { modeDoubleQuote = true; continue; } if ( string === "'" ) { modeSingleQuote = true; continue; } if ( string === '/' ) { if ( ! removeJSCondComp && vector[ i + 1 ] === '*' && vector[ i + 2 ] === '@' ) { modeJSCondComp = true; continue; } if ( vector[ i + 1 ] === '*' ) { vector[ i ] = ''; modeBlockComment = true; continue; } if ( vector[ i + 1 ] === '/' ) { vector[ i ] = ''; modeLineComment = true; continue; } for ( var k : int = i - 1; true; k-- ) { string = vector[ k ]; if ( string !== ' ' ) { if ( string === '=' ) { modeRegExp = true; } break; } } } } return vector.join( '' ).slice( 2, -2 ); } } }
Let me know if I made a mistake but it seems to work quite well. Or do you know a good library that already does this kind of optimizations?
Well, that’s it for now. Have fun coding!