Removing comments in CSS, HTML and ECMAScript (JavaScript)

While working on an Adobe AIR based source code editor I was looking for an easy way to remove comments from different kinds of source code. What first seemed like an easy job for a regular expression turned out to be much more complex, because comment prefixes and suffixes can also occur in string literals or regular expressions, and our old friend Internet Explorer allows HTML conditional comments and JavaScript conditional compilation. Great.

First I had a look at the YUI Compressor source code and Google Page Speed, but then I decided to port a script for removing JavaScript comments created by James Padolsey.

I just improved the recognition of regular expressions a little, and since this code should work with all ECMAScript-based source code I named my class ECMAScriptParser instead of JavaScriptParser:

package de.superclass.parser
{
	/**
	 * Strips comments from ECMAScript based source code (JavaScript, ActionScript, ...).
	 *
	 * @author Markus Raab (superclass.de | blog.derRaab.com)
	 */
	public class ECMAScriptParser
	{
		/**
		 * Punctuators after which a '/' is read as the start of a regular expression
		 * literal instead of a division operator.
		 */
		private static const REGEXP_PREFIX_CHARS : String = '(,=:[!&|?{};';

		/**
		 * Keywords after which a '/' is read as the start of a regular expression literal.
		 * Stored space-delimited so whole words can be looked up with indexOf().
		 */
		private static const REGEXP_PREFIX_WORDS : String = ' return typeof instanceof in new delete void throw case do else ';

		/**
		 * Removes inline and block comments from an ECMAScript source string.
		 *
		 * Based on James Padolsey's comment remover with an improved RegExp recognition.
		 * @see: http://james.padolsey.com/javascript/removing-comments-in-javascript
		 *
		 * The source is scanned once from left to right. String literals, regular
		 * expression literals and (optionally) conditional compilation blocks are copied
		 * verbatim, so comment markers inside them are left alone. A line comment is
		 * removed up to, but not including, its line terminator. Comments are removed
		 * without inserting a replacement character.
		 *
		 * Regular expression literals are recognised heuristically from the preceding
		 * token (see canStartRegExp) and must be closed on the line they start on.
		 *
		 * @param ecmaScript		ECMAScript source string (null is treated as an empty string)
		 * @param removeCondComp	Optional (default=false) - Whether to remove Internet Explorer's JavaScript conditional compilation comments or not.
		 * @return					ECMAScript source string without comments
		 */
		public static function removeComments( ecmaScript : String, removeCondComp : Boolean = false ) : String
		{
			if ( ecmaScript == null )
			{
				return '';
			}

			var total		: int = ecmaScript.length;
			var kept		: Array = [];	// source fragments that survive, in order
			var keepFrom	: int = 0;		// start of the fragment currently being kept
			var lastCode	: int = -1;		// index of the last non-whitespace code character
			var i			: int = 0;
			var end			: int;
			var current		: String;
			var following	: String;

			while ( i < total )
			{
				current = ecmaScript.charAt( i );

				if ( current === '"' || current === "'" )
				{
					i = skipQuoted( ecmaScript, i );
					lastCode = i - 1;
					continue;
				}

				if ( current === '/' )
				{
					following = ecmaScript.charAt( i + 1 ); // '' when '/' is the last character

					if ( following === '*' )
					{
						if ( ! removeCondComp && ecmaScript.charAt( i + 2 ) === '@' )
						{
							// conditional compilation block: keep everything up to and including "@*/"
							end = ecmaScript.indexOf( '@*/', i + 2 );
							i = ( end < 0 ) ? total : end + 3;
							continue;
						}

						// Searching from i + 2 keeps the '*' of the opener from being
						// reused as the '*' of the closer (as in the sequence slash-star-slash).
						end = ecmaScript.indexOf( '*/', i + 2 );
						kept.push( ecmaScript.substring( keepFrom, i ) );
						i = ( end < 0 ) ? total : end + 2; // unterminated: drop the rest of the input
						keepFrom = i;
						continue;
					}

					if ( following === '/' )
					{
						kept.push( ecmaScript.substring( keepFrom, i ) );
						i = indexOfLineEnd( ecmaScript, i + 2 ); // the line terminator itself is kept
						keepFrom = i;
						continue;
					}

					if ( canStartRegExp( ecmaScript, lastCode ) )
					{
						end = indexAfterRegExp( ecmaScript, i );
						if ( end !== -1 )
						{
							i = end;
							lastCode = i - 1;
							continue;
						}
						// not closed on this line: fall through and treat '/' as an operator
					}
				}

				// characters up to and including the space character do not count as code
				if ( current > ' ' )
				{
					lastCode = i;
				}
				i++;
			}

			kept.push( ecmaScript.substring( keepFrom ) );
			return kept.join( '' );
		}

		/**
		 * Returns the index just behind the string literal that opens at start.
		 * A backslash always protects the character that follows it, so an escaped
		 * backslash directly in front of the closing quote is handled correctly.
		 * Returns source.length when the literal is never closed.
		 */
		private static function skipQuoted( source : String, start : int ) : int
		{
			var total	: int = source.length;
			var quote	: String = source.charAt( start );
			var j		: int = start + 1;
			var current	: String;

			while ( j < total )
			{
				current = source.charAt( j );
				if ( current === '\\' )
				{
					j += 2;
					continue;
				}
				if ( current === quote )
				{
					return j + 1;
				}
				j++;
			}
			return total;
		}

		/**
		 * Returns the index of the first line terminator at or behind from,
		 * or source.length if there is none.
		 */
		private static function indexOfLineEnd( source : String, from : int ) : int
		{
			var total	: int = source.length;
			var j		: int = from;

			while ( j < total )
			{
				if ( isLineEnd( source.charAt( j ) ) )
				{
					return j;
				}
				j++;
			}
			return total;
		}

		/**
		 * Whether the given character is LF, CR, U+2028 or U+2029.
		 */
		private static function isLineEnd( current : String ) : Boolean
		{
			return current === '\n' || current === '\r' || current === '\u2028' || current === '\u2029';
		}

		/**
		 * Whether the given character is an ASCII letter, a digit, '_' or '$'.
		 */
		private static function isWordChar( current : String ) : Boolean
		{
			return ( current >= 'a' && current <= 'z' )
				|| ( current >= 'A' && current <= 'Z' )
				|| ( current >= '0' && current <= '9' )
				|| current === '_'
				|| current === '$';
		}

		/**
		 * Decides whether a '/' may open a regular expression literal, judging by the
		 * last code character in front of it (lastCode is -1 if there is none).
		 * True at the start of the input, after one of REGEXP_PREFIX_CHARS and after
		 * one of REGEXP_PREFIX_WORDS; false otherwise.
		 */
		private static function canStartRegExp( source : String, lastCode : int ) : Boolean
		{
			if ( lastCode < 0 )
			{
				return true;
			}

			var previous : String = source.charAt( lastCode );
			if ( REGEXP_PREFIX_CHARS.indexOf( previous ) >= 0 )
			{
				return true;
			}
			if ( ! isWordChar( previous ) )
			{
				return false;
			}

			// collect the whole word that ends at lastCode
			var wordStart : int = lastCode;
			while ( wordStart > 0 && isWordChar( source.charAt( wordStart - 1 ) ) )
			{
				wordStart--;
			}
			var word : String = source.substring( wordStart, lastCode + 1 );
			return REGEXP_PREFIX_WORDS.indexOf( ' ' + word + ' ' ) >= 0;
		}

		/**
		 * Returns the index just behind the closing '/' of the regular expression
		 * literal that opens at start, or -1 if no closing '/' is found before the
		 * next line terminator. A '/' inside a character class ([...]) or behind a
		 * backslash does not close the literal.
		 */
		private static function indexAfterRegExp( source : String, start : int ) : int
		{
			var total	: int = source.length;
			var inClass	: Boolean = false;
			var j		: int = start + 1;
			var current	: String;

			while ( j < total )
			{
				current = source.charAt( j );
				if ( isLineEnd( current ) )
				{
					return -1;
				}
				if ( current === '\\' )
				{
					j += 2;
					continue;
				}
				if ( current === '[' )
				{
					inClass = true;
				}
				else if ( current === ']' )
				{
					inClass = false;
				}
				else if ( current === '/' && ! inClass )
				{
					return j + 1;
				}
				j++;
			}
			return -1;
		}
	}
}

All right – CSS doesn’t allow inline or conditional comments, so I removed that functionality from the script and created a CSSParser class:

package de.superclass.parser
{
	/**
	 * Strips comments from CSS source code.
	 *
	 * @author Markus Raab (superclass.de | blog.derRaab.com)
	 */
	public class CSSParser
	{
		/**
		 * Removes block comments from a CSS string.
		 *
		 * This is a reduced variant of James Padolsey's comment remover.
		 * @see: http://james.padolsey.com/javascript/removing-comments-in-javascript
		 *
		 * The string is scanned once from left to right. Quoted strings are copied
		 * verbatim, so comment markers inside them are left alone. A comment that is
		 * never closed is removed up to the end of the input. Comments are removed
		 * without inserting a replacement character.
		 *
		 * @param css				CSS string (null is treated as an empty string)
		 * @return					CSS string without comments
		 */
		public static function removeComments( css : String ) : String
		{
			if ( css == null )
			{
				return '';
			}

			var total		: int = css.length;
			var kept		: Array = [];	// fragments that survive, in order
			var keepFrom	: int = 0;		// start of the fragment currently being kept
			var i			: int = 0;
			var end			: int;
			var current		: String;

			while ( i < total )
			{
				current = css.charAt( i );

				if ( current === '"' || current === "'" )
				{
					i = skipQuoted( css, i );
					continue;
				}

				if ( current === '/' && css.charAt( i + 1 ) === '*' )
				{
					// Searching from i + 2 keeps the '*' of the opener from being
					// reused as the '*' of the closer (as in the sequence slash-star-slash).
					end = css.indexOf( '*/', i + 2 );
					kept.push( css.substring( keepFrom, i ) );
					i = ( end < 0 ) ? total : end + 2;
					keepFrom = i;
					continue;
				}

				i++;
			}

			kept.push( css.substring( keepFrom ) );
			return kept.join( '' );
		}

		/**
		 * Returns the index just behind the quoted string that opens at start.
		 * A backslash always protects the character that follows it, so an escaped
		 * backslash directly in front of the closing quote is handled correctly.
		 * Returns source.length when the string is never closed.
		 */
		private static function skipQuoted( source : String, start : int ) : int
		{
			var total	: int = source.length;
			var quote	: String = source.charAt( start );
			var j		: int = start + 1;
			var current	: String;

			while ( j < total )
			{
				current = source.charAt( j );
				if ( current === '\\' )
				{
					j += 2;
					continue;
				}
				if ( current === quote )
				{
					return j + 1;
				}
				j++;
			}
			return total;
		}
	}
}

Good. Lastly I extended the code to remove all comments within HTML source code, which can also contain JavaScript and of course CSS. This is the HTMLParser:

package de.superclass.parser
{
	/**
	 * Strips HTML, CSS and ECMAScript comments from HTML source code.
	 *
	 * @author Markus Raab (superclass.de | blog.derRaab.com)
	 */
	public class HTMLParser
	{
		/**
		 * Punctuators after which a '/' inside a script is read as the start of a
		 * regular expression literal instead of a division operator.
		 */
		private static const REGEXP_PREFIX_CHARS : String = '(,=:[!&|?{};';

		/**
		 * Keywords after which a '/' inside a script is read as the start of a regular
		 * expression literal. Stored space-delimited so whole words can be looked up
		 * with indexOf().
		 */
		private static const REGEXP_PREFIX_WORDS : String = ' return typeof instanceof in new delete void throw case do else ';

		/**
		 * Removes all HTML, CSS and ECMAScript comments from a HTML string.
		 *
		 * Based on James Padolsey's comment remover with an improved RegExp recognition.
		 * @see: http://james.padolsey.com/javascript/removing-comments-in-javascript
		 *
		 * The three comment syntaxes are applied only where they belong:
		 * - HTML comments are removed from markup and text.
		 * - The content of a script element has its ECMAScript line and block comments removed.
		 * - The content of a style element has its CSS block comments removed.
		 * Tags are skipped as a whole (quoted attribute values included), so comment
		 * markers inside attribute values are left alone. Text outside of script and
		 * style elements is never treated as script or CSS.
		 *
		 * A conditional comment is recognised by its "[if" directly behind the opening
		 * marker and is kept verbatim up to and including the next "if]" followed by
		 * the closing marker, unless removeHTMLCondComment is set.
		 *
		 * NOTE(review): every script element is treated as ECMAScript regardless of its
		 * type attribute - confirm this is acceptable for template-style script blocks.
		 *
		 * @param html						HTML string (null is treated as an empty string)
		 * @param removeHTMLCondComment		Optional (default=false) - Whether to remove Internet Explorer's HTML conditional comments or not.
		 * @param removeJSCondComp			Optional (default=false) - Whether to remove Internet Explorer's JavaScript conditional compilation comments or not.
		 * @return							HTML string without comments
		 */
		public static function removeComments( html : String, removeHTMLCondComment : Boolean = false, removeJSCondComp : Boolean = false ) : String
		{
			if ( html == null )
			{
				return '';
			}

			var total		: int = html.length;
			var kept		: Array = [];	// fragments that survive, in order
			var keepFrom	: int = 0;		// start of the fragment currently being kept
			var i			: int = 0;
			var end			: int;
			var tagEnd		: int;
			var tagName		: String;
			var following	: String;
			var content		: String;

			while ( i < total )
			{
				if ( html.charAt( i ) !== '<' )
				{
					i++;
					continue;
				}

				if ( html.substr( i, 4 ) === '<!--' )
				{
					if ( ! removeHTMLCondComment && html.substr( i + 4, 3 ) === '[if' )
					{
						// conditional comment: keep everything up to and including "if]-->"
						end = html.indexOf( 'if]-->', i + 5 );
						i = ( end < 0 ) ? total : end + 6;
						continue;
					}

					end = html.indexOf( '-->', i + 2 );
					kept.push( html.substring( keepFrom, i ) );
					i = ( end < 0 ) ? total : end + 3; // unterminated: drop the rest of the input
					keepFrom = i;
					continue;
				}

				following = html.charAt( i + 1 );
				if ( following !== '/' && ! isLetter( following ) )
				{
					// a lone '<' (or a doctype etc.) - plain text as far as comments are concerned
					i++;
					continue;
				}

				tagEnd = indexAfterTag( html, i );
				tagName = readTagName( html, i + 1 ); // '' for closing tags

				if ( tagName === 'script' || tagName === 'style' )
				{
					end = indexOfClosingTag( html, tagName, tagEnd );
					if ( end < 0 )
					{
						end = total;
					}
					content = html.substring( tagEnd, end );

					kept.push( html.substring( keepFrom, tagEnd ) );
					kept.push( ( tagName === 'script' ) ? stripScript( content, removeJSCondComp ) : stripStyle( content ) );
					keepFrom = end;
					i = end;
					continue;
				}

				i = tagEnd;
			}

			kept.push( html.substring( keepFrom ) );
			return kept.join( '' );
		}

		/**
		 * Returns the index just behind the '>' that closes the tag opening at start.
		 * A '>' inside a quoted attribute value does not close the tag.
		 * Returns html.length when the tag (or a quoted value in it) is never closed.
		 */
		private static function indexAfterTag( html : String, start : int ) : int
		{
			var total	: int = html.length;
			var j		: int = start + 1;
			var current	: String;
			var close	: int;

			while ( j < total )
			{
				current = html.charAt( j );
				if ( current === '"' || current === "'" )
				{
					close = html.indexOf( current, j + 1 );
					if ( close < 0 )
					{
						return total;
					}
					j = close + 1;
					continue;
				}
				if ( current === '>' )
				{
					return j + 1;
				}
				j++;
			}
			return total;
		}

		/**
		 * Reads the lower-cased tag name starting at from. The name ends at the first
		 * whitespace/control character, '/' or '>'.
		 */
		private static function readTagName( html : String, from : int ) : String
		{
			var total	: int = html.length;
			var j		: int = from;
			var current	: String;

			while ( j < total )
			{
				current = html.charAt( j );
				if ( current <= ' ' || current === '/' || current === '>' )
				{
					break;
				}
				j++;
			}
			return html.substring( from, j ).toLowerCase();
		}

		/**
		 * Returns the index of the first closing tag of the given (lower case) name
		 * at or behind from, compared case-insensitively, or -1 if there is none.
		 */
		private static function indexOfClosingTag( html : String, name : String, from : int ) : int
		{
			var index : int = html.indexOf( '</', from );
			while ( index >= 0 )
			{
				if ( html.substr( index + 2, name.length ).toLowerCase() === name )
				{
					return index;
				}
				index = html.indexOf( '</', index + 2 );
			}
			return -1;
		}

		/**
		 * Whether the given character is an ASCII letter.
		 */
		private static function isLetter( current : String ) : Boolean
		{
			return ( current >= 'a' && current <= 'z' ) || ( current >= 'A' && current <= 'Z' );
		}

		/**
		 * Removes block comments from the content of a style element.
		 * Quoted strings are copied verbatim.
		 */
		private static function stripStyle( css : String ) : String
		{
			var total		: int = css.length;
			var kept		: Array = [];
			var keepFrom	: int = 0;
			var i			: int = 0;
			var end			: int;
			var current		: String;

			while ( i < total )
			{
				current = css.charAt( i );

				if ( current === '"' || current === "'" )
				{
					i = skipQuoted( css, i );
					continue;
				}

				if ( current === '/' && css.charAt( i + 1 ) === '*' )
				{
					end = css.indexOf( '*/', i + 2 );
					kept.push( css.substring( keepFrom, i ) );
					i = ( end < 0 ) ? total : end + 2;
					keepFrom = i;
					continue;
				}

				i++;
			}

			kept.push( css.substring( keepFrom ) );
			return kept.join( '' );
		}

		/**
		 * Removes line and block comments from the content of a script element.
		 * String literals, regular expression literals and (unless removeCondComp is
		 * set) conditional compilation blocks are copied verbatim. A line comment is
		 * removed up to, but not including, its line terminator.
		 */
		private static function stripScript( script : String, removeCondComp : Boolean ) : String
		{
			var total		: int = script.length;
			var kept		: Array = [];
			var keepFrom	: int = 0;
			var lastCode	: int = -1;	// index of the last non-whitespace code character
			var i			: int = 0;
			var end			: int;
			var current		: String;
			var following	: String;

			while ( i < total )
			{
				current = script.charAt( i );

				if ( current === '"' || current === "'" )
				{
					i = skipQuoted( script, i );
					lastCode = i - 1;
					continue;
				}

				if ( current === '/' )
				{
					following = script.charAt( i + 1 ); // '' when '/' is the last character

					if ( following === '*' )
					{
						if ( ! removeCondComp && script.charAt( i + 2 ) === '@' )
						{
							// conditional compilation block: keep everything up to and including "@*/"
							end = script.indexOf( '@*/', i + 2 );
							i = ( end < 0 ) ? total : end + 3;
							continue;
						}

						end = script.indexOf( '*/', i + 2 );
						kept.push( script.substring( keepFrom, i ) );
						i = ( end < 0 ) ? total : end + 2;
						keepFrom = i;
						continue;
					}

					if ( following === '/' )
					{
						kept.push( script.substring( keepFrom, i ) );
						i = indexOfLineEnd( script, i + 2 ); // the line terminator itself is kept
						keepFrom = i;
						continue;
					}

					if ( canStartRegExp( script, lastCode ) )
					{
						end = indexAfterRegExp( script, i );
						if ( end !== -1 )
						{
							i = end;
							lastCode = i - 1;
							continue;
						}
						// not closed on this line: fall through and treat '/' as an operator
					}
				}

				// characters up to and including the space character do not count as code
				if ( current > ' ' )
				{
					lastCode = i;
				}
				i++;
			}

			kept.push( script.substring( keepFrom ) );
			return kept.join( '' );
		}

		/**
		 * Returns the index just behind the quoted string that opens at start.
		 * A backslash always protects the character that follows it.
		 * Returns source.length when the string is never closed.
		 */
		private static function skipQuoted( source : String, start : int ) : int
		{
			var total	: int = source.length;
			var quote	: String = source.charAt( start );
			var j		: int = start + 1;
			var current	: String;

			while ( j < total )
			{
				current = source.charAt( j );
				if ( current === '\\' )
				{
					j += 2;
					continue;
				}
				if ( current === quote )
				{
					return j + 1;
				}
				j++;
			}
			return total;
		}

		/**
		 * Returns the index of the first line terminator at or behind from,
		 * or source.length if there is none.
		 */
		private static function indexOfLineEnd( source : String, from : int ) : int
		{
			var total	: int = source.length;
			var j		: int = from;

			while ( j < total )
			{
				if ( isLineEnd( source.charAt( j ) ) )
				{
					return j;
				}
				j++;
			}
			return total;
		}

		/**
		 * Whether the given character is LF, CR, U+2028 or U+2029.
		 */
		private static function isLineEnd( current : String ) : Boolean
		{
			return current === '\n' || current === '\r' || current === '\u2028' || current === '\u2029';
		}

		/**
		 * Whether the given character is an ASCII letter, a digit, '_' or '$'.
		 */
		private static function isWordChar( current : String ) : Boolean
		{
			return ( current >= 'a' && current <= 'z' )
				|| ( current >= 'A' && current <= 'Z' )
				|| ( current >= '0' && current <= '9' )
				|| current === '_'
				|| current === '$';
		}

		/**
		 * Decides whether a '/' may open a regular expression literal, judging by the
		 * last code character in front of it (lastCode is -1 if there is none).
		 * True at the start of the script, after one of REGEXP_PREFIX_CHARS and after
		 * one of REGEXP_PREFIX_WORDS; false otherwise.
		 */
		private static function canStartRegExp( source : String, lastCode : int ) : Boolean
		{
			if ( lastCode < 0 )
			{
				return true;
			}

			var previous : String = source.charAt( lastCode );
			if ( REGEXP_PREFIX_CHARS.indexOf( previous ) >= 0 )
			{
				return true;
			}
			if ( ! isWordChar( previous ) )
			{
				return false;
			}

			// collect the whole word that ends at lastCode
			var wordStart : int = lastCode;
			while ( wordStart > 0 && isWordChar( source.charAt( wordStart - 1 ) ) )
			{
				wordStart--;
			}
			var word : String = source.substring( wordStart, lastCode + 1 );
			return REGEXP_PREFIX_WORDS.indexOf( ' ' + word + ' ' ) >= 0;
		}

		/**
		 * Returns the index just behind the closing '/' of the regular expression
		 * literal that opens at start, or -1 if no closing '/' is found before the
		 * next line terminator. A '/' inside a character class ([...]) or behind a
		 * backslash does not close the literal.
		 */
		private static function indexAfterRegExp( source : String, start : int ) : int
		{
			var total	: int = source.length;
			var inClass	: Boolean = false;
			var j		: int = start + 1;
			var current	: String;

			while ( j < total )
			{
				current = source.charAt( j );
				if ( isLineEnd( current ) )
				{
					return -1;
				}
				if ( current === '\\' )
				{
					j += 2;
					continue;
				}
				if ( current === '[' )
				{
					inClass = true;
				}
				else if ( current === ']' )
				{
					inClass = false;
				}
				else if ( current === '/' && ! inClass )
				{
					return j + 1;
				}
				j++;
			}
			return -1;
		}
	}
}

Let me know if I made a mistake, but it seems to work quite well. Or do you know a good library that already does this kind of optimization?

Well, that’s it for now. Have fun coding!

Leave a Reply

Your email address will not be published. Required fields are marked *

You may use these HTML tags and attributes: <a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <strike> <strong>