C# Strip HTML and Attributes in MVC

This post documents a technique for removing HTML elements and their attributes from user-submitted content in an MVC app.

If you are receiving this error message when posting content that contains HTML:

A potentially dangerous Request.Form value was detected from the client

Add ValidateInput(false) to your controller’s HttpPost method:

[HttpPost, ValidateInput(false)]

See this blog article for the class that does the HTML stripping: http://www.dijksterhuis.org/safely-cleaning-html-with-strip_tags-in-csharp/.

In my implementation, the list of allowed tags is a string in the web.config:

<configuration>
    <appSettings>
        <add key="HtmlAllowed" value="a,b,br,em,i,p,strong"/>
    </appSettings>
</configuration>

The settings class contains this line, which splits the comma-separated HtmlAllowed appSetting from web.config into a string array:

/// <summary>
/// Gets the allowed HTML tag names read from the "HtmlAllowed" appSetting,
/// which is expected to hold a comma-separated list such as "a,b,br".
/// </summary>
/// <returns>
/// The tag names with surrounding whitespace removed and blank entries
/// dropped; an empty array when the setting is not present.
/// </returns>
public static string[] HtmlAllowed
{
    get
    {
        string raw = ConfigurationManager.AppSettings["HtmlAllowed"];

        // A missing key yields null; return "nothing allowed" instead of
        // failing with a NullReferenceException on Split.
        if (raw == null)
        {
            return new string[0];
        }

        var tags = new System.Collections.Generic.List<string>();
        foreach (string part in raw.Split(','))
        {
            // "a, b" must produce "b", not " b", and stray commas must not
            // produce an empty tag name in the allow-list.
            string tag = part.Trim();
            if (tag.Length > 0)
            {
                tags.Add(tag);
            }
        }

        return tags.ToArray();
    }
}

Here is an example call that strips HTML posted from a form input named “MyTextarea”, passing the allowed-tags array as the second argument:

StripHtml.StripTagsAndAttributes(collection["MyTextarea"].Trim(), Settings.HtmlAllowed );

Source code

using System;
using System.Text.RegularExpressions;

namespace StripHTML
{
    class MainClass
    {

        /// <summary>
        /// Replaces the first occurrence of <paramref name="needle"/> in
        /// <paramref name="haystack"/> with <paramref name="replacement"/>.
        /// </summary>
        /// <param name="haystack">The text to search.</param>
        /// <param name="needle">The exact text to look for.</param>
        /// <param name="replacement">The text inserted in place of the first match.</param>
        /// <returns>
        /// The text with the first match replaced, or <paramref name="haystack"/>
        /// unchanged when <paramref name="needle"/> does not occur in it.
        /// </returns>
        private static string ReplaceFirst(string haystack, string needle, string replacement)
        {
            // Ordinal search: the slice below assumes the match is exactly
            // needle.Length characters long, which a culture-sensitive search
            // does not guarantee (and it can also miss matches such as "\n"
            // inside "\r\n").
            int pos = haystack.IndexOf(needle, StringComparison.Ordinal);
            if (pos < 0)
            {
                return haystack;
            }

            return haystack.Substring(0, pos) + replacement + haystack.Substring(pos + needle.Length);
        }

        /// <summary>
        /// Replaces every occurrence of <paramref name="needle"/> in
        /// <paramref name="haystack"/> with <paramref name="replacement"/>,
        /// in a single left-to-right ordinal pass.
        /// </summary>
        /// <param name="haystack">The text to search.</param>
        /// <param name="needle">The exact text to look for.</param>
        /// <param name="replacement">The text inserted in place of each match.</param>
        /// <returns>
        /// The text with all matches replaced; <paramref name="haystack"/>
        /// unchanged when <paramref name="needle"/> is null or empty or equals
        /// <paramref name="replacement"/>.
        /// </returns>
        private static string ReplaceAll(string haystack, string needle, string replacement)
        {
            // Nothing to look for, or nothing would change.
            if (string.IsNullOrEmpty(needle) || needle == replacement)
            {
                return haystack;
            }

            // string.Replace also replaces a match at index 0 and does not
            // re-scan inserted text, so it terminates even when the
            // replacement itself contains the needle.
            return haystack.Replace(needle, replacement);
        }

        public static string StripTags(string Input, string[] AllowedTags)
        {
            Regex StripHTMLExp = new Regex(@"(<\/?[^>]+>)");
            string Output = Input;

            foreach(Match Tag in StripHTMLExp.Matches(Input))
            {
                string HTMLTag = Tag.Value.ToLower();
                bool IsAllowed = false;

                foreach(string AllowedTag in AllowedTags)
                {
                    int offset = -1;

                    // Determine if it is an allowed tag
                    // "" , "');
                    if (offset!=0) offset = HTMLTag.IndexOf('<'+AllowedTag+' ');
                    if (offset!=0) offset = HTMLTag.IndexOf(" m.Groups[1].Value + "href..;,;.." + m.Groups[2].Value;
            MatchEvaluator ClassMatch = m => m.Groups[1].Value + "class..;,;.." + m.Groups[2].Value;
            MatchEvaluator UnsafeMatch = m => m.Groups[1].Value + m.Groups[4].Value;

            /* Allow the "href" attribute */
            Output = new Regex("()").Replace(Output,HrefMatch);

            /* Allow the "class" attribute */
            Output = new Regex("()").Replace(Output,ClassMatch);

            /* Remove unsafe attributes in any of the remaining tags */
            Output = new Regex(@"(<.*) .*=(\'|\""|\w)[\w|.|(|)]*(\'|\""|\w)(.*>)").Replace(Output,UnsafeMatch);

            /* Return the allowed tags to their proper form */
            Output = ReplaceAll(Output,"..;,;..", "=");

            return Output;
        }
    }
}
comments powered by Disqus