How to Convert HTML to Plain Text in asp.net


Description:-

In this example we explain that how to convert Text to HTML in asp.net with c#. or how to convert HTML to Formatted Plain Text in asp.net.

Sometimes we needs to display text from a file or database that contain a HTML text or string. This text may be entered by the user that has not been formatted for HTML. In these cases, the text must be converted.

.Net already provide the HttpUtility.HtmlEncode () method to encode special characters so that they will appear as expected in a web browser. But problem is that , this method won't do anything with line breaks and paragraphs.

So When your application needs to convert or display unformatted text to Formatted Plain Text that contains multiple lines and paragraphs on a Web page, a little more work or code is required that we describe bellows.

Default.aspx:-

<div>
  <asp:Label ID="Label1" runat="server" Text="Label"></asp:Label>
</div>

Default.aspx.cs:-


protected void Page_Load(object sender, EventArgs e)
{
  Label1.Text = StripHTML("<html><head></head><p>Hi how are you </p><h1> this is heading </h1></html>");
}

private string StripHTML(string source)
{
  try
  {
    string result;
    // Remove HTML Development formatting
    // Replace line breaks with space
    // because browsers inserts space
    result = source.Replace("\r", " ");
    // Replace line breaks with space
    // because browsers inserts space
    result = result.Replace("\n", " ");
    // Remove step-formatting
    result = result.Replace("\t", string.Empty);
    // Remove repeating spaces because browsers ignore them
    result = System.Text.RegularExpressions.Regex.Replace(result, @"( )+", " ");
    // Remove the header (prepare first by clearing attributes)
    result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*head([^>])*>", "<head>",     System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    result = System.Text.RegularExpressions.Regex.Replace(result, @"(<( )*(/)( )*head( )*>)", "</head>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    result = System.Text.RegularExpressions.Regex.Replace(result, "(<head>).*(</head>)",     string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    // remove all scripts (prepare first by clearing attributes)
    result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*script([^>])*>", "<script>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    result = System.Text.RegularExpressions.Regex.Replace(result, @"(<( )*(/)( )*script( )*>)", "</script>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    //result = System.Text.RegularExpressions.Regex.Replace(result,
    // @"(<script>)([^(<script>\.</script>)])*(</script>)",
    // string.Empty,
    // System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    result = System.Text.RegularExpressions.Regex.Replace(result, @"(<script>).*(</script>)", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    // remove all styles (prepare first by clearing attributes)
    result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*style([^>])*>", "<style>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    result = System.Text.RegularExpressions.Regex.Replace(result, @"(<( )*(/)( )*style( )*>)", "</style>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    result = System.Text.RegularExpressions.Regex.Replace(result, "(<style>).*(</style>)", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    // insert tabs in spaces of <td> tags
    result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*td([^>])*>", "\t",     System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    // insert line breaks in places of <BR> and <LI> tags
    result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*br( )*>", "\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*li( )*>", "\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    // insert line paragraphs (double line breaks) in place
    // if <P>, <DIV> and <TR> tags
    result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*div([^>])*>", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*tr([^>])*>", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*p([^>])*>", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    // Remove remaining tags like <a>, links, images,
    // comments etc - anything that's enclosed inside <>
    result = System.Text.RegularExpressions.Regex.Replace(result, @"<[^>]*>", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    // replace special characters:
    result = System.Text.RegularExpressions.Regex.Replace(result, @" ", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    result = System.Text.RegularExpressions.Regex.Replace(result, @"&bull;", " * ", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    result = System.Text.RegularExpressions.Regex.Replace(result, @"&lsaquo;", "<", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    result = System.Text.RegularExpressions.Regex.Replace(result, @"&rsaquo;", ">", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    result = System.Text.RegularExpressions.Regex.Replace(result, @"&trade;", "(tm)", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    result = System.Text.RegularExpressions.Regex.Replace(result, @"&frasl;", "/", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    result = System.Text.RegularExpressions.Regex.Replace(result, @"&lt;", "<", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    result = System.Text.RegularExpressions.Regex.Replace(result, @"&gt;", ">", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    result = System.Text.RegularExpressions.Regex.Replace(result, @"&copy;", "(c)", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    result = System.Text.RegularExpressions.Regex.Replace(result, @"&reg;", "(r)", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    // Remove all others. More can be added, see
    // http://hotwired.lycos.com/webmonkey/reference/special_characters/
    result = System.Text.RegularExpressions.Regex.Replace(result, @"&(.{2,6});", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    // for testing
    //System.Text.RegularExpressions.Regex.Replace(result,
    // this.txtRegex.Text,string.Empty,
    // System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    // make line breaking consistent
    result = result.Replace("\n", "\r");
    // Remove extra line breaks and tabs:
    // replace over 2 breaks with 2 and over 4 tabs with 4.
    // Prepare first to remove any whitespaces in between
    // the escaped characters and remove redundant tabs in between line breaks
    result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)( )+(\r)", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    result = System.Text.RegularExpressions.Regex.Replace(result, "(\t)( )+(\t)", "\t\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    result = System.Text.RegularExpressions.Regex.Replace(result, "(\t)( )+(\r)", "\t\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)( )+(\t)", "\r\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    // Remove redundant tabs
    result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)(\t)+(\r)", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    // Remove multiple tabs following a line break with just one tab
    result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)(\t)+", "\r\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    // Initial replacement target string for line breaks
    string breaks = "\r\r\r";
    // Initial replacement target string for tabs
    string tabs = "\t\t\t\t\t";
    for (int index = 0; index <result.Length; index++)
    {
      result = result.Replace(breaks, "\r\r");
      result = result.Replace(tabs, "\t\t\t\t");
      breaks = breaks + "\r";
      tabs = tabs + "\t";
    }
    // That's it.
    return result;
  }
  catch
  {
    return source;
  }
}

Related Posts

Previous
Next Post »

2 comments

comments

Thanks for comments.....