Excel VBA to convert HTML to plain text

Microsoft Excel 2010 - complete product...
December 18, 2014 at 05:39:40
Specs: Windows 7 Service Pack 1
I have an Excel Query that pulls information in from another database one of the fields comes across in HTML format. I have found the code below to convert it to text however it leaves a lot of empty spaces above and below. Any ideas on how to remove the spaces?

Here is the HTML text that is being pulled in all I need to see is the Revision 1 sent 12/3 and the next line Revision 2 sent 12/15:

"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Transitional//EN"">
<HTML><HEAD>
<STYLE type=text/css> P, UL, OL, DL, DIR, MENU, PRE { margin: 0 auto;}</STYLE>

<META name=GENERATOR content=""MSHTML 9.00.8112.16599""></HEAD>
<BODY leftMargin=1 rightMargin=1 topMargin=1>
<DIV><font face="">
<DIV>Revision 1 sent 12/3</DIV>
<DIV>Revision 2 sent 12/15</DIV></FONT></DIV></BODY></HTML>

Here is the code I have found that does this but leaves empty space:

Function StripHTML(cell As Range) As String
Dim RegEx As Object
Set RegEx = CreateObject("vbscript.regexp")
Dim sInput As String
Dim sOut As String
sInput = cell.text
 
    sInput = Replace(sInput, "\x0D\x0A", Chr(10))
    sInput = Replace(sInput, "\x00", Chr(10))
 
    
    sInput = Replace(sInput, "</P>", Chr(10) & Chr(10))
    sInput = Replace(sInput, "
", Chr(10))
    sInput = Replace(sInput, "<li>", "-")
  
    sInput = Replace(sInput, "–", "–")
    sInput = Replace(sInput, "—", "—")
    sInput = Replace(sInput, "¡", "¡")
    sInput = Replace(sInput, "¿", "¿")
    sInput = Replace(sInput, """, "")
    sInput = Replace(sInput, "“", "“")
    sInput = Replace(sInput, "”", "”")
    sInput = Replace(sInput, "", "'")
    sInput = Replace(sInput, "‘", "‘")
    sInput = Replace(sInput, "’", "’")
    sInput = Replace(sInput, "«", "«")
    sInput = Replace(sInput, "»", "»")
    sInput = Replace(sInput, " ", "  ")
    sInput = Replace(sInput, "&", "&")
    sInput = Replace(sInput, "¢", "¢")
    sInput = Replace(sInput, "©", "©")
    sInput = Replace(sInput, "÷", "÷")
    sInput = Replace(sInput, ">", ">")
    sInput = Replace(sInput, "<", "<")
    sInput = Replace(sInput, "µ", "µ")
    sInput = Replace(sInput, "·", "·")
    sInput = Replace(sInput, "¶", "¶")
    sInput = Replace(sInput, "±", "±")
    sInput = Replace(sInput, "€", "€")
    sInput = Replace(sInput, "£", "£")
    sInput = Replace(sInput, "®", "®")
    sInput = Replace(sInput, "§", "§")
    sInput = Replace(sInput, "™", "™")
    sInput = Replace(sInput, "¥", "¥")
    sInput = Replace(sInput, "á", "á")
    sInput = Replace(sInput, "Á", "Á")
    sInput = Replace(sInput, "à", "à")
    sInput = Replace(sInput, "À", "À")
    sInput = Replace(sInput, "â", "â")
    sInput = Replace(sInput, "Â", "Â")
    sInput = Replace(sInput, "å", "å")
    sInput = Replace(sInput, "Å", "Å")
    sInput = Replace(sInput, "ã", "ã")
    sInput = Replace(sInput, "Ã", "Ã")
    sInput = Replace(sInput, "ä", "ä")
    sInput = Replace(sInput, "Ä", "Ä")
    sInput = Replace(sInput, "æ", "æ")
    sInput = Replace(sInput, "Æ", "Æ")
    sInput = Replace(sInput, "ç", "ç")
    sInput = Replace(sInput, "Ç", "Ç")
    sInput = Replace(sInput, "é", "é")
    sInput = Replace(sInput, "É", "É")
    sInput = Replace(sInput, "è", "è")
    sInput = Replace(sInput, "È", "È")
    sInput = Replace(sInput, "ê", "ê")
    sInput = Replace(sInput, "Ê", "Ê")
    sInput = Replace(sInput, "ë", "ë")
    sInput = Replace(sInput, "Ë", "Ë")
    sInput = Replace(sInput, "í", "í")
    sInput = Replace(sInput, "Í", "Í")
    sInput = Replace(sInput, "ì", "ì")
    sInput = Replace(sInput, "Ì", "Ì")
    sInput = Replace(sInput, "î", "î")
    sInput = Replace(sInput, "Î", "Î")
    sInput = Replace(sInput, "ï", "ï")
    sInput = Replace(sInput, "Ï", "Ï")
    sInput = Replace(sInput, "ñ", "ñ")
    sInput = Replace(sInput, "Ñ", "Ñ")
    sInput = Replace(sInput, "ó", "ó")
    sInput = Replace(sInput, "Ó", "Ó")
    sInput = Replace(sInput, "ò", "ò")
    sInput = Replace(sInput, "Ò", "Ò")
    sInput = Replace(sInput, "ô", "ô")
    sInput = Replace(sInput, "Ô", "Ô")
    sInput = Replace(sInput, "ø", "ø")
    sInput = Replace(sInput, "Ø", "Ø")
    sInput = Replace(sInput, "õ", "õ")
    sInput = Replace(sInput, "Õ", "Õ")
    sInput = Replace(sInput, "ö", "ö")
    sInput = Replace(sInput, "Ö", "Ö")
    sInput = Replace(sInput, "ß", "ß")
    sInput = Replace(sInput, "ú", "ú")
    sInput = Replace(sInput, "Ú", "Ú")
    sInput = Replace(sInput, "ù", "ù")
    sInput = Replace(sInput, "Ù", "Ù")
    sInput = Replace(sInput, "û", "û")
    sInput = Replace(sInput, "Û", "Û")
    sInput = Replace(sInput, "ü", "ü")
    sInput = Replace(sInput, "Ü", "Ü")
    sInput = Replace(sInput, "ÿ", "ÿ")
    sInput = Replace(sInput, "", "´")
    sInput = Replace(sInput, "", "`")
    sInput = Replace(sInput, "P, UL, OL, DL, DIR, MENU, PRE { margin: 0 auto;}", "")
       
With RegEx
   .Global = True
   .IgnoreCase = True
   .MultiLine = True
   .Pattern = "<[^>]+>" 'Regular Expression for HTML Tags.
 
End With
    sOut = RegEx.Replace(sInput, "")
    StripHTML = sOut
    Set RegEx = Nothing
 
End Function

Any ideas on how to accomplish this or any other way that anyone is aware of?

Thank you,
Sandi


See More: Excel VBA to convert HTML to plain text

Report •

#1
December 18, 2014 at 05:56:44
Without testing i would say that you can change the line that reads

sInput = Replace(sInput, " ", "  ")

to

sInput = Replace(sInput, " ", "")

This should stop the double spacing between words. Try that


Report •

#2
December 18, 2014 at 06:08:58
I am not finding that particular line in the code?

Report •

#3
December 18, 2014 at 07:00:45
its about 25 lines down from where the code begins, just below

sInput = Replace(sInput, "»", "»")


Report •

Related Solutions

#4
December 18, 2014 at 07:11:00
Ok found and tried it but it didn't work....I believe they are actually carriage returns I tried taking out the Chr(10)'s in the top portion of the formula but that didn't work either.

Report •

Ask Question