Does anybody know of a reliable way to strip out redundant HTML tags, inline styles and Microsoft Word formatting from product descriptions?
For example:
<span style="font-family: verdana, arial, sans-serif">MY CONTENT</span>
would become:
MY CONTENT
or
<p>MY CONTENT</p>
<p></p>
<p></p>
would become
<p>MY CONTENT</p>
or
<br /><br /><br />MY CONTENT
would become
<br />MY CONTENT
or
<SPAN lang=EN-IE style="mso-ansi-language: EN-IE">
<p class="MSO Normal">
<UL style="MARGIN-TOP: 0cm" type=circle><li class=MsoNormal style='mso-list:l3 level1 lfo3;tab-stops:list 36.0pt'>
<o:p>MY CONTENT</o:p></li></p></SPAN>
would become
<p>MY CONTENT</p>
I found the following, but I'm not sure how to implement it. I've tried both and neither appears to work, but I could be doing something wrong. I would like to target the '.mainDescription' content in product descriptions.
This link explains it: http://tim.mackey.ie/CommentView,gui...6602d5718.aspx
and also this code:
/**
 * Clean up an HTML fragment pasted from Microsoft Word (or a similar editor).
 *
 * The steps run in this order:
 *   1. line breaks and ` class=Mso...` attributes become a single space;
 *   2. HTML comments are removed;
 *   3. meta/link/span/font tags and Word namespace tags (`<o:p>`, `<st1:...>`,
 *      `<?xml:...>`) are removed while their inner content is kept;
 *   4. style/script/applet/embed/noframes/noscript elements are removed
 *      together with their content;
 *   5. `style` and `start` attributes are removed.
 *
 * This is a regex-based tidy-up, not a security sanitizer: do not rely on it
 * to make untrusted HTML safe to insert into a page.
 *
 * @param {string} input - Raw HTML fragment. `null`/`undefined` yield `''`;
 *   any other non-string value is converted with `String()`.
 * @returns {string} The cleaned HTML fragment.
 */
function cleanHTML(input) {
  if (input === null || input === undefined) {
    return '';
  }

  var output = String(input);
  var i;

  // 1. remove line breaks / Mso classes (unquoted, double- or single-quoted)
  output = output.replace(/(\n|\r| class=(["'])?Mso[a-zA-Z]+(["'])?)/g, ' ');

  // 2. strip Word generated HTML comments
  output = output.replace(/<!--[\s\S]*?-->/g, '');

  // 3. remove tags, leave content if any
  output = output.replace(/<(\/)*(meta|link|span|\?xml:|st1:|o:|font)(.*?)>/gi, '');

  // 4. remove these elements including everything between their tags
  var badTags = ['style', 'script', 'applet', 'embed', 'noframes', 'noscript'];
  for (i = 0; i < badTags.length; i++) {
    // Anchor on the real closing tag: matching merely "the next occurrence of
    // the tag name" stops early on e.g. <script src="script.js"> and leaves
    // the script body behind.
    var pairedStripper = new RegExp(
      '<' + badTags[i] + '\\b[^>]*>[\\s\\S]*?</' + badTags[i] + '\\s*>',
      'gi'
    );
    output = output.replace(pairedStripper, '');

    // Void or unbalanced tags (e.g. <embed ...> with no closing tag): drop
    // only the tag itself so unrelated content after it is not swallowed.
    var strayStripper = new RegExp('</?' + badTags[i] + '\\b[^>]*>', 'gi');
    output = output.replace(strayStripper, '');
  }

  // 5. remove attributes such as ' style="..."' or " style='...'"
  var badAttributes = ['style', 'start'];
  for (i = 0; i < badAttributes.length; i++) {
    var attributeStripper = new RegExp(
      '\\s' + badAttributes[i] + '\\s*=\\s*(?:"[^"]*"|\'[^\']*\')',
      'gi'
    );
    output = output.replace(attributeStripper, '');
  }

  return output;
}
Bookmarks