temboz - Check-in [462]
Not logged in
[Honeypot]  [Browse]  [Help]  [Home]  [Login]  [Reports]
[Search]  [Ticket]  [Timeline]  [Wiki]
  [Patchset]  [Tagging/Branching]
Check-in Number: 462
Date: 2010-Feb-11 11:59:37 (local)
2010-Feb-11 19:59:37 (UTC)
User: majid
Branch:
Comment: #101 fix unbalanced CDATA sections
Tickets:
#101 HTML balancer does not deal with unbalanced CDATA sections
Inspections:
Files:
temboz/normalize.py      1.61 -> 1.62     7 inserted, 2 deleted

temboz/normalize.py 1.61 -> 1.62
--- /tmp/T0cjayA1	Mon Sep  6 18:09:00 2010
+++ /tmp/T1djayA1	Mon Sep  6 18:09:00 2010
@@ -289,9 +289,11 @@
 closing = set(closing)
 banned = set(banned)
 
-# XXX should really use html5lib for this, as this lexer is not robust, e.g.
+# XXX should really use html5lib for this once it has stabilized,
+# XXX as this lexer is not robust, e.g.
 # XXX <a href="javascript:alert('foo>bar')">
-tag_re = re.compile('(<.*?>)', re.DOTALL)
+tag_re = re.compile(r'(<>|<[^!].*?>|<!\[CDATA\[|\]\]>)',
+                    re.DOTALL | re.MULTILINE)
 def balance(html, limit_words=None, ellipsis=' ...'):
   word_count = 0
   tokens = tag_re.split(html)
@@ -298,6 +300,7 @@
   out = []
   stack = []
   for token in tokens:
+    print '@'*8, token, stack
     if not token.startswith('<'):
       if limit_words and word_count > limit_words:
         break
@@ -309,6 +312,8 @@
       else:
         out.append(token)
       continue
+    if token == '<![CDATA[': continue
+    if token == ']]>': continue
     if not token.endswith('>'): continue # invalid
     element = token[1:-1].split()[0].lower()
     if not element: continue # invalid