| Check-in Number: | 462 |
| Date: | 2010-Feb-11 11:59:37 (local); 2010-Feb-11 19:59:37 (UTC) |
| User: | majid |
| Branch: | |
| Comment: | #101 fix unbalanced CDATA sections |
| Tickets: | #101 | HTML balancer does not deal with unbalanced CDATA sections |
| Inspections: |
|
| Files: |
|
temboz/normalize.py 1.61 -> 1.62
--- /tmp/T0cjayA1 Mon Sep 6 18:09:00 2010
+++ /tmp/T1djayA1 Mon Sep 6 18:09:00 2010
@@ -289,9 +289,11 @@
closing = set(closing)
banned = set(banned)
-# XXX should really use html5lib for this, as this lexer is not robust, e.g.
+# XXX should really use html5lib for this once it has stabilized,
+# XXX as this lexer is not robust, e.g.
# XXX <a href="javascript:alert('foo>bar')">
-tag_re = re.compile('(<.*?>)', re.DOTALL)
+tag_re = re.compile(r'(<>|<[^!].*?>|<!\[CDATA\[|\]\]>)',
+ re.DOTALL | re.MULTILINE)
def balance(html, limit_words=None, ellipsis=' ...'):
word_count = 0
tokens = tag_re.split(html)
@@ -298,6 +300,7 @@
out = []
stack = []
for token in tokens:
+ print '@'*8, token, stack
if not token.startswith('<'):
if limit_words and word_count > limit_words:
break
@@ -309,6 +312,8 @@
else:
out.append(token)
continue
+ if token == '<![CDATA[': continue
+ if token == ']]>': continue
if not token.endswith('>'): continue # invalid
element = token[1:-1].split()[0].lower()
if not element: continue # invalid