| Check-in Number: | 466 |
| Date: | 2010-Mar-10 12:00:13 (local); 2010-Mar-10 20:00:13 (UTC) |
| User: | majid |
| Branch: | |
| Comment: | balancing logic now takes care of unbalanced HTML comments that could mess up formatting |
| Tickets: |
|
| Inspections: |
|
| Files: |
|
temboz/normalize.py 1.63 -> 1.64
--- /tmp/T0ZhayCd Sun Sep 5 17:03:27 2010
+++ /tmp/T10hayCd Sun Sep 5 17:03:27 2010
@@ -292,7 +292,7 @@
# XXX should really use html5lib for this once it has stabilized,
# XXX as this lexer is not robust, e.g.
# XXX <a href="javascript:alert('foo>bar')">
-tag_re = re.compile(r'(<>|<[^!].*?>|<!\[CDATA\[|\]\]>)',
+tag_re = re.compile(r'(<>|<[^!].*?>|<!\[CDATA\[|\]\]>|<!--.*?-->|<[!]>)',
re.DOTALL | re.MULTILINE)
def balance(html, limit_words=None, ellipsis=' ...'):
word_count = 0
@@ -311,7 +311,7 @@
else:
out.append(token)
continue
- if token == '<![CDATA[': continue
+ if token.startswith('<!'): continue
if token == ']]>': continue
if not token.endswith('>'): continue # invalid
element = token[1:-1].split()[0].lower()