2 """html2text: Turn HTML into equivalent Markdown-structured text."""
4 __author__ = "Aaron Swartz (me@aaronsw.com)"
5 __copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
6 __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
9 # Support decoded entities with unifiable.
11 if not hasattr(__builtins__, 'True'): True, False = 1, 0
12 import re, sys, urllib, htmlentitydefs, codecs
15 sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')
17 try: from textwrap import wrap
20 # Use Unicode characters instead of their ASCII pseudo-replacements
23 # Put the links after each paragraph instead of at the end.
24 LINKS_EACH_PARAGRAPH = 0
26 # Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
29 # Don't show internal links (href="#local-anchor") -- corresponding link targets
30 # won't be visible in the plain text file anyway.
31 SKIP_INTERNAL_LINKS = False
33 ### Entity Nonsense ###
36 if k == 'apos': return ord("'")
37 if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
38 return htmlentitydefs.name2codepoint[k]
40 k = htmlentitydefs.entitydefs[k]
41 if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
42 return ord(codecs.latin_1_decode(k)[0])
# Named HTML entities mapped to plain-ASCII stand-ins.  The entity handlers
# look names up here only when UNICODE_SNOB is off.
unifiable = {
    # curly quotation marks
    'rsquo': "'",
    'lsquo': "'",
    'rdquo': '"',
    'ldquo': '"',
    # symbols and spacing
    'copy': '(C)',
    'mdash': '--',
    'nbsp': ' ',
    'rarr': '->',
    'larr': '<-',
    'middot': '*',
    'ndash': '-',
    # ligatures
    'oelig': 'oe',
    'aelig': 'ae',
    # accented "a"
    'agrave': 'a',
    'aacute': 'a',
    'acirc': 'a',
    'atilde': 'a',
    'auml': 'a',
    'aring': 'a',
    # accented "e"
    'egrave': 'e',
    'eacute': 'e',
    'ecirc': 'e',
    'euml': 'e',
    # accented "i"
    'igrave': 'i',
    'iacute': 'i',
    'icirc': 'i',
    'iuml': 'i',
    # accented "o"
    'ograve': 'o',
    'oacute': 'o',
    'ocirc': 'o',
    'otilde': 'o',
    'ouml': 'o',
    # accented "u"
    'ugrave': 'u',
    'uacute': 'u',
    'ucirc': 'u',
    'uuml': 'u',
}
55 for k in unifiable.keys():
56 unifiable_n[name2cp(k)] = unifiable[k]
59 if name[0] in ['x','X']:
64 if not UNICODE_SNOB and c in unifiable_n.keys():
70 if not UNICODE_SNOB and c in unifiable.keys():
74 except KeyError: return "&" + c
75 else: return unichr(name2cp(c))
77 def replaceEntities(s):
81 else: return entityref(s)
83 r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
85 return r_unescape.sub(replaceEntities, s)
88 # Fix bug in sgmllib.py
89 if not attrs: return attrs
92 newattrs.append((attr[0], unescape(attr[1])))
95 ### End Entity Nonsense ###
98 """Return true if the line consists only of whitespace characters."""
100 if c is not ' ' and c is not ' ':
105 """Wrap all paragraphs in the provided text."""
109 assert wrap, "Requires Python 2.3."
112 for para in text.split("\n"):
114 if para[0] is not ' ' and para[0] is not '-' and para[0] is not '*':
115 for line in wrap(para, BODY_WIDTH):
116 result += line + "\n"
120 if not onlywhite(para):
121 result += para + "\n"
130 if tag[0] == 'h' and len(tag) == 2:
133 if n in range(1, 10): return n
134 except ValueError: return 0
136 class _html2text(sgmllib.SGMLParser):
137 def __init__(self, out=sys.stdout.write, baseurl=''):
138 sgmllib.SGMLParser.__init__(self)
140 if out is None: self.out = self.outtextf
156 self.abbr_title = None # current abbreviation definition
157 self.abbr_data = None # last inner HTML (for abbr being defined)
158 self.abbr_list = {} # stack of abbreviations to write later
159 self.baseurl = baseurl
161 def outtextf(self, s):
165 sgmllib.SGMLParser.close(self)
172 def handle_charref(self, c):
175 def handle_entityref(self, c):
def unknown_starttag(self, tag, attrs):
    """Forward an opening tag and its attributes to handle_tag.

    handle_tag is called with its ``start`` argument set to 1, which marks
    the event as a start tag.
    """
    opening = 1
    self.handle_tag(tag, attrs, opening)
def unknown_endtag(self, tag):
    """Forward a closing tag to handle_tag.

    A closing tag carries no attributes, so None is passed for ``attrs``;
    ``start`` is 0 to mark the event as an end tag.
    """
    no_attrs = None
    closing = 0
    self.handle_tag(tag, no_attrs, closing)
184 def previousIndex(self, attrs):
185 """ returns the index of a certain set of attributes (of a link) in the
188 If the set of attributes is not found, returns None
190 if not attrs.has_key('href'): return None
197 if a.has_key('href') and a['href'] == attrs['href']:
198 if a.has_key('title') or attrs.has_key('title'):
199 if (a.has_key('title') and attrs.has_key('title') and
200 a['title'] == attrs['title']):
207 def handle_tag(self, tag, attrs, start):
208 attrs = fixattrs(attrs)
212 if start: self.o(hn(tag)*"#" + ' ')
214 if tag in ['p', 'div']: self.p()
216 if tag == "br" and start: self.o(" \n")
218 if tag == "hr" and start:
223 if tag in ["head", "style", 'script']:
224 if start: self.quiet += 1
225 else: self.quiet -= 1
228 self.quiet = 0 # sites like 9rules.com never close <head>
230 if tag == "blockquote":
232 self.p(); self.o('> ', 0, 1); self.start = 1
238 if tag in ['em', 'i', 'u']: self.o("_")
239 if tag in ['strong', 'b']: self.o("**")
240 if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
244 for (x, y) in attrs: attrsD[x] = y
247 self.abbr_title = None
249 if attrs.has_key('title'):
250 self.abbr_title = attrs['title']
252 if self.abbr_title != None:
253 self.abbr_list[self.abbr_data] = self.abbr_title
254 self.abbr_title = None
260 for (x, y) in attrs: attrsD[x] = y
262 if attrs.has_key('href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')):
263 self.astack.append(attrs)
266 self.astack.append(None)
269 a = self.astack.pop()
271 i = self.previousIndex(a)
276 a['count'] = self.acount
277 a['outcount'] = self.outcount
279 self.o("][" + `a['count']` + "]")
281 if tag == "img" and start:
283 for (x, y) in attrs: attrsD[x] = y
285 if attrs.has_key('src'):
286 attrs['href'] = attrs['src']
287 alt = attrs.get('alt', '')
288 i = self.previousIndex(attrs)
293 attrs['count'] = self.acount
294 attrs['outcount'] = self.outcount
298 self.o("]["+`attrs['count']`+"]")
300 if tag == 'dl' and start: self.p()
301 if tag == 'dt' and not start: self.pbr()
302 if tag == 'dd' and start: self.o(' ')
303 if tag == 'dd' and not start: self.pbr()
305 if tag in ["ol", "ul"]:
307 self.list.append({'name':tag, 'num':0})
309 if self.list: self.list.pop()
316 if self.list: li = self.list[-1]
317 else: li = {'name':'ul', 'num':0}
318 self.o(" "*len(self.list)) #TODO: line up <ol><li>s > 9 correctly.
319 if li['name'] == "ul": self.o("* ")
320 elif li['name'] == "ol":
322 self.o(`li['num']`+". ")
327 if tag in ["table", "tr"] and start: self.p()
328 if tag == 'td': self.pbr()
339 if self.p_p == 0: self.p_p = 1
def p(self):
    """Request a paragraph break by setting the pending-break count to 2.

    The output code writes ``('\\n' + bq) * self.p_p`` for a pending break,
    so a value of 2 yields two line breaks.
    """
    self.p_p = 2
343 def o(self, data, puredata=0, force=0):
344 if self.abbr_data is not None: self.abbr_data += data
347 if puredata and not self.pre:
348 data = re.sub('\s+', ' ', data)
349 if data and data[0] == ' ':
352 if not data and not force: return
355 #self.out(" :") #TODO: not output when already one there
358 bq = (">" * self.blockquote)
359 if not (force and data and data[0] == ">") and self.blockquote: bq += " "
363 data = data.replace("\n", "\n"+bq)
378 self.out(('\n'+bq)*self.p_p)
382 if not self.lastWasNL: self.out(' ')
385 if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
386 if force == "end": self.out("\n")
390 if self.outcount > link['outcount']:
391 self.out(" ["+`link['count']`+"]: " + urlparse.urljoin(self.baseurl, link['href']))
392 if link.has_key('title'): self.out(" ("+link['title']+")")
397 if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
401 if self.abbr_list and force == "end":
402 for abbr, definition in self.abbr_list.items():
403 self.out(" *[" + abbr + "]: " + definition + "\n")
407 self.lastWasNL = data and data[-1] == '\n'
410 def handle_data(self, data):
411 if r'\/script>' in data: self.quiet -= 1
def unknown_decl(self, data):
    """Discard *data* without producing any output.

    NOTE(review): presumably overrides the SGMLParser hook for unrecognised
    declarations so they are ignored -- confirm against sgmllib.
    """
    return None
def wrapwrite(text):
    """Encode *text* as UTF-8 and write the result to standard output."""
    encoded = text.encode('utf8')
    sys.stdout.write(encoded)
418 def html2text_file(html, out=wrapwrite, baseurl=''):
419 h = _html2text(out, baseurl)
def html2text(html, baseurl=''):
    """Convert *html* to text and return it after optional wrapping.

    html2text_file is called with None as its output callback, and whatever
    it returns is passed through optwrap.  *baseurl* is handed on to
    html2text_file unchanged.
    """
    converted = html2text_file(html, None, baseurl)
    return optwrap(converted)
427 if __name__ == "__main__":
431 if arg.startswith('http://'):
433 j = urllib.urlopen(baseurl)
435 from feedparser import _getCharacterEncoding as enc
437 enc = lambda x, y: ('utf-8', 1)
439 encoding = enc(j.headers, text)[0]
440 if encoding == 'us-ascii': encoding = 'utf-8'
441 data = text.decode(encoding)
445 if len(sys.argv) > 2:
446 encoding = sys.argv[2]
449 data = f.read().decode(encoding)
453 data = sys.stdin.read().decode('utf8')
454 wrapwrite(html2text(data, baseurl))
457 # vim:expandtab:smartindent:tabstop=4:softtabstop=4:shiftwidth=4: