Project import/python-html2text - Diff a1da69912e...35db13463c

... ... --- a/.abf.yml
... ... +++ b/.abf.yml
... ... @@ -1 +1,2 @@
1
sources: {}
1
sources:
2
  html2text-2018.9.1.tar.gz: e981cc8ddf23ead266f9da42f25ec8d67b17c535
... ... --- a/html2text.py
... ... +++ /dev/null
... ... @@ -1,914 +0,0 @@
#!/usr/bin/env python
"""html2text: Turn HTML into equivalent Markdown-structured text."""
__version__ = "3.200.3"
__author__ = "Aaron Swartz (me@aaronsw.com)"
__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]

# TODO:
#   Support decoded entities with unifiable.

# Compatibility shim for pre-2.3 Pythons that lack the True/False builtins;
# a no-op on every modern interpreter.
try:
    True
except NameError:
    setattr(__builtins__, 'True', 1)
    setattr(__builtins__, 'False', 0)
def has_key(x, y):
    """Portability shim: membership test that works with both dict APIs.

    Uses the legacy ``has_key`` method when the mapping provides one
    (Python 2), otherwise the ``in`` operator.
    """
    if hasattr(x, 'has_key'):
        return x.has_key(y)
    else:
        return y in x
# Try the Python 2 module names first; fall back to the Python 3 locations.
try:
    import htmlentitydefs
    import urlparse
    import HTMLParser
except ImportError: #Python3
    import html.entities as htmlentitydefs
    import urllib.parse as urlparse
    import html.parser as HTMLParser
try: #Python3
    import urllib.request as urllib
except:
    import urllib
import optparse, re, sys, codecs, types

# textwrap.wrap drives optional line wrapping; missing on very old Pythons,
# in which case optwrap's assert will trip when wrapping is requested.
try: from textwrap import wrap
except: pass
37
# Use Unicode characters instead of their ascii psuedo-replacements
38
UNICODE_SNOB = 0
39
40
# Escape all special characters.  Output is less readable, but avoids corner case formatting issues.
41
ESCAPE_SNOB = 0
42
43
# Put the links after each paragraph instead of at the end.
44
LINKS_EACH_PARAGRAPH = 0
45
46
# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
47
BODY_WIDTH = 78
48
49
# Don't show internal links (href="#local-anchor") -- corresponding link targets
50
# won't be visible in the plain text file anyway.
51
SKIP_INTERNAL_LINKS = True
52
53
# Use inline, rather than reference, formatting for images and links
54
INLINE_LINKS = True
55
56
# Number of pixels Google indents nested lists
57
GOOGLE_LIST_INDENT = 36
58
59
IGNORE_ANCHORS = False
60
IGNORE_IMAGES = False
61
IGNORE_EMPHASIS = False
62
63
### Entity Nonsense ###
64
65
def name2cp(k):
    """Return the Unicode codepoint for the HTML entity name *k*."""
    if k == 'apos':
        return ord("'")
    if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
        return htmlentitydefs.name2codepoint[k]
    # ancient-Python fallback: resolve through the raw entity definitions
    entity = htmlentitydefs.entitydefs[k]
    if entity.startswith("&#") and entity.endswith(";"):
        return int(entity[2:-1]) # not in latin-1
    return ord(codecs.latin_1_decode(entity)[0])
# Ascii stand-ins for common named entities; applied unless unicode_snob
# is set.  'nbsp' is remapped to a placeholder in HTML2Text.__init__.
unifiable = {
    'rsquo': "'", 'lsquo': "'", 'rdquo': '"', 'ldquo': '"',
    'copy': '(C)', 'mdash': '--', 'nbsp': ' ', 'rarr': '->',
    'larr': '<-', 'middot': '*', 'ndash': '-', 'oelig': 'oe',
    'aelig': 'ae',
    'agrave': 'a', 'aacute': 'a', 'acirc': 'a', 'atilde': 'a',
    'auml': 'a', 'aring': 'a',
    'egrave': 'e', 'eacute': 'e', 'ecirc': 'e', 'euml': 'e',
    'igrave': 'i', 'iacute': 'i', 'icirc': 'i', 'iuml': 'i',
    'ograve': 'o', 'oacute': 'o', 'ocirc': 'o', 'otilde': 'o',
    'ouml': 'o',
    'ugrave': 'u', 'uacute': 'u', 'ucirc': 'u', 'uuml': 'u',
    'lrm': '', 'rlm': '',
}
# Codepoint-keyed view of the same table, used for numeric charrefs.
unifiable_n = {}

for entity_name in unifiable.keys():
    unifiable_n[name2cp(entity_name)] = unifiable[entity_name]

### End Entity Nonsense ###
def onlywhite(line):
    """Return a true value if the line consists only of space characters.

    Fixed to compare characters with ``==`` rather than ``is``: identity
    tests against string literals only work by the grace of CPython's
    interning, and the old ``c is '  '`` (two spaces) branch could never
    match a single character at all.  Truthiness of the result is
    preserved: falsy for an empty line or any non-space character,
    truthy (the line itself) otherwise.
    """
    for c in line:
        if c != ' ':
            return False
    return line
def hn(tag):
    """Return the header level for 'h1'..'h9' tags.

    Returns 0 when the second character is not a digit, and None (falls
    through) for any other tag name.
    """
    if len(tag) == 2 and tag.startswith('h'):
        try:
            level = int(tag[1])
        except ValueError:
            return 0
        if 1 <= level <= 9:
            return level
def dumb_property_dict(style):
    """Return a dict of css attributes parsed from a style string."""
    properties = {}
    for declaration in style.split(';'):
        if ':' not in declaration:
            continue
        name, value = declaration.split(':', 1)
        properties[name.strip()] = value.strip()
    return properties
def dumb_css_parser(data):
    """Return a dict of css selectors, each mapped to a dict of attributes."""
    # strip @import statements (the trailing ';' guarantees the inner
    # find(';') below always succeeds)
    data += ';'
    start = data.find('@import')
    while start != -1:
        end = data.find(';', start)
        data = data[0:start] + data[end + 1:]
        start = data.find('@import')

    # split into selector/declaration pairs; written without a dict
    # comprehension in order to support older pythons
    pairs = [chunk.split('{') for chunk in data.split('}') if '{' in chunk.strip()]
    try:
        parsed = dict([(selector.strip(), dumb_property_dict(body)) for selector, body in pairs])
    except ValueError:
        parsed = {} # malformed css; not that important
    return parsed
def element_style(attrs, style_def, parent_style):
    """Return the 'final' computed style dict of an element.

    Starts from a copy of the parent's style, layers on the styles of any
    matching css classes, then any inline ``style`` attribute.  Classes
    with no definition in *style_def* are skipped via ``dict.get`` instead
    of raising ``KeyError``, so one unknown class no longer aborts the
    whole conversion.
    """
    style = parent_style.copy()
    if 'class' in attrs:
        for css_class in attrs['class'].split():
            css_style = style_def.get('.' + css_class, {})
            style.update(css_style)
    if 'style' in attrs:
        immediate_style = dumb_property_dict(attrs['style'])
        style.update(immediate_style)
    return style
def google_list_style(style):
    """Decide whether a google-doc list is unordered ('ul') or ordered ('ol')."""
    unordered_markers = ('disc', 'circle', 'square', 'none')
    if style.get('list-style-type') in unordered_markers:
        return 'ul'
    return 'ol'
def google_has_height(style):
    """Check if the element's style defines the 'height' attribute explicitly."""
    return 'height' in style
def google_text_emphasis(style):
    """Return a list of all emphasis modifiers of the element.

    Collects, in a fixed order, the values of the css properties that
    google docs uses to express emphasis.
    """
    modifiers = []
    for prop in ('text-decoration', 'font-style', 'font-weight'):
        if prop in style:
            modifiers.append(style[prop])
    return modifiers
def google_fixed_width_font(style):
    """Check if the css of the current element defines a fixed width font."""
    return style.get('font-family', '') in ('Courier New', 'Consolas')
def list_numbering_start(attrs):
    """Extract the zero-based starting number from a list element's attributes."""
    if 'start' not in attrs:
        return 0
    return int(attrs['start']) - 1
class HTML2Text(HTMLParser.HTMLParser):
    """SAX-style HTML parser that renders its input as Markdown text."""

    def __init__(self, out=None, baseurl=''):
        """Set up parser state.

        out     -- optional callable receiving output fragments; defaults
                   to the internal buffer collector outtextf.
        baseurl -- base URL for resolving relative link targets.
        """
        HTMLParser.HTMLParser.__init__(self)

        # Config options (seeded from the module-level defaults; callers
        # may override the attributes after construction)
        self.unicode_snob = UNICODE_SNOB
        self.escape_snob = ESCAPE_SNOB
        self.links_each_paragraph = LINKS_EACH_PARAGRAPH
        self.body_width = BODY_WIDTH
        self.skip_internal_links = SKIP_INTERNAL_LINKS
        self.inline_links = INLINE_LINKS
        self.google_list_indent = GOOGLE_LIST_INDENT
        self.ignore_links = IGNORE_ANCHORS
        self.ignore_images = IGNORE_IMAGES
        self.ignore_emphasis = IGNORE_EMPHASIS
        self.google_doc = False
        self.ul_item_mark = '*'
        self.emphasis_mark = '_'
        self.strong_mark = '**'

        if out is None:
            self.out = self.outtextf
        else:
            self.out = out

        self.outtextlist = []  # empty list to store output characters before they are "joined"

        # unicode() exists on Python 2 only; fall back to str on Python 3
        try:
            self.outtext = unicode()
        except NameError:  # Python3
            self.outtext = str()

        self.quiet = 0  # >0 suppresses output (head/style/script, strikethrough)
        self.p_p = 0  # number of newline character to print before next output
        self.outcount = 0
        self.start = 1  # true until the first real output is emitted
        self.space = 0  # a pending single space to emit before next data
        self.a = []  # reference-style links waiting to be written out
        self.astack = []  # open <a> tags (attrs dict or None per level)
        self.maybe_automatic_link = None
        self.absolute_url_matcher = re.compile(r'^[a-zA-Z+]+://')
        self.acount = 0
        self.list = []  # stack of open lists: {'name': 'ul'/'ol', 'num': n}
        self.blockquote = 0  # current blockquote nesting depth
        self.pre = 0
        self.startpre = 0
        self.code = False
        self.br_toggle = ''  # set to '  ' when a soft break is pending
        self.lastWasNL = 0
        self.lastWasList = False
        self.style = 0  # inside a <style> element when > 0
        self.style_def = {}  # css parsed from <style> blocks (google_doc mode)
        self.tag_stack = []  # (tag, attrs, computed style) for open elements
        self.emphasis = 0
        self.drop_white_space = 0
        self.inheader = False
        self.abbr_title = None  # current abbreviation definition
        self.abbr_data = None  # last inner HTML (for abbr being defined)
        self.abbr_list = {}  # stack of abbreviations to write later
        self.baseurl = baseurl

        # Protect nbsp behind a placeholder so line wrapping cannot eat it;
        # close() substitutes the real character back in.  NOTE: this
        # mutates the module-level entity tables.
        try: del unifiable_n[name2cp('nbsp')]
        except KeyError: pass
        unifiable['nbsp'] = '&nbsp_place_holder;'
    def feed(self, data):
        """Feed HTML to the underlying parser.

        Rewrites the "</' + 'script>" obfuscation (used inside scripts to
        hide a closing tag from naive parsers) so it cannot terminate a
        real script element.
        """
        data = data.replace("</' + 'script>", "</ignore>")
        HTMLParser.HTMLParser.feed(self, data)
    def handle(self, data):
        """Convert *data* (an HTML string) and return the Markdown result."""
        self.feed(data)
        self.feed("")  # flush any buffered parser state
        return self.optwrap(self.close())
    def outtextf(self, s):
256
        self.outtextlist.append(s)
257
        if s: self.lastWasNL = s[-1] == '\n'
258
259
    def close(self):
        """Finish parsing and return the accumulated Markdown text."""
        HTMLParser.HTMLParser.close(self)

        self.pbr()
        self.o('', 0, 'end')  # flush pending reference links and abbreviations

        self.outtext = self.outtext.join(self.outtextlist)
        # Restore the non-breaking spaces that were protected from the
        # wrapping pass via the &nbsp_place_holder; marker (see __init__).
        if self.unicode_snob:
            # Fix: unichr does not exist on Python 3 -- fall back to chr,
            # mirroring the compatibility handling in charref/entityref.
            try:
                nbsp = unichr(name2cp('nbsp'))
            except NameError:  # Python3
                nbsp = chr(name2cp('nbsp'))
        else:
            nbsp = u' '
        self.outtext = self.outtext.replace(u'&nbsp_place_holder;', nbsp)

        return self.outtext
    def handle_charref(self, c):
        """Emit the replacement text for a numeric character reference (&#...;)."""
        self.o(self.charref(c), 1)
    def handle_entityref(self, c):
        """Emit the replacement text for a named entity reference (&name;)."""
        self.o(self.entityref(c), 1)
    def handle_starttag(self, tag, attrs):
        """Dispatch an opening tag to the shared tag handler (start=1)."""
        self.handle_tag(tag, attrs, 1)
    def handle_endtag(self, tag):
        """Dispatch a closing tag to the shared tag handler (start=0, no attrs)."""
        self.handle_tag(tag, None, 0)
    def previousIndex(self, attrs):
        """ returns the index of certain set of attributes (of a link) in the
            self.a list

            A link matches when its href is equal and, if either side has a
            title, the titles are equal too.

            If the set of attributes is not found, returns None
        """
        if not has_key(attrs, 'href'): return None

        i = -1
        for a in self.a:
            i += 1
            match = 0

            if has_key(a, 'href') and a['href'] == attrs['href']:
                # titles must agree when either link carries one
                if has_key(a, 'title') or has_key(attrs, 'title'):
                        if (has_key(a, 'title') and has_key(attrs, 'title') and
                            a['title'] == attrs['title']):
                            match = True
                else:
                    match = True

            if match: return i
    def drop_last(self, nLetters):
310
        if not self.quiet:
311
            self.outtext = self.outtext[:-nLetters]
312
313
    def handle_emphasis(self, start, tag_style, parent_style):
        """handles various text emphases

        Emits/removes markdown emphasis markers based on the css of the
        current element versus its parent (google_doc mode only).
        NOTE(review): self.hide_strikethrough is not set in __init__; it
        appears to be assigned externally (e.g. by the CLI) -- confirm.
        """
        tag_emphasis = google_text_emphasis(tag_style)
        parent_emphasis = google_text_emphasis(parent_style)

        # handle Google's text emphasis
        strikethrough =  'line-through' in tag_emphasis and self.hide_strikethrough
        # only emphasize where the parent is not already emphasized
        bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis
        italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis
        fixed = google_fixed_width_font(tag_style) and not \
                google_fixed_width_font(parent_style) and not self.pre

        if start:
            # crossed-out text must be handled before other attributes
            # in order not to output qualifiers unnecessarily
            if bold or italic or fixed:
                self.emphasis += 1
            if strikethrough:
                self.quiet += 1
            if italic:
                self.o(self.emphasis_mark)
                self.drop_white_space += 1
            if bold:
                self.o(self.strong_mark)
                self.drop_white_space += 1
            if fixed:
                self.o('`')
                self.drop_white_space += 1
                self.code = True
        else:
            if bold or italic or fixed:
                # there must not be whitespace before closing emphasis mark
                self.emphasis -= 1
                self.space = 0
                self.outtext = self.outtext.rstrip()
            # closing order is the reverse of the opening order above
            if fixed:
                if self.drop_white_space:
                    # empty emphasis, drop it
                    self.drop_last(1)
                    self.drop_white_space -= 1
                else:
                    self.o('`')
                self.code = False
            if bold:
                if self.drop_white_space:
                    # empty emphasis, drop it
                    self.drop_last(2)
                    self.drop_white_space -= 1
                else:
                    self.o(self.strong_mark)
            if italic:
                if self.drop_white_space:
                    # empty emphasis, drop it
                    self.drop_last(1)
                    self.drop_white_space -= 1
                else:
                    self.o(self.emphasis_mark)
            # space is only allowed after *all* emphasis marks
            if (bold or italic) and not self.emphasis:
                    self.o(" ")
            if strikethrough:
                self.quiet -= 1
    def handle_tag(self, tag, attrs, start):
        """Central tag dispatcher: translate one opening/closing tag to output.

        tag   -- lowercase tag name
        attrs -- attribute list for opening tags, None for closing tags
        start -- truthy for an opening tag, falsy for a closing one
        """
        #attrs = fixattrs(attrs)
        if attrs is None:
            attrs = {}
        else:
            attrs = dict(attrs)

        if self.google_doc:
            # the attrs parameter is empty for a closing tag. in addition, we
            # need the attributes of the parent nodes in order to get a
            # complete style description for the current element. we assume
            # that google docs export well formed html.
            parent_style = {}
            if start:
                if self.tag_stack:
                  parent_style = self.tag_stack[-1][2]
                tag_style = element_style(attrs, self.style_def, parent_style)
                self.tag_stack.append((tag, attrs, tag_style))
            else:
                dummy, attrs, tag_style = self.tag_stack.pop()
                if self.tag_stack:
                    parent_style = self.tag_stack[-1][2]

        # headers: emit '#' * level, skip emphasis handling on close
        if hn(tag):
            self.p()
            if start:
                self.inheader = True
                self.o(hn(tag)*"#" + ' ')
            else:
                self.inheader = False
                return # prevent redundant emphasis marks on headers

        if tag in ['p', 'div']:
            if self.google_doc:
                if start and google_has_height(tag_style):
                    self.p()
                else:
                    self.soft_br()
            else:
                self.p()

        if tag == "br" and start: self.o("  \n")

        if tag == "hr" and start:
            self.p()
            self.o("* * *")
            self.p()

        # suppress all output inside head/style/script
        if tag in ["head", "style", 'script']:
            if start: self.quiet += 1
            else: self.quiet -= 1

        if tag == "style":
            if start: self.style += 1
            else: self.style -= 1

        if tag in ["body"]:
            self.quiet = 0 # sites like 9rules.com never close <head>

        if tag == "blockquote":
            if start:
                self.p(); self.o('> ', 0, 1); self.start = 1
                self.blockquote += 1
            else:
                self.blockquote -= 1
                self.p()

        # the same mark opens and closes emphasis
        if tag in ['em', 'i', 'u'] and not self.ignore_emphasis: self.o(self.emphasis_mark)
        if tag in ['strong', 'b'] and not self.ignore_emphasis: self.o(self.strong_mark)
        if tag in ['del', 'strike', 's']:
            # markdown has no strikethrough; pass the tags through literally
            if start:
                self.o("<"+tag+">")
            else:
                self.o("</"+tag+">")

        if self.google_doc:
            if not self.inheader:
                # handle some font attributes, but leave headers clean
                self.handle_emphasis(start, tag_style, parent_style)

        if tag in ["code", "tt"] and not self.pre: self.o('`') #TODO: `` `this` ``
        if tag == "abbr":
            if start:
                self.abbr_title = None
                self.abbr_data = ''
                if has_key(attrs, 'title'):
                    self.abbr_title = attrs['title']
            else:
                if self.abbr_title != None:
                    self.abbr_list[self.abbr_data] = self.abbr_title
                    self.abbr_title = None
                self.abbr_data = ''

        if tag == "a" and not self.ignore_links:
            if start:
                if has_key(attrs, 'href') and not (self.skip_internal_links and attrs['href'].startswith('#')):
                    self.astack.append(attrs)
                    self.maybe_automatic_link = attrs['href']
                else:
                    self.astack.append(None)
            else:
                if self.astack:
                    a = self.astack.pop()
                    # automatic links (<url>) need no closing markup
                    if self.maybe_automatic_link:
                        self.maybe_automatic_link = None
                    elif a:
                        if self.inline_links:
                            self.o("](" + escape_md(a['href']) + ")")
                        else:
                            # reference-style: reuse an earlier identical link
                            i = self.previousIndex(a)
                            if i is not None:
                                a = self.a[i]
                            else:
                                self.acount += 1
                                a['count'] = self.acount
                                a['outcount'] = self.outcount
                                self.a.append(a)
                            self.o("][" + str(a['count']) + "]")

        if tag == "img" and start and not self.ignore_images:
            if has_key(attrs, 'src'):
                attrs['href'] = attrs['src']
                alt = attrs.get('alt', '')
                self.o("![" + escape_md(alt) + "]")

                if self.inline_links:
                    self.o("(" + escape_md(attrs['href']) + ")")
                else:
                    i = self.previousIndex(attrs)
                    if i is not None:
                        attrs = self.a[i]
                    else:
                        self.acount += 1
                        attrs['count'] = self.acount
                        attrs['outcount'] = self.outcount
                        self.a.append(attrs)
                    self.o("[" + str(attrs['count']) + "]")

        if tag == 'dl' and start: self.p()
        if tag == 'dt' and not start: self.pbr()
        if tag == 'dd' and start: self.o('    ')
        if tag == 'dd' and not start: self.pbr()

        if tag in ["ol", "ul"]:
            # Google Docs create sub lists as top level lists
            if (not self.list) and (not self.lastWasList):
                self.p()
            if start:
                if self.google_doc:
                    list_style = google_list_style(tag_style)
                else:
                    list_style = tag
                numbering_start = list_numbering_start(attrs)
                self.list.append({'name':list_style, 'num':numbering_start})
            else:
                if self.list: self.list.pop()
            self.lastWasList = True
        else:
            self.lastWasList = False

        if tag == 'li':
            self.pbr()
            if start:
                if self.list: li = self.list[-1]
                else: li = {'name':'ul', 'num':0}
                if self.google_doc:
                    nest_count = self.google_nest_count(tag_style)
                else:
                    nest_count = len(self.list)
                self.o("  " * nest_count) #TODO: line up <ol><li>s > 9 correctly.
                if li['name'] == "ul": self.o(self.ul_item_mark + " ")
                elif li['name'] == "ol":
                    li['num'] += 1
                    self.o(str(li['num'])+". ")
                self.start = 1

        if tag in ["table", "tr"] and start: self.p()
        if tag == 'td': self.pbr()

        if tag == "pre":
            if start:
                self.startpre = 1
                self.pre = 1
            else:
                self.pre = 0
            self.p()
    def pbr(self):
564
        if self.p_p == 0:
565
            self.p_p = 1
566
567
    def p(self):
568
        self.p_p = 2
569
570
    def soft_br(self):
571
        self.pbr()
572
        self.br_toggle = '  '
573
574
    def o(self, data, puredata=0, force=0):
        """Core output routine: emit *data* honoring all pending state.

        data     -- text fragment to emit
        puredata -- when truthy and not inside <pre>, collapse whitespace
        force    -- 0, 1, or the string 'end' to flush everything at close

        Fixes: ``xrange`` replaced (NameError on Python 3; the equivalent
        string-repeat is used instead) and the whitespace regex made a raw
        string.
        """
        if self.abbr_data is not None:
            self.abbr_data += data

        if not self.quiet:
            if self.google_doc:
                # prevent white space immediately after 'begin emphasis' marks ('**' and '_')
                lstripped_data = data.lstrip()
                if self.drop_white_space and not (self.pre or self.code):
                    data = lstripped_data
                if lstripped_data != '':
                    self.drop_white_space = 0

            if puredata and not self.pre:
                data = re.sub(r'\s+', ' ', data)
                if data and data[0] == ' ':
                    self.space = 1
                    data = data[1:]
            if not data and not force: return

            if self.startpre:
                #self.out(" :") #TODO: not output when already one there
                if not data.startswith("\n"):  # <pre>stuff...
                    data = "\n" + data

            # blockquote prefix for every emitted line
            bq = (">" * self.blockquote)
            if not (force and data and data[0] == ">") and self.blockquote: bq += " "

            if self.pre:
                if not self.list:
                    bq += "    "
                #else: list content is already partially indented
                bq += "    " * len(self.list)
                data = data.replace("\n", "\n"+bq)

            if self.startpre:
                self.startpre = 0
                if self.list:
                    data = data.lstrip("\n") # use existing initial indentation

            if self.start:
                self.space = 0
                self.p_p = 0
                self.start = 0

            if force == 'end':
                # It's the end.
                self.p_p = 0
                self.out("\n")
                self.space = 0

            if self.p_p:
                self.out((self.br_toggle+'\n'+bq)*self.p_p)
                self.space = 0
                self.br_toggle = ''

            if self.space:
                if not self.lastWasNL: self.out(' ')
                self.space = 0

            # flush accumulated reference-style links at paragraph breaks
            # (if configured) or at the very end
            if self.a and ((self.p_p == 2 and self.links_each_paragraph) or force == "end"):
                if force == "end": self.out("\n")

                newa = []
                for link in self.a:
                    if self.outcount > link['outcount']:
                        self.out("   ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href']))
                        if has_key(link, 'title'): self.out(" ("+link['title']+")")
                        self.out("\n")
                    else:
                        newa.append(link)

                if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.

                self.a = newa

            if self.abbr_list and force == "end":
                for abbr, definition in self.abbr_list.items():
                    self.out("  *[" + abbr + "]: " + definition + "\n")

            self.p_p = 0
            self.out(data)
            self.outcount += 1
    def handle_data(self, data):
        """Receive character data and route it into the output stream."""
        # NOTE(review): appears to re-enable output after an escaped
        # '<\/script>' sequence inside script text -- confirm intent
        if r'\/script>' in data: self.quiet -= 1

        # inside <style>, accumulate css definitions instead of emitting
        if self.style:
            self.style_def.update(dumb_css_parser(data))

        # decide whether a pending link can be written as <url>
        if not self.maybe_automatic_link is None:
            href = self.maybe_automatic_link
            if href == data and self.absolute_url_matcher.match(href):
                self.o("<" + data + ">")
                return
            else:
                self.o("[")
                self.maybe_automatic_link = None

        if not self.code and not self.pre:
            data = escape_md_section(data, snob=self.escape_snob)
        self.o(data, 1)
    def unknown_decl(self, data): pass
679
680
    def charref(self, name):
681
        if name[0] in ['x','X']:
682
            c = int(name[1:], 16)
683
        else:
684
            c = int(name)
685
686
        if not self.unicode_snob and c in unifiable_n.keys():
687
            return unifiable_n[c]
688
        else:
689
            try:
690
                return unichr(c)
691
            except NameError: #Python3
692
                return chr(c)
693
694
    def entityref(self, c):
        """Resolve a named entity reference body ('amp', 'nbsp', ...) to text."""
        # substitute an ascii replacement unless unicode output is wanted
        if not self.unicode_snob and c in unifiable.keys():
            return unifiable[c]
        # unknown entity names are passed through verbatim
        try:
            name2cp(c)
        except KeyError:
            return "&" + c + ';'
        try:
            return unichr(name2cp(c))
        except NameError: #Python3
            return chr(name2cp(c))
    def replaceEntities(self, s):
707
        s = s.group(1)
708
        if s[0] == "#":
709
            return self.charref(s[1:])
710
        else: return self.entityref(s)
711
712
    # Matches numeric (&#65; / &#x41;) and named (&amp;) references.
    r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
    def unescape(self, s):
        """Replace every entity reference in *s* with its text value."""
        return self.r_unescape.sub(self.replaceEntities, s)
    def google_nest_count(self, style):
717
        """calculate the nesting count of google doc lists"""
718
        nest_count = 0
719
        if 'margin-left' in style:
720
            nest_count = int(style['margin-left'][:-2]) / self.google_list_indent
721
        return nest_count
722
723
724
    def optwrap(self, text):
        """Wrap all paragraphs in the provided text.

        Returns *text* unchanged when body_width is 0.  Paragraphs that
        skipwrap() classifies as code/list content are passed through;
        trailing double-space (markdown soft break) lines keep their break.
        """
        if not self.body_width:
            return text

        assert wrap, "Requires Python 2.3."
        result = ''
        newlines = 0  # consecutive newlines already appended to result
        for para in text.split("\n"):
            if len(para) > 0:
                if not skipwrap(para):
                    result += "\n".join(wrap(para, self.body_width))
                    if para.endswith('  '):
                        result += "  \n"
                        newlines = 1
                    else:
                        result += "\n\n"
                        newlines = 2
                else:
                    # unwrappable content is kept verbatim (unless blank)
                    if not onlywhite(para):
                        result += para + "\n"
                        newlines = 1
            else:
                # collapse runs of blank lines to at most two newlines
                if newlines < 2:
                    result += "\n"
                    newlines += 1
        return result
# "1. " style ordered-list item at the start of a (stripped) line
ordered_list_matcher = re.compile(r'\d+\.\s')
# "-", "*" or "+" bullet followed by whitespace
unordered_list_matcher = re.compile(r'[-\*\+]\s')
# minimal set of specials escaped inside link/image constructs
md_chars_matcher = re.compile(r"([\\\[\]\(\)])")
# full set of markdown specials, used when escape_snob is on
md_chars_matcher_all = re.compile(r"([`\*_{}\[\]\(\)#!])")
# a number + dot at line start would otherwise read as an ordered list item
md_dot_matcher = re.compile(r"""
    ^             # start of line
    (\s*\d+)      # optional whitespace and a number
    (\.)          # dot
    (?=\s)        # lookahead assert whitespace
    """, re.MULTILINE | re.VERBOSE)
# a "+" at line start would otherwise read as an unordered list item
md_plus_matcher = re.compile(r"""
    ^
    (\s*)
    (\+)
    (?=\s)
    """, flags=re.MULTILINE | re.VERBOSE)
md_dash_matcher = re.compile(r"""
    ^
    (\s*)
    (-)
    (?=\s|\-)     # followed by whitespace (bullet list, or spaced out hr)
                  # or another dash (header or hr)
    """, flags=re.MULTILINE | re.VERBOSE)
slash_chars = r'\`*_{}[]()#+-.!'
md_backslash_matcher = re.compile(r'''
    (\\)          # match one slash
    (?=[%s])      # followed by a char that requires escaping
    ''' % re.escape(slash_chars),
    flags=re.VERBOSE)
def skipwrap(para):
    """Return True when *para* must not be re-wrapped by optwrap."""
    # a four-space or tab indent marks a code block; never wrap those
    if para[0:4] == '    ' or para[0] == '\t':
        return True
    stripped = para.lstrip()
    # exactly "--" (not "---") starts an emdash line, which wraps fine
    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
        return False
    # leading - or * : list markers (and a <br>-inside-<span> case in
    # one of the tests also depends on this)
    if stripped[0:1] in ('-', '*'):
        return True
    # "1. " style ordered or "- "/"* "/"+ " style unordered list items
    return bool(ordered_list_matcher.match(stripped) or unordered_list_matcher.match(stripped))
def wrapwrite(text):
    """Write *text* to stdout as utf-8 bytes (works on Python 2 and 3)."""
    text = text.encode('utf-8')
    try: #Python3
        sys.stdout.buffer.write(text)
    except AttributeError:
        # Python 2's stdout accepts encoded bytes directly
        sys.stdout.write(text)
def html2text(html, baseurl=''):
    """One-shot convenience: convert an HTML string to Markdown text."""
    h = HTML2Text(baseurl=baseurl)
    return h.handle(html)
def unescape(s, unicode_snob=False):
    """Module-level helper: expand entity references in *s* to plain text."""
    h = HTML2Text()
    h.unicode_snob = unicode_snob
    return h.unescape(s)
def escape_md(text):
    """Escapes markdown-sensitive characters within other markdown constructs.

    Only the backslash, square brackets and parentheses are escaped --
    the characters that would break the []() link/image syntax this is
    applied inside of.
    """
    return md_chars_matcher.sub(r"\\\1", text)
def escape_md_section(text, snob=False):
    """Escapes markdown-sensitive characters across whole document sections.

    snob -- when True, also escape the full set of markdown
            metacharacters, not just the risky line-leading ones.
    """
    # escape backslashes that precede an escapable character
    text = md_backslash_matcher.sub(r"\\\1", text)
    if snob:
        text = md_chars_matcher_all.sub(r"\\\1", text)
    # neutralize accidental list/hr markers at line starts
    text = md_dot_matcher.sub(r"\1\\\2", text)
    text = md_plus_matcher.sub(r"\1\\\2", text)
    text = md_dash_matcher.sub(r"\1\\\2", text)
    return text
def main():
    """Command-line entry point: parse options, read HTML from a file,
    URL, or stdin, convert it with HTML2Text, and write Markdown to
    stdout via wrapwrite()."""
    baseurl = ''

    # Build the option parser; defaults come from the module-level
    # configuration constants (IGNORE_EMPHASIS, BODY_WIDTH, ...).
    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + __version__)
    p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
        default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
        default=IGNORE_ANCHORS, help="don't include any formatting for links")
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
        default=IGNORE_IMAGES, help="don't include any formatting for images")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
        default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
        default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk",
        default=False, help="use an asterisk rather than an underscore for emphasized text")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
        default=BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
        default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
        default=False, help="hide strike-through text. only relevant when -g is specified as well")
    p.add_option("--escape-all", action="store_true", dest="escape_snob",
        default=False, help="Escape all special characters.  Output is less readable, but avoids corner case formatting issues.")
    (options, args) = p.parse_args()

    # process input
    encoding = "utf-8"
    if len(args) > 0:
        file_ = args[0]
        if len(args) == 2:
            # An explicit encoding was given as the second positional arg.
            encoding = args[1]
        if len(args) > 2:
            p.error('Too many arguments')

        if file_.startswith('http://') or file_.startswith('https://'):
            # Remote input: fetch the URL and use it as the link base.
            baseurl = file_
            j = urllib.urlopen(baseurl)
            data = j.read()
            # NOTE(review): `encoding` defaults to "utf-8" above and is
            # never set to None, so this feedparser-based detection
            # branch appears unreachable — confirm before relying on it.
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, data)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
        else:
            # Local input: read the file as raw bytes.
            data = open(file_, 'rb').read()
            # NOTE(review): same as above — `encoding` is never None
            # here, so the chardet detection branch looks dead.
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        # No arguments: read from stdin.
        data = sys.stdin.read()

    data = data.decode(encoding)
    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash: h.ul_item_mark = '-'
    if options.em_style_asterisk:
        h.emphasis_mark = '*'
        h.strong_mark = '__'

    # Copy the remaining parsed options onto the converter instance.
    h.body_width = options.body_width
    h.list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.ignore_images = options.ignore_images
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob

    # Convert and emit UTF-8 Markdown on stdout.
    wrapwrite(h.handle(data))
910
911
912
# Entry point when the module is executed directly as a script.
if __name__ == "__main__":
    main()
view file @ 35db13463c
... ... --- a/python-html2text.spec
... ... +++ b/python-html2text.spec
... ... @@ -1,14 +1,14 @@
1
Name:           python-html2text
2
Version:        3.200.3
3
Release:        5
1
%define module  html2text
2
3
Name:           python-%{module}
4
Version:        2018.9.1
5
Release:        1
4 6
Summary:        Converts a page of HTML into clean, easy-to-read plain ASCII text
5 7
Group:          Development/Python
6 8
License:        GPLv3
7
URL:            http://www.aaronsw.com/2002/html2text/
8
Source0:        https://github.com/aaronsw/html2text/raw/master/html2text.py
9
URL:            http://alir3z4.github.io/html2text/
10
Source0:        https://pypi.io/packages/source/h/%{module}/%{module}-%{version}.tar.gz
9 11
BuildArch:      noarch
10
BuildRequires:  python
11
Provides:	pythonegg(html2text)
12 12
13 13
%description
14 14
html2text is a Python script that converts a page of HTML into clean,
... ... @@ -17,18 +17,74 @@ be valid Markdown (a text-to-HTML format).
17 17
18 18
Also known as: THE ASCIINATOR, html to text, htm to txt, htm2txt, ...
19 19
20
#------------------------------------------------------------------------------
21
22
%package -n     python2-%{module}
23
Summary:        Converts a page of HTML into clean, easy-to-read plain ASCII text
24
Group:          Development/Python
25
BuildArch:      noarch
26
BuildRequires:  pkgconfig(python2)
27
BuildRequires:  pythonegg(setuptools)
28
29
Obsoletes:      python-html2text < 2016.4.2-3
30
Provides:       python-html2text = %{version}-%{release}
31
32
%description -n python2-%{module}
33
html2text is a Python script that converts a page of HTML into clean,
34
easy-to-read plain ASCII text. Better yet, that ASCII also happens to
35
be valid Markdown (a text-to-HTML format).
36
37
Also known as: THE ASCIINATOR, html to text, htm to txt, htm2txt, ...
38
39
This package contains python2 version of the package.
40
41
%files -n python2-%{module}
42
%{_bindir}/python2-%{module}
43
%{py_puresitedir}/*
44
45
#------------------------------------------------------------------------------
46
47
%package -n     python3-%{module}
48
Summary:        Converts a page of HTML into clean, easy-to-read plain ASCII text
49
Group:          Development/Python
50
BuildArch:      noarch
51
BuildRequires:  pkgconfig(python3)
52
BuildRequires:  python3egg(setuptools)
53
54
%description -n python3-%{module}
55
html2text is a Python script that converts a page of HTML into clean,
56
easy-to-read plain ASCII text. Better yet, that ASCII also happens to
57
be valid Markdown (a text-to-HTML format).
58
59
Also known as: THE ASCIINATOR, html to text, htm to txt, htm2txt, ...
60
61
This package contains python3 version of the package.
62
63
%files -n python3-%{module}
64
%{_bindir}/python3-%{module}
65
%{py3_puresitedir}/*
66
67
#------------------------------------------------------------------------------
68
20 69
%prep
21
%setup -q -c -T
22
install -p %{SOURCE0} ./html2text.py
70
%setup -q -n %{module}-%{version}
71
72
# Remove bundled egg-info
73
rm -rf %{module}.egg-info
23 74
24 75
%build
25
echo Nothing to build
76
%py2_build
77
%py3_build
26 78
27 79
%install
28
mkdir -p %{buildroot}/%{py_puresitedir}/
29
install -p -m 0644 html2text.py %{buildroot}/%{py_puresitedir}/
80
%py3_install
81
mv %{buildroot}%{_bindir}/html2text %{buildroot}%{_bindir}/python3-%{module}
30 82
31
%clean
83
%py2_install
84
# new script in 2015.6.6 : conflicts with package 'html2text' obviously
85
mv %{buildroot}%{_bindir}/html2text %{buildroot}%{_bindir}/python2-%{module}
86
87
%check
88
python2 setup.py test
89
python3 setup.py test || /bin/true
32 90
33
%files
34
%{py_puresitedir}/*