#!/usr/bin/env python
"""Convert between HTML and a YAML description of its tag tree.

HTML is read from stdin and parsed into nested lists / single-key dicts /
strings (see ``html_load``), which are written out as YAML.  In the other
direction the YAML structure is rendered back to HTML (see ``html_dump``).

Structure conventions used throughout:

* text                       -> ``str``
* ``<tag attrs>body</tag>``  -> ``{'tag attrs': body}``
* tag without a closing tag  -> ``{'tag attrs': '/'}``
* stray closing tag          -> ``{'/tag': '/'}``
* ``<li>`` without ``</li>`` -> ``{'li /': body}`` (body runs to next ``<li>``)
* ``<!--text-->``            -> ``{'!--': text}``
"""

import sys
import yaml
import six
import nkf

# (raw character, HTML entity) pairs.  '&' must be encoded first and decoded
# last so that entities are never double-processed.
_ENTITIES = [('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;')]

# Tags whose closing tag is commonly omitted and whose following siblings
# should be folded into them as their body.
_SOLO_TAGS_WITH_BODY = ['li']


def err(*msgs):
    """Write *msgs* to stderr and terminate the program with status 1."""
    sys.stderr.write(' '.join(msgs) + '\n')
    sys.exit(1)


def cv_amp(s, d='enc'):
    """Encode (``d == 'enc'``) or decode (any other ``d``) HTML entities.

    Only ``&``, ``<`` and ``>`` are handled.
    """
    if d == 'enc':
        for raw, entity in _ENTITIES:
            s = s.replace(raw, entity)
        return s
    # Decode in the opposite order so '&amp;lt;' becomes '&lt;', not '<'.
    for raw, entity in reversed(_ENTITIES):
        s = s.replace(entity, raw)
    return s


def dump_tag(tag, v, pre):
    """Render one ``{tag: v}`` node as HTML.

    tag -- tag name plus attributes, separated by single spaces; a trailing
           ``' /'`` or a ``'/'`` glued to the name marks "no closing tag".
    v   -- body (any structure ``html_dump`` accepts), or ``'/'`` for a tag
           that has no body at all.
    pre -- True when already inside a ``<pre>`` element.
    """
    if tag == '!--':
        # Comments are kept verbatim (no entity encoding).
        return '<!--' + (v or '') + '-->'
    parts = original = tag.split(' ')
    if parts[-1] == '/':
        parts = parts[:-1]
    if parts[0][-1] == '/':
        parts = [parts[0][:-1]] + parts[1:]
    s = '<' + ' '.join(parts) + '>'
    if v != '/':
        # Stay in preformatted mode for everything nested below a <pre>.
        s += html_dump(v, pre or parts[0].lower() == 'pre')
        # Only tags that were not marked with '/' get a closing tag.
        if parts == original:
            s += '</' + parts[0] + '>'
    return s


def get_tag_v(d):
    """Return the ``(tag, value)`` pair of a single-key tag dict."""
    return next(iter(d.items()))


def html_dump(o, pre=False):
    """Render the structure *o* (str / list / single-key dict) as HTML.

    List items are joined with newlines, or with nothing when *pre* is true
    so that preformatted text is reproduced exactly.  Falsy values render as
    the empty string.
    """
    if not o:
        return ''
    if isinstance(o, list):
        delim = '' if pre else '\n'
        return delim.join(html_dump(e, pre) for e in o)
    if isinstance(o, dict):
        (tag, v) = get_tag_v(o)
        return dump_tag(tag, v, pre)
    if not isinstance(o, six.string_types):
        # Hand-written YAML may contain numbers etc.; render them as text.
        o = str(o)
    return cv_amp(o)


def get_tag(s):
    """Split *s* at its first tag or comment.

    Returns ``(text, tag, rest)``:

    text -- everything before the tag (still entity-encoded)
    tag  -- ``[]`` if *s* has no tag (then ``text`` is all of *s*),
            ``['!--', comment]`` for a comment,
            ``[name, attr, ...]`` for an opening tag,
            ``['/', name, ...]`` for a closing tag
    rest -- the input following the tag
    """
    (p, tag) = ('', [])
    while '<' in s:
        i = s.index('<')
        (t, s) = (s[:i], s[i + 1:])
        p += t
        if s[:3] == '!--' and '-->' in s[3:]:
            s = s[3:]
            i = s.index('-->')
            tag = ['!--', s[:i]]
            s = s[i + 3:]
            break
        tag.append('<')
        n = s[:1]
        if n == '/':
            tag.append('/')
            s = s[1:]
            n = s[:1]
        if n.isalpha():
            if '>' not in s:
                err("not found '>'", 'tag={}'.format(tag))
            i = s.index('>')
            # Drop the '<' marker, keep an optional '/' marker.
            tag = tag[1:] + s[:i].split(' ')
            s = s[i + 1:]
            break
        # Not a tag after all: put the consumed characters back as text.
        p += ''.join(tag)
        tag = []
    if not tag:
        (p, s) = (p + s, '')
    return (p, tag, s)


def start_idx(stk, s):
    """Return the index of the innermost open tag named *s* in *stk*, or -1.

    Open (not yet closed) tags are the list entries of the stack.
    """
    for i in reversed(range(len(stk))):
        e = stk[i]
        if isinstance(e, list) and e[0] == s:
            return i
    return -1


def solo_tag(e):
    """Convert an unclosed tag list to its dict form; pass others through."""
    if not isinstance(e, list):
        return e
    if e[0] == '!--':
        return {'!--': e[1]}
    (h, e) = ('/', e[1:]) if e[0] == '/' else ('', e)
    return {h + ' '.join(e): '/'}


def is_solo_tag_has_v(e):
    """True if *e* is a body-less tag dict that should absorb its siblings."""
    if not isinstance(e, dict):
        return False
    (tag, v) = get_tag_v(e)
    return v == '/' and tag.split(' ')[0] in _SOLO_TAGS_WITH_BODY


def solo_tag_has_v_idx(lst):
    """Return the index of the first ``is_solo_tag_has_v`` item, or -1."""
    for i, e in enumerate(lst):
        if is_solo_tag_has_v(e):
            return i
    return -1


def strip_lst1(o):
    """Unwrap single-element lists until something else remains."""
    while isinstance(o, list) and len(o) == 1:
        o = o[0]
    return o


def solo_tags_modify(lst):
    """Fold the siblings after each unclosed ``<li>`` into that tag.

    ``[{'li': '/'}, 'a', {'li': '/'}, 'b']`` becomes
    ``[{'li /': 'a'}, {'li /': 'b'}]``.
    """
    def div_lst(items):
        i = solo_tag_has_v_idx(items)
        return (items[:i], items[i:]) if i >= 0 else (items, [])

    r = []
    while lst:
        (head, lst) = div_lst(lst)
        r += head
        if lst:
            (e, lst) = (lst[0], lst[1:])
            (tag, _) = get_tag_v(e)  # value is always '/'
            (body, lst) = div_lst(lst)
            r.append({tag + ' /': strip_lst1(body)})
    return r


def solo_tags(lst):
    """Apply ``solo_tag`` to every item of *lst*."""
    return [solo_tag(e) for e in lst]


def untabify(s, n=8):
    """Expand tabs in every line of *s* to tab stops *n* columns apart."""
    def expand(line):
        out = ''
        for c in line:
            if c == '\t':
                c = ' ' * (n - len(out) % n)
            out += c
        return out

    return '\n'.join(expand(line) for line in s.split('\n'))


def strip_lst(lst, pre):
    """Normalise the text items of *lst* and drop the empty ones.

    In *pre* mode only trailing blanks before a newline are removed and tabs
    are expanded; otherwise newlines become spaces and the text is stripped.
    Tag dicts are left untouched.
    """
    def clean(e):
        if isinstance(e, dict):
            return e
        if pre:
            while ' \n' in e:
                e = e.replace(' \n', '\n')
            return untabify(e)
        return e.replace('\n', ' ').strip()

    return [e for e in map(clean, lst) if e != '']


def close_solo_tag(lst, pre=False):
    """Finalise a run of siblings: unclosed tags, text clean-up, ``<li>``."""
    lst = solo_tags(lst)
    lst = strip_lst(lst, pre)
    lst = solo_tags_modify(lst)
    return lst


def close_tag(lst, pre):
    """Build ``{tag: body}`` from ``[open_tag_list, child, child, ...]``.

    The body is ``''`` for no children, the child itself for exactly one
    child and the list of children otherwise.
    """
    (tag, children) = (lst[0], lst[1:])
    children = close_solo_tag(children, pre)
    if not children:
        v = ''
    elif len(children) == 1:
        v = children[0]
    else:
        v = children
    return {' '.join(tag): v}


def html_load(s):
    """Parse the HTML text *s* into the nested list / dict / str structure.

    A stack holds text, finished nodes (dicts) and still-open tags (lists).
    A closing tag collapses everything from its matching open tag onwards
    into one dict; a closing tag without a match is kept as ``{'/tag': '/'}``.
    """
    stk = []
    pre_cnt = 0  # nesting depth of <pre> elements

    def add_cnt(tag):
        closing = tag[0] == '/'
        name = tag[1] if closing else tag[0]
        if name.lower() != 'pre':
            return 0
        return -1 if closing else 1

    while True:
        (p, tag, s) = get_tag(s)
        # Keep the text even when no further tag follows, so trailing text
        # (or a document without any tag) is not lost.
        if p:
            stk.append(cv_amp(p, 'dec'))
        if not tag:
            break
        if tag[0] != '/':
            stk.append(tag)
            pre_cnt += add_cnt(tag)
            continue
        i = start_idx(stk, tag[1])
        if i >= 0:
            stk = stk[:i] + [close_tag(stk[i:], pre_cnt > 0)]
            pre_cnt += add_cnt(tag)
        else:
            stk.append(solo_tag(tag))
    o = close_solo_tag(stk, pre_cnt > 0)
    return strip_lst1(o)


def yaml_dump(o):
    """Serialise *o* to YAML text, using block style for multi-line strings."""
    def represent_str(dumper, instance):
        tag = 'tag:yaml.org,2002:str'
        style = '|' if '\n' in instance else None
        return dumper.represent_scalar(tag, instance, style=style)

    # str and, on Python 2, unicode (six.text_type is str on Python 3).
    for typ in set([str, six.text_type]):
        yaml.add_representer(typ, represent_str)
    u8 = yaml.dump(o, default_flow_style=False,
                   allow_unicode=True, encoding='utf-8')
    return nkf.dec(u8)


def main():
    """Filter stdin to stdout, converting HTML to YAML or YAML to HTML.

    The direction is forced by an ``h`` (input is HTML) or ``y`` (input is
    YAML) command-line argument; otherwise ``file`` is asked whether the
    input looks like HTML.  Output uses the encoding guessed for the input.
    """
    b = nkf.get_stdin()
    opt = nkf.guess(b)
    if 'h' in sys.argv:
        html = True
    elif 'y' in sys.argv:
        html = False
    else:
        html = nkf.dec(nkf.do_cmd('file - | grep -i html', b)).strip()

    u8 = nkf.cvt(b, '-u')
    s = nkf.dec(u8)
    if html:
        o = html_load(s)
        s = yaml_dump(o)
    else:
        # NOTE: stdin is untrusted and the structure needs only plain
        # str / list / dict, so the safe loader is used.  Bare yaml.load()
        # can construct arbitrary objects and requires an explicit Loader
        # on current PyYAML.
        o = yaml.safe_load(s)
        s = html_dump(o) + '\n'
    u8 = nkf.enc(s)
    b = nkf.cvt(u8, opt)
    nkf.put_stdout(b)


if __name__ == "__main__":
    main()
# EOF