diff -urN v3/ezhtml.py v4/ezhtml.py
--- v3/ezhtml.py 2018-09-17 20:15:22.000000000 +0900
+++ v4/ezhtml.py 2018-09-17 23:46:44.000000000 +0900
@@ -28,9 +28,145 @@
         return s
     return cv_amp(o)
 
+def get_tag(s):
+    """Split off the first tag found in s.
+
+    Returns (p, tag, s): p is the text before the tag, tag is the list of
+    words inside the tag (with a leading '/' element for a closing tag)
+    and s is the text after the tag's '>'.  A '<' or '</' that is not
+    followed by a letter is kept in p as plain text.  When no tag is
+    left, tag is [], p holds all remaining text and s is ''.
+    """
+    (p, tag) = ('', [])
+
+    while '<' in s:
+        i = s.index('<')
+        (t, s) = ( s[:i], s[i+1:] )
+        p += t
+        tag.append('<')
+        n = s[:1]
+        if n == '/':
+            tag.append('/')
+            s = s[1:]
+            n = s[:1]
+        if n.isalpha():
+            if '>' not in s:
+                err( "not found '>'", 'tag={}'.format(tag) )
+            i = s.index('>')
+            # Drop the '<' marker.  split() rather than split(' '), so that
+            # newlines, tabs and repeated spaces inside a tag also separate
+            # its words instead of ending up in the tag name.
+            tag = tag[1:] + s[:i].split()
+            s = s[i+1:]
+            break
+        # Not a tag: keep the '<' (or '</') as text and look further.
+        p += ''.join(tag)
+        tag = []
+
+    if not tag:
+        (p, s) = (p+s, '')
+
+    return (p, tag, s)
+
+def start_idx(stk, s):
+    """Return the index of the last open tag named s in stk, or -1.
+
+    Open tags are the list entries of stk; other entries are skipped.
+    """
+    for i in reversed( range( len(stk) ) ):
+        e = stk[i]
+        if isinstance(e, list) and e[0] == s:
+            return i
+    return -1
+
+def solo_tag(e):
+    """Turn a tag list into { 'name attrs': '/' }; return other values as is.
+
+    A closing tag list (leading '/') gets the '/' glued to the key.
+    """
+    if isinstance(e, list):
+        (h, e) = ( '/', e[1:] ) if e[0] == '/' else ('', e)
+        return { h + ' '.join(e): '/' }
+    return e
+
+def solo_tags(lst):
+    """Apply solo_tag() to every element of lst."""
+    return list( map( solo_tag, lst ) )
+
+def untabify(s, n=8):
+    """Expand tabs in s to spaces, with tab stops every n columns per line."""
+    def f(s):
+        r = ''
+        for c in s:
+            if c == '\t':
+                c = ' ' * ( n - len(r) % n )
+            r += c
+        return r
+
+    lst = s.split('\n')
+    lst = map(f, lst)
+    return '\n'.join(lst)
+
+def strip_lst(lst, pre=False):
+    """Clean the strings in lst and drop the ones that end up empty.
+
+    dict elements pass through untouched.  With pre=True, spaces before a
+    newline are removed and tabs are expanded; otherwise newlines become
+    spaces and the string is stripped.
+    """
+    def f(e):
+        if isinstance(e, dict):
+            return e
+        if pre:
+            while ' \n' in e:
+                e = e.replace(' \n', '\n')
+            return untabify(e)
+        return e.replace('\n', ' ').strip()
+
+    return list( filter( lambda e: e != '', map(f, lst) ) )
+
+def close_tag(lst):
+    """Fold an open tag and the elements after it into { 'tag attrs': v }.
+
+    lst[0] is the open tag list.  v is '' when there is no content, the
+    single element when there is exactly one, else the list of elements.
+    Content of a <pre> tag keeps its line breaks (see strip_lst).
+    """
+    tag = lst[0]
+    lst = solo_tags( lst[1:] )
+    lst = strip_lst( lst, tag[0].lower() == 'pre' )
+    v = lst
+    if len(lst) == 1:
+        v = lst[0]
+    elif not lst:
+        v = ''
+    return { ' '.join(tag): v }
+
 def html_load(s):
-    # ...
-    return { 'html': { 'body': { 'p': 'to be continued' } } }
+    """Parse the HTML text s into nested dicts, lists and text."""
+    stk = []
+    while True:
+        (p, tag, s) = get_tag(s)
+        # Push the text before testing for a tag, so that text after the
+        # last tag is kept instead of being silently dropped.
+        if p:
+            stk.append( cv_amp(p, 'dec') )
+        if not tag:
+            break
+        if tag[0] != '/':
+            stk.append(tag)
+            continue
+        i = start_idx( stk, tag[1] )
+        if i >= 0:
+            stk = stk[:i] + [ close_tag( stk[i:] ) ]
+        else:
+            # Closing tag without a matching open tag.
+            stk.append( solo_tag(tag) )
+
+    o = strip_lst( solo_tags(stk) )
+    while isinstance(o, list) and len(o) == 1:
+        o = o[0]
+    return o
 
 if __name__ == "__main__":
     b = nkf.get_stdin()