Hatena::Groupbugrammer

蟲!虫!蟲!

Esehara Profile Site (by Heroku) / Github / bookable.jp (My Service)
過去の記事一覧はこちら

なにかあったら「えせはら あっと Gmail」まで送って頂ければ幸いです。
株式会社マリーチでは、Pythonやdjango、また自然言語処理を使ったお仕事を探しています

2011-03-23

[]『思想の言葉』をxhtmlで落とすためのスクリプト(不完全版) 18:49

岩波書店( http://www.iwanami.co.jp/shiso/index.html )のサイトにおいて公開されている冒頭エッセイ・『思想の言葉』をEpubにして閲覧する際、xhtmlに保存するために書いたスクリプトです。

# -*- coding: utf-8 -*-
import string, re, urllib, urllib2,os,sys
import codecs
from BeautifulSoup import BeautifulSoup

# -- Module-level state shared by the functions below. --

# URL opener used for every page fetch.
opener = urllib2.build_opener()

# Matches the right-aligned paragraph (the author credit), which is
# excluded from the article body in url_beautiful().
text_mach = re.compile('<p align="RIGHT">')

# Zero-padding pattern for four-digit issue numbers (907 -> "0907").
# NOTE(review): this name shadows the ``format`` builtin; kept because
# the rest of the script refers to it.
format = '%04d'



def guess_charset(data):
    """Return the name of the first encoding that can decode *data*.

    Tries the encodings commonly used on Japanese web pages, in the
    original priority order.  Returns None when none of them works.

    Bug fixes versus the original: ``d.decode(enc) and enc`` returned
    ``''`` instead of the encoding name when *data* was empty (an empty
    decode result is falsy), and the bare ``except:`` clauses hid any
    unrelated error.
    """
    for enc in ('utf-8', 'cp932', 'euc-jp', 'iso2022-jp'):
        try:
            data.decode(enc)
        except (UnicodeDecodeError, LookupError):
            # not decodable with this codec -- try the next candidate
            continue
        return enc
    return None

def conv(data):
    """Re-encode the byte string *data* as UTF-8.

    The source encoding is detected with guess_charset().  When detection
    fails, fall back to UTF-8 with errors ignored -- the original passed
    ``None`` straight to decode(), which raises TypeError.
    """
    charset = guess_charset(data) or 'utf-8'
    u = data.decode(charset, 'ignore')
    return u.encode('utf-8')

def url_make(url_int):
    """Build the page URL and the prev/next navigation snippet.

    Results are published through the module globals ``url`` and
    ``url_previous_next_link`` -- the calling convention the rest of
    the script relies on.
    """
    global url_previous_next_link
    global url

    current_no = format % url_int
    prev_no = format % (url_int - 1)
    next_no = format % (url_int + 1)

    # One anchor template, filled in for the previous and next issues.
    anchor = u'<a href="./%s.xhtml">No.%s</a>'
    url_previous_next_link = (anchor % (prev_no, prev_no)
                              + u' || '
                              + anchor % (next_no, next_no))
    url = 'http://www.iwanami.co.jp/shiso/%s/kotoba.html' % current_no
    
def url_beautiful(url_temp):
    """Fetch *url_temp* and extract the pieces of the essay page.

    Publishes its results through module globals (``magazine_no``,
    ``text_title``, ``text_writer``, ``text_main``, ``main_text_flag``),
    matching the script's global-based plumbing.
    """
    global magazine_no
    global text_title
    global text_writer
    global text_main
    global main_text_flag

    page = opener.open(url_temp).read()
    soup = BeautifulSoup(page)

    magazine_no = soup.find('font', size="+1").b   # issue number banner
    text_title = soup.body.h3                      # essay title
    text_writer = soup.find('p', align="RIGHT")    # right-aligned author credit
    main_text_flag = ''

    # Body = every <p> except the right-aligned author-credit paragraph.
    paragraphs = soup.findAll('p')
    text_main = ''.join(
        str(p) for p in paragraphs if text_mach.search(str(p)) is None)

def make_html():
    """Assemble the final XHTML page into the global ``main_xhtml``.

    Reads the pieces extracted by url_beautiful() (``magazine_no``,
    ``text_title``, ``text_writer``, ``text_main``) plus the navigation
    snippet built by url_make() (``url_previous_next_link``).

    Bug fixes versus the original: every closing tag was written as an
    opening tag (``<title>``, ``<h3>``, ``<H4>`` instead of ``</title>``,
    ``</h3>``, ``</h4>``), ``</head>`` was never emitted, the heading
    block sat between ``</head>`` and ``<body>``, and ``<BODY>``/``<HTML>``
    were upper-case -- all invalid in XHTML, which EPUB readers may reject.
    """
    header_xhtml = u"""<?xml version="1.0"?>
    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
    "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">

    <html xmlns="http://www.w3.org/1999/xhtml">
    <head>"""

    header_xhtml += u"<title>"
    header_xhtml += magazine_no.string + text_title.string + text_writer.string
    header_xhtml += u"</title>\n"
    header_xhtml += u"</head>\n"

    # Issue number / title / author heading block (now inside <body>).
    book_data = u"<h3>" + magazine_no.string + u"</h3>\n"
    book_data += u"<h2>『"
    book_data += text_title.string
    book_data += u"』</h2>\n"
    book_data += u"<h4>" + text_writer.string + u"</h4>\n"

    body_xhtml = u"<body>\n"
    body_xhtml += book_data
    body_xhtml += url_previous_next_link
    body_xhtml += text_main
    body_xhtml += url_previous_next_link
    body_xhtml += u"</body>"
    body_xhtml += u"</html>"

    global main_xhtml

    main_xhtml = header_xhtml + body_xhtml
 
def xhtml_save(url_int):
    """Write the global ``main_xhtml`` to ./NNNN.xhtml as UTF-8.

    *url_int* is the issue number; it is zero-padded with the module-level
    ``format`` pattern to build the file name.

    Fixes versus the original: the file object no longer shadows the
    function's own name, and ``with`` guarantees the handle is closed
    even if write() raises.
    """
    url_no_str = format % url_int
    with codecs.open("./" + url_no_str + ".xhtml", "w", "utf-8") as out:
        out.write(main_xhtml)

# -- Main loop: fetch issues No.907 .. No.1041 and save each as XHTML. --
url_no_int = 907
url_no_max = 1042
while url_no_int < url_no_max:
    url_make(url_no_int)
    url_no_str = format % url_no_int

    try:
        url_beautiful(url)
        make_html()
        xhtml_save(url_no_int)
        # parenthesised print works identically on Python 2 and 3;
        # "Complite" typo and missing space fixed in both messages
        print("OK, Complete No." + url_no_str + " data.")
    except urllib2.HTTPError:
        # a missing issue page is simply skipped
        print("Don't Complete No." + url_no_str + " data.")

    url_no_int += 1

ゲスト



トラックバック - http://bugrammer.g.hatena.ne.jp/nisemono_san/20110323