Scraping a specified WeChat public account's articles from Chuansong (chuansong.me)
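
This Python 2.7 script crawls the article list of a given WeChat public account on Chuansong (chuansong.me), downloads each article as a local HTML file, and inserts a record for it into MySQL. It requires BeautifulSoup (with the html5lib parser) and pymysql.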

#!/usr/bin/python2.7
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import urllib2
import time
import csv
import sys,getopt,os
import pymysql

# Get the directory of the running script
def get_cur_file_dir():
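    # sys.path[0] is the directory containing the script that launched the interpreter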
    path = sys.path[0]
    if os.path.isdir(path):
        return path
    elif os.path.isfile(path):
        return os.path.dirname(path)

# Fetch a URL with up to three retries; returns the response object or False
def open_url(url):
    req = urllib2.Request(url)
    req.add_header('User-agent', 'Mozilla 5.10')
    # Try up to three times
    for i in range(0, 3):
        try:
            xhtml = urllib2.urlopen(req)
            return xhtml
        except urllib2.HTTPError,e:    # HTTPError must be caught before URLError
            print "The server couldn't fulfill the request"
            print "Error code:",e.code
            # Retry only on 503 (service unavailable)
            if e.code!=503:
                return False
            time.sleep(5)
            print("try again")
        except urllib2.URLError,e:
            print "Failed to reach the server"
            print "The reason:",e.reason
            # URLError carries no HTTP status code, so just wait and retry
            time.sleep(5)
            print("try again")
    
    return False

# Process an article page: save it locally and record it in MySQL
def down_content(content_url,path_url):
    xhtml=open_url(content_url)
    # Fetching the page failed
    if False == xhtml :
        return False

    # Parse the page content
    soup = BeautifulSoup(xhtml, "html5lib")
    titleH2 = soup.find("h2", id="activity-name")
    if None == titleH2:
        return False
    title = titleH2.string.encode('utf-8')
    string_time = soup.find("em", id="post-date").string.encode('utf-8')
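    # Convert the 'YYYY-MM-DD' post date into a Unix timestamp (local time)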
    num_time = int(time.mktime(time.strptime(string_time,'%Y-%m-%d')))
    keywords = str(soup.find(attrs={"name":"keywords"})['content'].encode('utf8','ignore'))
    description = str(soup.find(attrs={"name":"description"})['content'].encode('utf8','ignore'))
    content = soup.find_all("div", class_="rich_media_content")
    
    if len(content) < 1 :
        print("      "+"no contet")
        return False
    
    # Save an HTML snapshot of the article to disk
    html = """
<!doctype html>
<html>
<head>
<meta charset="utf-8">
<title>"""+title+"""</title>
<meta name="keywords" content=\""""+keywords+"""\">
<meta name="description" content=\""""+description+"""\">
</head>
<body>
    <div id="body">
    <h1>"""+title+"""</h1>
    <div id="string_time">"""+string_time+""" </div><div id="num_time">"""+str(num_time)+"""</div>
    <div id="content">
    """+str(content[0])+"""
    </div>
    </div>
</body>
<script type="text/javascript" src="js/reimg.js"></script>
</html>
    """
        
    f=open(path_url,"w")
    f.write(html)
    f.close()
    
    # Insert the archive record into the database
    cur.execute("INSERT INTO archive (category,category_parents,title,summary,addtime,uptime) VALUES (27,\"0,12,27,\",%s,%s,%s,%s)",(title.strip(),description.strip(),num_time,num_time))
    #print cur.description
    #print "ID of last record is ", int(cur.lastrowid) # primary key of the last inserted row
    #print "ID of inserted record is ", int(conn.insert_id()) # insert_id() must be read before conn.commit(), otherwise it returns 0
    lastid = int(cur.lastrowid)
    
    cur.execute("INSERT INTO archive_article (archive,intro,content) VALUE (%s,'',%s)",(lastid, str(content[0])))
    
    cur.connection.commit()
    
    return True

# Process one list page; returns False when crawling should stop
def down_list(list_url):
    # Fetch the list page
    xhtml=open_url(list_url)
    if False == xhtml :
        return False

    # Collect article links and the next-page link
    soup = BeautifulSoup(xhtml, "html5lib")
    title = soup.title.string.encode('utf-8')
    li_a = soup.find_all("a", class_="question_link")
    next_list = soup.find_all("a", text="下一页")
    
    # Log downloaded articles to a CSV file
    writer = csv.writer(open(datapath+'list.csv', 'a+b'))
    x = 0
    y = 0
    # Loop over the article links and fetch each page
    print(list_url+" start")
    for i in range(0, len(li_a)):
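        # hrefs take the form /n/<id>; stripping the first three characters leaves the id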
        content_id = li_a[i]['href'].encode('utf-8')[3:]
        content_title = li_a[i].string.encode('utf-8')
        content_url = "http://chuansong.me"+li_a[i]['href'].encode('utf-8')
        path_url = datapath+content_id+".html"
        
        if not os.path.exists(path_url):
            # Fetching failed; move on to the next article
            if False == down_content(content_url,path_url) :
                print("  "+str(x)+content_url+" down fail")
                continue
                #return False
                
            print("  "+str(x)+content_url+" down end")
            # Log the downloaded article to CSV
            writer.writerow([content_id, content_title, content_url])
            # Pause periodically to throttle requests
            x=x+1
            if x%2 == 1 :
                time.sleep(3)
            time.sleep(1)
        else:
            print("  "+content_url+" exist")
            y=y+1
            # Stop after finding three already-downloaded articles
            if y>2 :
                return False
    print(list_url+" end")
    
    # No next list page
    if len(next_list) < 1 :
        return False

    # print("next "+next_list[0]['href'].encode('utf-8')+"\n")
    return True
    
# Crawl all list pages of a public account
def get_list(wechart):
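    # chuansong.me paginates account lists via a ?start= offset, 12 articles per page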
    start=0
    # Keep fetching list pages
    while True:
        if start==0:
            url = 'http://chuansong.me/account/'+wechart
        else:
            url = 'http://chuansong.me/account/'+wechart+'?start='+str(start)
        
        # Stop when finished or after more than 2000 items
        start+=12
        if False == down_list(url) or start>2000:
            break

        time.sleep(1)
        
    print("get_list end")

# Print usage help
def usage():
    help = """
-d temp dir,default: """+get_cur_file_dir()+"""
-w wechart,default: xingdongpai77
-u mysql user,default: root
-p mysql pwd,default: 
-h,--help for help
"""
    print help
    
if __name__ == "__main__":
    try:
        opts, args = getopt.getopt(sys.argv[1:], "d:w:u:p:h", ["help"])
    except getopt.GetoptError, e:
        print e
        usage()
        sys.exit(2)
    arg_dir = get_cur_file_dir()
    arg_wechart = 'xingdongpai77'
    arg_user = 'root'
    arg_pwd = ''
    for op, value in opts:
        if op == "-d":
            arg_dir = value
        elif op == "-w":
            arg_wechart = value
        elif op == "-u":
            arg_user = value
        elif op == "-p":
            arg_pwd = value
        elif op == "-h" or op == "--help":
            usage()
            sys.exit()

    print time.strftime("%Y-%m-%d %H:%M:%S")

    # Initialize the data directory
    datapath = arg_dir+'/data/'
    if not os.path.exists(datapath):
        os.makedirs(datapath)

    # Initialize the database connection
    try:
        conn = pymysql.connect(host='127.0.0.1', port=3306, user=arg_user, passwd=arg_pwd, db='mysql')
        cur = conn.cursor()
        cur.execute("SET NAMES utf8")
        cur.execute("USE x")
    except pymysql.Error, e:
        print __file__, e
        usage()
        sys.exit()

    # Start crawling
    get_list(arg_wechart)
    
    # Close the database
    cur.close()
    conn.close()
    
    # xtime = time.strftime("%Y-%m-%d %H:%M:%S")
    # xday = time.strftime("%Y-%m-%d")
    # f=file(datapath+xtime+".html","w+")
    # f.write(body)
    # f.close()
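
The two INSERTs above assume pre-existing archive and archive_article tables. Their schema is not shown in this post; the sketch below is inferred purely from the columns the queries reference, so adjust names, types, and indexes to your actual setup:

# -*- coding: utf-8 -*-
# Inferred sketch of the tables -- only the columns the INSERTs touch,
# not the post's actual schema.
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='', db='x')
cur = conn.cursor()
cur.execute("""CREATE TABLE IF NOT EXISTS archive (
    id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
    category INT NOT NULL,
    category_parents VARCHAR(255) NOT NULL,
    title VARCHAR(255) NOT NULL,
    summary TEXT,
    addtime INT NOT NULL,
    uptime INT NOT NULL
) DEFAULT CHARSET=utf8""")
cur.execute("""CREATE TABLE IF NOT EXISTS archive_article (
    archive INT UNSIGNED NOT NULL,
    intro TEXT,
    content MEDIUMTEXT
) DEFAULT CHARSET=utf8""")
conn.commit()
cur.close()
conn.close()

With the tables in place, a typical run looks like:

python chuansong.py -d /tmp/chuansong -w xingdongpai77 -u root -p secret

(the script filename and the password here are placeholders; the post does not name the file, and -p defaults to empty).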

Tags: beautifulsoup, WeChat public account, Chuansong, data scraping
