PHP写的QQ空间爬虫...感觉脑子进水

当初不知道在哪里看到一篇文章,说可以用 PHP 写爬虫(证明 PHP 是世界上最好的语言?)……然后脑子就进水了……真用 PHP 写了爬虫,感觉确实不太好用……(可能是自身技术的原因吧)
可能是自己对 PHP 还不够熟悉,不知道怎么用多线程。
保存了空间里面的说说、评论和赞,没有采集转发的内容,因为我想通过评论和赞来判断是否为共同好友,虽然不是很精确。

当然还可以来查询其他的数据,这个就看你怎么来玩了
需要自己去配置 cookie、空间的 g_tk,还有你的 QQ 账号(起始账号)。
设置了只爬3圈,通过赞来继续进入下一圈
大概过程是这样:
《PHP写的QQ空间爬虫...感觉脑子进水》

每爬到一条数据就保存到数据库

爬了一波下来…..数据好多没有抓到,可能是PHP CURL扩展库的原因,还有学校网络…断了N次= =

《PHP写的QQ空间爬虫...感觉脑子进水》

《PHP写的QQ空间爬虫...感觉脑子进水》

贴上代码
这个是采集的文件

<?php
/**
 *============================
 * author: Farmer
 * time: 4:55:49 PM
 * blog: blog.icodef.com
 * function: QQFriend
 *============================
 *
 * Entry point of the crawler: loads the database and logging helpers, then
 * starts collecting from the seed account at depth 0.
 */
ignore_user_abort ( true ); // keep running after the client disconnects
set_time_limit ( 0 ); // no execution time limit

// require (not include): without these files nothing below can work, so stop
// at the real cause instead of failing later on an undefined function/class.
require 'config.php';
require 'log.php';

$log = new CI_Log ();
$log->write_log ( 'error', time () ); // mark the start of the run with a timestamp

// Placeholder: replace with the cookie of a logged-in Qzone session.
$cookie = 'cookie需要自己更改';
// Placeholder: replace with the QQ number to start from.
collection ( '你的QQ号', 0 );
/**
 * Fetch a URL over HTTPS, sending the global $cookie, and return the raw response.
 *
 * Because CURLOPT_HEADER is enabled, the returned string holds the response
 * headers followed by the body.
 *
 * NOTE(review): peer and host verification are switched off, so the TLS
 * certificate is not checked — confirm this is acceptable before reuse.
 *
 * @param string $url address to request
 * @return string|bool whatever curl_exec() returns (false when the transfer fails)
 */
function httpsGet($url) {
    global $cookie;
    $handle = curl_init ();
    curl_setopt_array ( $handle, array (
        CURLOPT_URL => $url,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_SSL_VERIFYPEER => FALSE, // https
        CURLOPT_SSL_VERIFYHOST => FALSE, // https
        CURLOPT_HEADER => true,
        CURLOPT_HTTPHEADER => array (), // no extra request headers
        CURLOPT_COOKIE => $cookie,
        CURLOPT_TIMEOUT => 10
    ) );
    return curl_exec ( $handle );
}
/**
 * Fetch a URL over plain HTTP, sending the global $cookie, and return the raw response.
 *
 * Because CURLOPT_HEADER is enabled, the returned string holds the response
 * headers followed by the body.
 *
 * @param string $url address to request
 * @return string|bool whatever curl_exec() returns (false when the transfer fails)
 */
function httpGet($url) {
    global $cookie;
    $handle = curl_init ();
    curl_setopt_array ( $handle, array (
        CURLOPT_URL => $url,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_HEADER => true,
        CURLOPT_HTTPHEADER => array (), // no extra request headers
        CURLOPT_COOKIE => $cookie,
        CURLOPT_TIMEOUT => 10
    ) );
    return curl_exec ( $handle );
}

/**
 * Crawl the Qzone feed of one account and recurse into the accounts that liked its posts.
 *
 * For up to 5 pages of 20 posts each, every original (non-repost) post is stored
 * together with its comments and likes, a qq -> liker edge is recorded for each
 * like, and the function calls itself for every liker that isFind() does not
 * report as already visited.
 *
 * @param string|int $qq     account whose feed is read
 * @param int        $number current depth; nothing is crawled once it reaches 2
 * @return int|null 0 when the depth limit stops the crawl, otherwise no value
 */
function collection($qq, $number) {
    global $log;
    if ($number >= 2) { // only go two levels deep
        return 0;
    }
    // Placeholder: replace with the g_tk value that belongs to the configured cookie.
    $gtk = 'GTK需要自己更改';
    $tmpNumber = $number + 1; // depth handed to the recursive calls

    for($page = 0; $page < 5; $page ++) { // only read 5 pages
        $url = 'https://h5.qzone.qq.com/proxy/domain/taotao.qq.com/cgi-bin/emotion_cgi_msglist_v6?uin=' . $qq . '&ftype=0&sort=0&pos=' . ($page * 20) . '&num=20&replynum=100&g_tk=' . $gtk . '&callback=_preloadCallback&code_version=1&format=jsonp&need_private_comment=1';
        $log->write_log ( 'error', "开始读取文章列表: $qq $page" ); // log: start reading the post list
        $content = httpsGet ( $url );
        $log->write_log ( 'error', "读取文章列表完成: $qq $page" ); // log: post list fetched

        // The response is JSONP: _preloadCallback(<json>); — capture the JSON argument.
        if (! preg_match ( '/_preloadCallback\(([\s\S]*?)\);/', ( string ) $content, $listMatch )) {
            $log->write_log ( 'error', "文章读取失败: $qq $page" ); // log: reading the posts failed
            continue;
        }
        overFind ( $qq ); // mark this account as visited
        $json = json_decode ( $listMatch [1], true );
        if (! is_array ( $json )) {
            // The payload was not valid JSON; treat it like a failed read.
            $log->write_log ( 'error', "文章读取失败: $qq $page" );
            continue;
        }
        $uin = $json ['logininfo'] ['uin'];
        addQQMsg ( $uin, $json ['logininfo'] ['name'] );

        for($i = 0; $i < 30; $i ++) { // walk the posts on this page
            if (! isset ( $json ['msglist'] [$i] ['tid'] )) {
                $log->write_log ( 'error', "文章加入数据库完成: $qq $page" ); // log: all posts of the page stored
                break;
            }
            $post = $json ['msglist'] [$i];
            if (isset ( $post ['rt_uin'] )) {
                continue; // reposts are not collected
            }
            $rt_uin = $qq;
            $tid = $post ['tid'];
            addSay ( $tid, $rt_uin, $post ['content'], $post ['created_time'] );

            for($n = 0; $n < 30; $n ++) { // comments of this post
                if (! isset ( $post ['commentlist'] [$n] ['content'] )) {
                    break;
                }
                $comment = $post ['commentlist'] [$n];
                addComment ( $tid, $comment ['uin'], $comment ['content'], $comment ['create_time'] );
            }

            // URL of the like list for this post
            $goodUlr = 'http://users.cnc.qzone.qq.com/cgi-bin/likes/get_like_list_app?uin=' . $uin . '&unikey=http%3A%2F%2Fuser.qzone.qq.com%2F' . $rt_uin . '%2Fmood%2F' . $tid . '&begin_uin=0&query_count=60&if_first_page=1&g_tk=' . $gtk;
            $log->write_log ( 'error', "开始读取赞: $qq $tid" ); // log: start reading likes
            $likeResponse = httpGet ( $goodUlr );
            $log->write_log ( 'error', "赞读取完毕: $qq $tid" ); // log: likes fetched
            if (! preg_match ( '/_Callback\(([\s\S]*?)\);/', ( string ) $likeResponse, $likeMatch )) {
                $log->write_log ( 'error', "赞读取失败: $qq $tid" ); // log: reading likes failed
                continue;
            }
            $goodJson = json_decode ( $likeMatch [1], true );
            $log->write_log ( 'error', "开始将赞加入数据库 $qq $tid" ); // log: start storing likes
            for($i1 = 0;; $i1 ++) {
                if (! isset ( $goodJson ['data'] ['like_uin_info'] [$i1] ['fuin'] )) {
                    $log->write_log ( 'error', "加入完毕 $qq $tid" ); // log: likes stored
                    break;
                }
                $liker = $goodJson ['data'] ['like_uin_info'] [$i1];
                $fuin = $liker ['fuin'];
                addGood ( $tid, $fuin );
                addQQMsg ( $fuin, $liker ['nick'] );
                addQQ ( $qq, $fuin );
                if (! isFind ( $fuin )) {
                    $log->write_log ( 'error', "读取下一级 本级:$number 下一级:$tmpNumber " . $fuin ); // log: descending one level
                    collection ( $fuin, $tmpNumber );
                    $log->write_log ( 'error', "读取完毕 本级:$number 下一级:$tmpNumber " . $fuin ); // log: back from the next level
                }
            }
        }
    }
}
/**
 * Record that account $qq liked post $tid, unless that like is already stored.
 *
 * Uses the global mysqli connection $con. Both values originate from remote
 * JSON, so they are bound as statement parameters instead of being spliced
 * into the SQL text. Existence is decided by whether a row came back, not by
 * comparing the first column (the string tid) with 0.
 *
 * @param string $tid post id
 * @param string $qq  account that liked the post
 * @return bool true when the like was already in qq_good, false when an insert was attempted
 */
function addGood($tid, $qq) {
    global $con;
    $tid = ( string ) $tid;
    $qq = ( string ) $qq;

    $check = mysqli_prepare ( $con, 'select 1 from qq_good where `tid`=? and `qq`=? limit 1' );
    if ($check) {
        mysqli_stmt_bind_param ( $check, 'ss', $tid, $qq );
        mysqli_stmt_execute ( $check );
        mysqli_stmt_store_result ( $check ); // buffer the result so num_rows is known
        $exists = mysqli_stmt_num_rows ( $check ) > 0;
        mysqli_stmt_close ( $check );
        if ($exists) {
            return true;
        }
    }

    $insert = mysqli_prepare ( $con, 'insert into qq_good(`tid`,`qq`) values(?,?)' );
    if ($insert) {
        mysqli_stmt_bind_param ( $insert, 'ss', $tid, $qq );
        mysqli_stmt_execute ( $insert );
        mysqli_stmt_close ( $insert );
    }
    return false;
}
/**
 * Store a comment on post $tid, unless a comment by $qq on that post is already stored.
 *
 * Uses the global mysqli connection $con. The comment text comes from remote
 * JSON and may contain quotes, so every value is bound as a statement
 * parameter. Existence is decided by whether a row came back, not by
 * comparing the first column (the string tid) with 0.
 *
 * @param string     $tid     post id
 * @param string     $qq      account that wrote the comment
 * @param string     $content comment text
 * @param int|string $time    comment timestamp as delivered by the feed
 * @return bool true when a comment by $qq on $tid already existed, false when an insert was attempted
 */
function addComment($tid, $qq, $content, $time) {
    global $con;
    $tid = ( string ) $tid;
    $qq = ( string ) $qq;
    $content = ( string ) $content;
    $time = ( string ) $time;

    $check = mysqli_prepare ( $con, 'select 1 from qq_comment where `tid`=? and `qq`=? limit 1' );
    if ($check) {
        mysqli_stmt_bind_param ( $check, 'ss', $tid, $qq );
        mysqli_stmt_execute ( $check );
        mysqli_stmt_store_result ( $check ); // buffer the result so num_rows is known
        $exists = mysqli_stmt_num_rows ( $check ) > 0;
        mysqli_stmt_close ( $check );
        if ($exists) {
            return true;
        }
    }

    $insert = mysqli_prepare ( $con, 'insert into qq_comment(`tid`,`qq`,`content`,`time`) values(?,?,?,?)' );
    if ($insert) {
        mysqli_stmt_bind_param ( $insert, 'ssss', $tid, $qq, $content, $time );
        mysqli_stmt_execute ( $insert );
        mysqli_stmt_close ( $insert );
    }
    return false;
}
/**
 * Mark account $qq as visited in qq_isfind if it is not marked yet.
 *
 * Uses the global mysqli connection $con with bound parameters, and decides
 * existence by whether a row came back (the old code read offset 0 of a null
 * row when nothing matched).
 *
 * @param string|int $qq account to mark
 * @return bool true when the account was already marked, false when an insert was attempted
 */
function overFind($qq){
    global $con;
    $qq = ( string ) $qq;

    $check = mysqli_prepare ( $con, 'select 1 from qq_isfind where `qq`=? limit 1' );
    if ($check) {
        mysqli_stmt_bind_param ( $check, 's', $qq );
        mysqli_stmt_execute ( $check );
        mysqli_stmt_store_result ( $check ); // buffer the result so num_rows is known
        $exists = mysqli_stmt_num_rows ( $check ) > 0;
        mysqli_stmt_close ( $check );
        if ($exists) {
            return true;
        }
    }

    $insert = mysqli_prepare ( $con, 'insert into qq_isfind(`qq`) values(?)' );
    if ($insert) {
        mysqli_stmt_bind_param ( $insert, 's', $qq );
        mysqli_stmt_execute ( $insert );
        mysqli_stmt_close ( $insert );
    }
    return false;
}
/**
 * Report whether account $qq is already marked as visited in qq_isfind.
 *
 * Uses the global mysqli connection $con with a bound parameter, and decides
 * existence by whether a row came back (the old code read offset 0 of a null
 * row when nothing matched).
 *
 * @param string|int $qq account to look up
 * @return bool true when a row for $qq exists, false otherwise (also when the lookup cannot be prepared)
 */
function isFind($qq) {
    global $con;
    $qq = ( string ) $qq;

    $check = mysqli_prepare ( $con, 'select 1 from qq_isfind where `qq`=? limit 1' );
    if (! $check) {
        return false;
    }
    mysqli_stmt_bind_param ( $check, 's', $qq );
    mysqli_stmt_execute ( $check );
    mysqli_stmt_store_result ( $check ); // buffer the result so num_rows is known
    $exists = mysqli_stmt_num_rows ( $check ) > 0;
    mysqli_stmt_close ( $check );
    return $exists;
}
/**
 * Store one post in qq_say.
 *
 * Uses the global mysqli connection $con. The post text comes from remote JSON
 * and may contain quotes, so every value is bound as a statement parameter.
 * The result of the insert is not checked, as before.
 *
 * @param string     $tid     post id
 * @param string|int $qq      account the post is attributed to
 * @param string     $content post text
 * @param int|string $time    post timestamp as delivered by the feed
 * @return void
 */
function addSay($tid, $qq, $content, $time) {
    global $con;
    $tid = ( string ) $tid;
    $qq = ( string ) $qq;
    $content = ( string ) $content;
    $time = ( string ) $time;

    $insert = mysqli_prepare ( $con, 'insert into qq_say(`tid`,`qq`,`content`,`time`) values(?,?,?,?)' );
    if ($insert) {
        mysqli_stmt_bind_param ( $insert, 'ssss', $tid, $qq, $content, $time );
        mysqli_stmt_execute ( $insert );
        mysqli_stmt_close ( $insert );
    }
}
/**
 * Record the relation $qq -> $to_qq in qq_friend, unless it is already stored.
 *
 * Uses the global mysqli connection $con with bound parameters, and decides
 * existence by whether a row came back (the old code read offset 0 of a null
 * row when nothing matched).
 *
 * @param string|int $qq    account whose post was liked
 * @param string|int $to_qq account that liked it
 * @return int 0 when the relation already existed, 1 when an insert was attempted
 */
function addQQ($qq, $to_qq) {
    global $con;
    $qq = ( string ) $qq;
    $to_qq = ( string ) $to_qq;

    $check = mysqli_prepare ( $con, 'select 1 from qq_friend where `qq`=? and `to_qq`=? limit 1' );
    if ($check) {
        mysqli_stmt_bind_param ( $check, 'ss', $qq, $to_qq );
        mysqli_stmt_execute ( $check );
        mysqli_stmt_store_result ( $check ); // buffer the result so num_rows is known
        $exists = mysqli_stmt_num_rows ( $check ) > 0;
        mysqli_stmt_close ( $check );
        if ($exists) {
            return 0;
        }
    }

    $insert = mysqli_prepare ( $con, 'insert into qq_friend(`qq`,`to_qq`) values(?,?)' );
    if ($insert) {
        mysqli_stmt_bind_param ( $insert, 'ss', $qq, $to_qq );
        mysqli_stmt_execute ( $insert );
        mysqli_stmt_close ( $insert );
    }
    return 1;
}
/**
 * Store the display name of account $qq in qq_msg.
 *
 * Uses the global mysqli connection $con. Nicknames come from remote JSON and
 * may contain quotes, so both values are bound as statement parameters. The
 * result of the insert is not checked, as before.
 *
 * @param string|int $qq   account number
 * @param string     $name nickname
 * @return bool always false
 */
function addQQMsg($qq, $name) {
    global $con;
    $qq = ( string ) $qq;
    $name = ( string ) $name;

    $insert = mysqli_prepare ( $con, 'insert into qq_msg(`qq`,`name`) values(?,?)' );
    if ($insert) {
        mysqli_stmt_bind_param ( $insert, 'ss', $qq, $name );
        mysqli_stmt_execute ( $insert );
        mysqli_stmt_close ( $insert );
    }
    return false;
}
[/php]

配置文件 config.php

[php]
<?php
/**
 *============================
 * Stu config.php
 * author: Farmer
 * time: 9:22:01
 * function: configuration file
 *============================
 *
 * Defines the database credentials, opens the shared mysqli connection $con
 * and selects the database.
 */
define ( "DB_USER", "root" );
define ( "DB_PWD", "" );
define ( "DB_DATABASE", "qqdata" );
define ( "DB_SERVER", "localhost" );

$con = mysqli_connect ( DB_SERVER, DB_USER, DB_PWD );
if (! $con) {
    // mysqli_connect_error() reports why the mysqli connection failed;
    // mysql_error() belongs to the old mysql extension.
    die ( 'Could not connect: ' . mysqli_connect_error () );
}
mysqli_select_db ( $con, DB_DATABASE );
// Set the connection character set through the API so the client library knows it too.
mysqli_set_charset ( $con, 'utf8' );
header ( "Content-type: text/html; charset=utf-8" );
?>
<?php
/**
 * Run one SQL statement on the shared connection $con.
 *
 * @param string $sql statement text
 * @return mysqli_result|bool whatever mysqli_query() returns
 */
function query($sql){
    global $con;
    return mysqli_query($con, $sql);
}

/**
 * Fetch the next row of a result returned by query().
 *
 * @param mysqli_result|bool $query result handle; anything else (e.g. false from a failed query) yields null
 * @return array|null the next row, or null when there is none
 */
function fetch($query){
    if (! ($query instanceof mysqli_result)) {
        // A failed query hands us false; do not pass that on to mysqli_fetch_array().
        return null;
    }
    return mysqli_fetch_array($query);
}
?>

日志类 log.php 来源网络

<?php

// Base path of the application; only defined when the including script has not set it.
if( ! defined('BASEPATH') ) {
    define ('BASEPATH', $_SERVER['DOCUMENT_ROOT'].'/news/');
}
// Database settings
$config['db']=array(
);
// Log file directory and timestamp format
$config['log']=array(
        'log_date_format' => 'Y-m-d H:i:s',
        'log_path' =>  '/logs/'
);

// Permission modes for files and directories
define('FILE_READ_MODE', 0644);
define('FILE_WRITE_MODE', 0666);
define('DIR_READ_MODE', 0755);
define('DIR_WRITE_MODE', 0777);

/*
|--------------------------------------------------------------------------
| File Stream Modes
|--------------------------------------------------------------------------
|
| These modes are used when working with fopen()/popen()
|
*/

define('FOPEN_READ',                            'rb');
define('FOPEN_READ_WRITE',                      'r+b');
define('FOPEN_WRITE_CREATE_DESTRUCTIVE',        'wb'); // truncates existing file data, use with care
define('FOPEN_READ_WRITE_CREATE_DESTRUCTIVE',   'w+b'); // truncates existing file data, use with care
define('FOPEN_WRITE_CREATE',                    'ab');
define('FOPEN_READ_WRITE_CREATE',               'a+b');
define('FOPEN_WRITE_CREATE_STRICT',             'xb');
define('FOPEN_READ_WRITE_CREATE_STRICT',        'x+b');

/* End of file constants.php */
// Expose the log settings as constants for the logger class.
define ("LOG_PATH", $config['log']['log_path']);
define ("LOG_DATE_FORMAT", $config['log']['log_date_format']);

/**
 * Logging Class
 *
 * Appends timestamped messages to a per-day file "log-YYYY-MM-DD.log" inside
 * $log_path. Logging switches itself off when that directory does not exist.
 *
 * @subpackage  Libraries
 * @category    Logging
 * @link
 */

class CI_Log {

    /** Directory the log files are written to (with trailing slash). */
    public $log_path = './logs/';
    /** Highest level number that is still written; see $_levels. */
    public $_threshold = 4;
    /** date() format used for the timestamp of each entry. */
    public $_date_fmt  = 'Y-m-d H:i:s';
    /** FALSE when the log directory is missing; write_log() then does nothing. */
    public $_enabled   = TRUE;
    /** Known level names mapped to their numeric rank. */
    public $_levels    = array('ERROR' => '1', 'DEBUG' => '2',  'INFO' => '3', 'ALL' => '4');

    /**
     * Constructor
     *
     * Declared as __construct() because a method named after the class is no
     * longer treated as a constructor by PHP 8.
     *
     * @access  public
     */
    function __construct()
    {
        // defined() takes the constant's NAME as a string. The configured path
        // is only adopted when it is an existing directory, so an installation
        // that has been logging to the default './logs/' keeps doing so.
        if ( defined('LOG_PATH') && is_dir(LOG_PATH) )
        {
            $this->log_path = LOG_PATH;
        }
        if ( ! is_dir($this->log_path))
        {
            $this->_enabled = FALSE;
        }

        if ( defined('LOG_DATE_FORMAT') )
        {
            $this->_date_fmt = LOG_DATE_FORMAT;
        }
    }

    // --------------------------------------------------------------------

    /**
     * Write Log File
     *
     * Appends one line "<LEVEL> - <timestamp> --> <message>" to today's log file.
     *
     * @access  public
     * @param   string  the error level (case-insensitive key of $_levels)
     * @param   string  the error message
     * @param   bool    whether the error is a native PHP error (not used here)
     * @return  bool    TRUE when the line was written; FALSE when logging is
     *                  disabled, the level is unknown or above the threshold,
     *                  or the file cannot be opened
     */
    function write_log($level, $msg, $php_error = FALSE)
    {
        if ($this->_enabled == FALSE)
        {
            return FALSE;
        }

        $level = strtoupper($level);

        if ( ! isset($this->_levels[$level]) OR ($this->_levels[$level] > $this->_threshold))
        {
            return FALSE;
        }

        $filepath = $this->log_path.'log-'.date('Y-m-d').'.log';
        if ( ! $fp = @fopen($filepath, FOPEN_WRITE_CREATE))
        {
            return FALSE;
        }

        // INFO is one character shorter than ERROR/DEBUG, so it gets an extra
        // space to keep the columns aligned. Each entry ends with a newline.
        $message = $level.' '.(($level == 'INFO') ? ' -' : '-').' '.date($this->_date_fmt). ' --> '.$msg."\n";

        // Hold an exclusive lock while appending so concurrent writers do not interleave.
        flock($fp, LOCK_EX);
        fwrite($fp, $message);
        flock($fp, LOCK_UN);
        fclose($fp);

        @chmod($filepath, FILE_WRITE_MODE);

        return TRUE;
    }

}
// END Log Class

/* End of file Log.php */

数据库结构

/*
Navicat MySQL Data Transfer

Source Server         : localhost_3306
Source Server Version : 50505
Source Host           : localhost:3306
Source Database       : qqdata

Target Server Type    : MYSQL
Target Server Version : 50505
File Encoding         : 65001

Date: 2016-11-30 14:36:15
*/

SET FOREIGN_KEY_CHECKS=0;

-- ----------------------------
-- Table structure for qq_comment
-- Comments on posts. The crawler looks rows up by (tid, qq) before every
-- insert, so that pair is indexed.
-- ----------------------------
DROP TABLE IF EXISTS `qq_comment`;
CREATE TABLE `qq_comment` (
  `tid` varchar(255) NOT NULL,
  `qq` varchar(255) NOT NULL,
  `content` varchar(255) CHARACTER SET utf8 NOT NULL,
  `time` bigint(255) NOT NULL,
  KEY `idx_comment_tid_qq` (`tid`,`qq`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1;

-- ----------------------------
-- Table structure for qq_friend
-- Relations qq -> to_qq derived from likes; looked up by (qq, to_qq) before
-- every insert.
-- ----------------------------
DROP TABLE IF EXISTS `qq_friend`;
CREATE TABLE `qq_friend` (
  `qq` varchar(255) DEFAULT NULL,
  `to_qq` varchar(255) DEFAULT NULL,
  KEY `idx_friend_qq_to_qq` (`qq`,`to_qq`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1;

-- ----------------------------
-- Table structure for qq_good
-- Likes on posts; looked up by (tid, qq) before every insert.
-- ----------------------------
DROP TABLE IF EXISTS `qq_good`;
CREATE TABLE `qq_good` (
  `tid` varchar(255) NOT NULL,
  `qq` varchar(255) NOT NULL,
  KEY `idx_good_tid_qq` (`tid`,`qq`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1;

-- ----------------------------
-- Table structure for qq_isfind
-- Accounts already visited; looked up by qq for every liker.
-- ----------------------------
DROP TABLE IF EXISTS `qq_isfind`;
CREATE TABLE `qq_isfind` (
  `qq` varchar(255) NOT NULL,
  KEY `idx_isfind_qq` (`qq`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1;

-- ----------------------------
-- Table structure for qq_msg
-- Account number and nickname.
-- ----------------------------
DROP TABLE IF EXISTS `qq_msg`;
CREATE TABLE `qq_msg` (
  `qq` bigint(20) NOT NULL,
  `name` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,
  PRIMARY KEY (`qq`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1;

-- ----------------------------
-- Table structure for qq_say
-- Posts, keyed by post id.
-- ----------------------------
DROP TABLE IF EXISTS `qq_say`;
CREATE TABLE `qq_say` (
  `tid` varchar(255) NOT NULL,
  `qq` varchar(255) NOT NULL,
  `content` varchar(255) CHARACTER SET utf8 NOT NULL,
  `time` bigint(20) NOT NULL,
  PRIMARY KEY (`tid`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1;
点赞

发表评论

电子邮件地址不会被公开。 必填项已用*标注

此站点使用Akismet来减少垃圾评论。了解我们如何处理您的评论数据