当初不知道在哪里看到一篇文章,说可以用 PHP 写爬虫(证明 PHP 是世界上最好的语言?)……然后脑子一热,真的用 PHP 写了个爬虫,结果感觉并不好用(可能是自身技术的原因吧)。
可能是自己对 PHP 还不太熟悉,不知道怎么用多线程。
存了空间里面的说说,评论,赞,没有采集转发的,因为我想通过评论和赞来判断是否为共同好友,虽然不是很精确
当然还可以来查询其他的数据,这个就看你怎么来玩了
需要自己配置 cookie、空间的 g_tk,还有你的 QQ 账号(起始账号)。
设置了只爬 3 圈(起始账号算第 1 圈,代码里递归深度达到 2 就停止),通过点赞的人进入下一圈。
大概过程是这样:
每爬到一条数据就保存到数据库
爬了一波下来.....数据好多没有抓到,可能是PHP CURL扩展库的原因,还有学校网络...断了N次= =
贴上代码
这个是采集的文件
<?php
/**
 *============================
 * author:Farmer
 * time:4:55:49 PM
 * blog:blog.icodef.com
 * function:QQFriend
 *============================
 */

// Keep the script running if the client disconnects, and remove the execution time limit.
ignore_user_abort(true);
set_time_limit(0);

include 'config.php';
include 'log.php';

// Logger shared with collection() through `global $log`; the first entry records the start timestamp.
$log = new CI_Log();
$log->write_log('error', time());

// Cookie string sent by httpsGet()/httpGet() with every request; replace the placeholder with your own.
$cookie = 'cookie需要自己更改';

// Start the crawl at depth 0; replace the placeholder with the QQ number to start from.
collection('你的QQ号', 0);
/**
 * Request a URL over HTTPS, sending the global $cookie, and return the raw response.
 *
 * CURLOPT_HEADER is enabled, so the returned text holds the response headers
 * followed by the body.
 *
 * NOTE(review): peer and host certificate verification are both disabled here,
 * so the connection is not protected against interception.
 *
 * @param string $url Address to request.
 * @return string|bool The value of curl_exec(): the response text, or false on failure.
 */
function httpsGet($url) {
    global $cookie;

    $handle = curl_init();
    curl_setopt_array($handle, array(
        CURLOPT_URL            => $url,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_SSL_VERIFYPEER => FALSE, // https
        CURLOPT_SSL_VERIFYHOST => FALSE, // https
        CURLOPT_HEADER         => true,
        CURLOPT_HTTPHEADER     => array(),
        CURLOPT_COOKIE         => $cookie,
        CURLOPT_TIMEOUT        => 10,
    ));

    return curl_exec($handle);
}
/**
 * Request a URL over plain HTTP, sending the global $cookie, and return the raw response.
 *
 * CURLOPT_HEADER is enabled, so the returned text holds the response headers
 * followed by the body.
 *
 * @param string $url Address to request.
 * @return string|bool The value of curl_exec(): the response text, or false on failure.
 */
function httpGet($url) {
    global $cookie;

    $handle = curl_init();
    curl_setopt_array($handle, array(
        CURLOPT_URL            => $url,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_HEADER         => true,
        CURLOPT_HTTPHEADER     => array(),
        CURLOPT_COOKIE         => $cookie,
        CURLOPT_TIMEOUT        => 10,
    ));

    return curl_exec($handle);
}
/**
 * Crawl one Qzone account: store its posts, their comments and their likes,
 * then recurse into every account that liked a post.
 *
 * Up to 5 pages of 20 posts are requested. Entries that carry `rt_uin`
 * (forwarded posts) are skipped. Every liker is stored through addGood(),
 * addQQMsg() and addQQ(), and is crawled one level deeper unless isFind()
 * reports it as already crawled.
 *
 * @param string|int $qq     QQ number whose feed is read.
 * @param int        $number Current recursion depth; the start account uses 0.
 * @return int|null 0 when the depth limit stops the crawl, otherwise no value.
 */
function collection($qq, $number) {
    global $log;

    // Only two levels are entered.
    if ($number >= 2) {
        return 0;
    }

    // g_tk token that belongs to the configured cookie; must be filled in by hand.
    $gtk = 'GTK需要自己更改';
    $tmpNumber = $number + 1;

    // Only the first 5 pages are read.
    for ($page = 0; $page < 5; $page++) {
        $url = 'https://h5.qzone.qq.com/proxy/domain/taotao.qq.com/cgi-bin/emotion_cgi_msglist_v6?uin=' . $qq . '&ftype=0&sort=0&pos=' . ($page * 20) . '&num=20&replynum=100&g_tk=' . $gtk . '&callback=_preloadCallback&code_version=1&format=jsonp&need_private_comment=1';

        $log->write_log('error', "开始读取文章列表: $qq $page");
        $content = httpsGet($url);
        $log->write_log('error', "读取文章列表完成: $qq $page");

        // Pull the JSON payload out of the JSONP wrapper `_preloadCallback( ... );`.
        if (!preg_match('/_preloadCallback\(([\s\S]*?)\);/', (string) $content, $listMatch)) {
            $log->write_log('error', "文章读取失败: $qq $page");
            continue;
        }

        overFind($qq);

        $json = json_decode($listMatch[1], true);
        if (!is_array($json)) {
            // The wrapper matched but the payload is not decodable JSON.
            $log->write_log('error', "文章读取失败: $qq $page");
            continue;
        }

        addQQMsg($json['logininfo']['uin'], $json['logininfo']['name']);
        $uin = $json['logininfo']['uin'];

        // Walk the posts of this page.
        for ($i = 0; $i < 30; $i++) {
            if (!isset($json['msglist'][$i]['tid'])) {
                $log->write_log('error', "文章加入数据库完成: $qq $page");
                break;
            }
            $post = $json['msglist'][$i];

            // Forwarded posts are not collected.
            if (isset($post['rt_uin'])) {
                continue;
            }

            $rt_uin = $qq;
            $tid = $post['tid'];
            addSay($tid, $rt_uin, $post['content'], $post['created_time']);

            // Comments attached to the post.
            for ($n = 0; $n < 30; $n++) {
                if (!isset($post['commentlist'][$n]['content'])) {
                    break;
                }
                $comment = $post['commentlist'][$n];
                addComment($tid, $comment['uin'], $comment['content'], $comment['create_time']);
            }

            // URL of the like list for this post.
            $goodUlr = 'http://users.cnc.qzone.qq.com/cgi-bin/likes/get_like_list_app?uin=' . $uin . '&unikey=http%3A%2F%2Fuser.qzone.qq.com%2F' . $rt_uin . '%2Fmood%2F' . $tid . '&begin_uin=0&query_count=60&if_first_page=1&g_tk=' . $gtk;

            $log->write_log('error', "开始读取赞: $qq $tid");
            $likeResponse = httpGet($goodUlr);
            $log->write_log('error', "赞读取完毕: $qq $tid");

            if (!preg_match('/_Callback\(([\s\S]*?)\);/', (string) $likeResponse, $likeMatch)) {
                $log->write_log('error', "赞读取失败: $qq $tid");
                continue;
            }

            $goodJson = json_decode($likeMatch[1], true);
            $log->write_log('error', "开始将赞加入数据库 $qq $tid");

            // Runs until the first index that has no `fuin`.
            for ($i1 = 0; ; $i1++) {
                if (!isset($goodJson['data']['like_uin_info'][$i1]['fuin'])) {
                    $log->write_log('error', "加入完毕 $qq $tid");
                    break;
                }
                $liker = $goodJson['data']['like_uin_info'][$i1];

                addGood($tid, $liker['fuin']);
                addQQMsg($liker['fuin'], $liker['nick']);
                addQQ($qq, $liker['fuin']);

                if (!isFind($liker['fuin'])) {
                    $log->write_log('error', "读取下一级 本级:$number 下一级:$tmpNumber " . $liker['fuin']);
                    collection($liker['fuin'], $tmpNumber);
                    $log->write_log('error', "读取完毕 本级:$number 下一级:$tmpNumber " . $liker['fuin']);
                }
            }
        }
    }
}
/**
 * Record that a QQ number liked a post, unless that like is already stored.
 *
 * Values are bound through prepared statements on the global $con connection
 * because they come from scraped responses.
 *
 * @param string     $tid Post id.
 * @param string|int $qq  QQ number of the liker.
 * @return bool true when the like was already in qq_good, false otherwise.
 */
function addGood($tid, $qq) {
    global $con;

    $exists = false;
    $check = mysqli_prepare($con, 'select 1 from qq_good where `tid`=? and `qq`=? limit 1');
    if ($check) {
        mysqli_stmt_bind_param($check, 'ss', $tid, $qq);
        mysqli_stmt_execute($check);
        mysqli_stmt_store_result($check);
        // A stored row means the like exists; the column values themselves are irrelevant.
        $exists = mysqli_stmt_num_rows($check) > 0;
        mysqli_stmt_close($check);
    }
    if ($exists) {
        return true;
    }

    $insert = mysqli_prepare($con, 'insert into qq_good(`tid`,`qq`) values(?,?)');
    if ($insert) {
        mysqli_stmt_bind_param($insert, 'ss', $tid, $qq);
        mysqli_stmt_execute($insert);
        mysqli_stmt_close($insert);
    }
    return false;
}
/**
 * Store a comment on a post, unless a comment by the same QQ number on that
 * post is already stored.
 *
 * Values are bound through prepared statements on the global $con connection
 * because they come from scraped responses.
 *
 * NOTE(review): the duplicate check uses only (tid, qq), so a second comment
 * by the same account on the same post is dropped — confirm this is intended.
 *
 * @param string     $tid     Post id.
 * @param string|int $qq      QQ number of the commenter.
 * @param string     $content Comment text.
 * @param string|int $time    Creation time as delivered by the feed.
 * @return bool true when a matching row already existed, false otherwise.
 */
function addComment($tid, $qq, $content, $time) {
    global $con;

    $exists = false;
    $check = mysqli_prepare($con, 'select 1 from qq_comment where `tid`=? and `qq`=? limit 1');
    if ($check) {
        mysqli_stmt_bind_param($check, 'ss', $tid, $qq);
        mysqli_stmt_execute($check);
        mysqli_stmt_store_result($check);
        $exists = mysqli_stmt_num_rows($check) > 0;
        mysqli_stmt_close($check);
    }
    if ($exists) {
        return true;
    }

    $insert = mysqli_prepare($con, 'insert into qq_comment(`tid`,`qq`,`content`,`time`) values(?,?,?,?)');
    if ($insert) {
        mysqli_stmt_bind_param($insert, 'ssss', $tid, $qq, $content, $time);
        mysqli_stmt_execute($insert);
        mysqli_stmt_close($insert);
    }
    return false;
}
/**
 * Mark a QQ number as crawled by inserting it into qq_isfind, unless it is
 * already there.
 *
 * The value is bound through prepared statements on the global $con connection.
 *
 * @param string|int $qq QQ number to mark.
 * @return bool true when the number was already marked, false otherwise.
 */
function overFind($qq) {
    global $con;

    $exists = false;
    $check = mysqli_prepare($con, 'select 1 from qq_isfind where `qq`=? limit 1');
    if ($check) {
        mysqli_stmt_bind_param($check, 's', $qq);
        mysqli_stmt_execute($check);
        mysqli_stmt_store_result($check);
        $exists = mysqli_stmt_num_rows($check) > 0;
        mysqli_stmt_close($check);
    }
    if ($exists) {
        return true;
    }

    $insert = mysqli_prepare($con, 'insert into qq_isfind(`qq`) values(?)');
    if ($insert) {
        mysqli_stmt_bind_param($insert, 's', $qq);
        mysqli_stmt_execute($insert);
        mysqli_stmt_close($insert);
    }
    return false;
}
/**
 * Tell whether a QQ number is present in qq_isfind.
 *
 * The value is bound through a prepared statement on the global $con connection.
 *
 * @param string|int $qq QQ number to look up.
 * @return bool true when a row exists, false when none exists or the lookup could not be prepared.
 */
function isFind($qq) {
    global $con;

    $check = mysqli_prepare($con, 'select 1 from qq_isfind where `qq`=? limit 1');
    if (!$check) {
        return false;
    }

    mysqli_stmt_bind_param($check, 's', $qq);
    mysqli_stmt_execute($check);
    mysqli_stmt_store_result($check);
    $exists = mysqli_stmt_num_rows($check) > 0;
    mysqli_stmt_close($check);

    return $exists;
}
/**
 * Store a post in qq_say.
 *
 * Values are bound through a prepared statement on the global $con connection.
 * A post whose `tid` is already stored is left untouched: the no-op
 * `on duplicate key update` keeps a repeated insert from raising a
 * duplicate-key error (an exception under mysqli's default error mode since
 * PHP 8.1).
 *
 * @param string     $tid     Post id (primary key of qq_say).
 * @param string|int $qq      QQ number the post is stored under.
 * @param string     $content Post text.
 * @param string|int $time    Creation time as delivered by the feed.
 * @return void
 */
function addSay($tid, $qq, $content, $time) {
    global $con;

    $insert = mysqli_prepare(
        $con,
        'insert into qq_say(`tid`,`qq`,`content`,`time`) values(?,?,?,?) on duplicate key update `tid`=`tid`'
    );
    if ($insert) {
        mysqli_stmt_bind_param($insert, 'ssss', $tid, $qq, $content, $time);
        mysqli_stmt_execute($insert);
        mysqli_stmt_close($insert);
    }
}
/**
 * Store the relation "$to_qq liked a post of $qq" in qq_friend, unless that
 * pair is already stored.
 *
 * Values are bound through prepared statements on the global $con connection.
 *
 * @param string|int $qq    QQ number being crawled.
 * @param string|int $to_qq QQ number found in its like list.
 * @return int 0 when the pair already existed, 1 otherwise.
 */
function addQQ($qq, $to_qq) {
    global $con;

    $exists = false;
    $check = mysqli_prepare($con, 'select 1 from qq_friend where `qq`=? and `to_qq`=? limit 1');
    if ($check) {
        mysqli_stmt_bind_param($check, 'ss', $qq, $to_qq);
        mysqli_stmt_execute($check);
        mysqli_stmt_store_result($check);
        $exists = mysqli_stmt_num_rows($check) > 0;
        mysqli_stmt_close($check);
    }
    if ($exists) {
        return 0;
    }

    $insert = mysqli_prepare($con, 'insert into qq_friend(`qq`,`to_qq`) values(?,?)');
    if ($insert) {
        mysqli_stmt_bind_param($insert, 'ss', $qq, $to_qq);
        mysqli_stmt_execute($insert);
        mysqli_stmt_close($insert);
    }
    return 1;
}
/**
 * Store the display name of a QQ number in qq_msg.
 *
 * Values are bound through a prepared statement on the global $con connection.
 * The crawler calls this repeatedly for the same number; the no-op
 * `on duplicate key update` keeps the first stored name and keeps a repeated
 * insert from raising a duplicate-key error (an exception under mysqli's
 * default error mode since PHP 8.1).
 *
 * @param string|int $qq   QQ number (primary key of qq_msg).
 * @param string     $name Display name.
 * @return bool Always false.
 */
function addQQMsg($qq, $name) {
    global $con;

    $insert = mysqli_prepare(
        $con,
        'insert into qq_msg(`qq`,`name`) values(?,?) on duplicate key update `qq`=`qq`'
    );
    if ($insert) {
        mysqli_stmt_bind_param($insert, 'ss', $qq, $name);
        mysqli_stmt_execute($insert);
        mysqli_stmt_close($insert);
    }
    return false;
}
配置文件 config.php
<?php
/**
 *============================
 * Stu config.php
 * author:Farmer
 * time:9:22:01
 * function: configuration file
 *============================
 */

// Database credentials and target schema.
define("DB_USER", "root");
define("DB_PWD", "");
define("DB_DATABASE", "qqdata");
define("DB_SERVER", "localhost");

// Shared connection used by query() and the crawler's storage functions.
$con = mysqli_connect(DB_SERVER, DB_USER, DB_PWD);
if (!$con) {
    // mysqli_connect_error() reports the failure of the mysqli connection attempt.
    die('Could not connect: ' . mysqli_connect_error());
}
mysqli_select_db($con, DB_DATABASE);

// Set the connection character set through the driver so escaping and the
// server agree on it.
mysqli_set_charset($con, 'utf8');

header("Content-type: text/html; charset=utf-8");
?>
<?php
/**
 * Run a SQL statement on the shared global connection $con.
 *
 * @param string $sql Statement to execute.
 * @return mysqli_result|bool The value of mysqli_query().
 */
function query($sql) {
    global $con;
    return mysqli_query($con, $sql);
}

/**
 * Fetch the next row of a query result.
 *
 * @param mysqli_result|bool $query Value returned by query().
 * @return array|null|false The value of mysqli_fetch_array(), or null when $query
 *                          is not a result set (for example a failed query).
 */
function fetch($query) {
    // A failed query yields false; handing that to mysqli_fetch_array() is a TypeError on PHP 8.
    if (!($query instanceof mysqli_result)) {
        return null;
    }
    return mysqli_fetch_array($query);
}
?>
日志类 log.php 来源网络
<?php
// Application base path; only defined when the including script has not defined it.
defined('BASEPATH') || define('BASEPATH', $_SERVER['DOCUMENT_ROOT'] . '/news/');

// Database settings (empty).
$config['db'] = array();

// Log settings: timestamp format and log directory.
$config['log'] = array(
    'log_date_format' => 'Y-m-d H:i:s',
    'log_path'        => '/logs/',
);

// Permission modes for files and directories.
define('FILE_READ_MODE', 0644);
define('FILE_WRITE_MODE', 0666);
define('DIR_READ_MODE', 0755);
define('DIR_WRITE_MODE', 0777);

/*
|--------------------------------------------------------------------------
| File Stream Modes
|--------------------------------------------------------------------------
|
| These modes are used when working with fopen()/popen()
|
*/
define('FOPEN_READ', 'rb');
define('FOPEN_READ_WRITE', 'r+b');
define('FOPEN_WRITE_CREATE_DESTRUCTIVE', 'wb');       // truncates existing file data, use with care
define('FOPEN_READ_WRITE_CREATE_DESTRUCTIVE', 'w+b'); // truncates existing file data, use with care
define('FOPEN_WRITE_CREATE', 'ab');
define('FOPEN_READ_WRITE_CREATE', 'a+b');
define('FOPEN_WRITE_CREATE_STRICT', 'xb');
define('FOPEN_READ_WRITE_CREATE_STRICT', 'x+b');
/* End of file constants.php */

// Expose the log settings as constants.
define("LOG_PATH", $config['log']['log_path']);
define("LOG_DATE_FORMAT", $config['log']['log_date_format']);
/**
 * Logging Class
 *
 * Appends timestamped messages to a per-day file `log-YYYY-MM-DD.log` inside
 * $log_path. Logging is switched off when that directory does not exist.
 *
 * @subpackage Libraries
 * @category Logging
 * @link
 */
class CI_Log {
    public $log_path = './logs/';
    public $_threshold = 4;
    public $_date_fmt = 'Y-m-d H:i:s';
    public $_enabled = TRUE;
    public $_levels = array('ERROR' => '1', 'DEBUG' => '2', 'INFO' => '3', 'ALL' => '4');

    /**
     * Constructor
     *
     * Declared as __construct() because a method named after the class is no
     * longer treated as a constructor on PHP 8.
     *
     * The LOG_PATH constant is used when it is defined and names an existing
     * directory; otherwise the default './logs/' stays in effect. (The earlier
     * check `defined(LOG_PATH)` looked up a constant named after the path's
     * value, so the configured path was never applied.)
     *
     * @access public
     */
    public function __construct()
    {
        if (defined('LOG_PATH') && is_dir(LOG_PATH)) {
            $this->log_path = LOG_PATH;
        }

        if (!is_dir($this->log_path)) {
            $this->_enabled = FALSE;
        }

        if (defined('LOG_DATE_FORMAT')) {
            $this->_date_fmt = LOG_DATE_FORMAT;
        }
    }

    // --------------------------------------------------------------------
    /**
     * Write Log File
     *
     * $level carries no default: a default in front of the required $msg could
     * never be used and is deprecated since PHP 8.0.
     *
     * @access public
     * @param string $level     the error level (a key of $_levels, case-insensitive)
     * @param string $msg       the error message
     * @param bool   $php_error whether the error is a native PHP error (not used here)
     * @return bool TRUE when the line was written, FALSE when logging is disabled,
     *              the level is unknown or above the threshold, or the file cannot be opened
     */
    public function write_log($level, $msg, $php_error = FALSE)
    {
        if ($this->_enabled == FALSE) {
            return FALSE;
        }

        $level = strtoupper($level);
        if (!isset($this->_levels[$level]) || ($this->_levels[$level] > $this->_threshold)) {
            return FALSE;
        }

        $filepath = $this->log_path . 'log-' . date('Y-m-d') . '.log';

        // Opening is best-effort: a log file that cannot be opened only disables this write.
        if (!$fp = @fopen($filepath, FOPEN_WRITE_CREATE)) {
            return FALSE;
        }

        // One entry per line; the terminator must be a real newline.
        $message = $level . ' ' . (($level == 'INFO') ? ' -' : '-') . ' '
            . date($this->_date_fmt) . ' --> ' . $msg . "\n";

        flock($fp, LOCK_EX);
        fwrite($fp, $message);
        flock($fp, LOCK_UN);
        fclose($fp);

        @chmod($filepath, FILE_WRITE_MODE);
        return TRUE;
    }
}
// END Log Class
/* End of file Log.php */
// END Log Class
/* End of file Log.php */
数据库结构
/*
Navicat MySQL Data Transfer
Source Server : localhost_3306
Source Server Version : 50505
Source Host : localhost:3306
Source Database : qqdata
Target Server Type : MYSQL
Target Server Version : 50505
File Encoding : 65001
Date: 2016-11-30 14:36:15
*/
-- Schema of the `qqdata` database that the crawler writes to.
-- NOTE(review): every table defaults to latin1 while the text columns are utf8;
-- confirm this mix is intended.
SET FOREIGN_KEY_CHECKS=0;
-- ----------------------------
-- Table structure for qq_comment
-- ----------------------------
-- Written by addComment(): one row per stored comment
-- (post id, commenter QQ number, comment text, creation time).
DROP TABLE IF EXISTS `qq_comment`;
CREATE TABLE `qq_comment` (
`tid` varchar(255) NOT NULL,
`qq` varchar(255) NOT NULL,
`content` varchar(255) CHARACTER SET utf8 NOT NULL,
`time` bigint(255) NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=latin1;
-- ----------------------------
-- Table structure for qq_friend
-- ----------------------------
-- Written by addQQ(): `qq` is the crawled account, `to_qq` an account found
-- in the like list of one of its posts.
DROP TABLE IF EXISTS `qq_friend`;
CREATE TABLE `qq_friend` (
`qq` varchar(255) DEFAULT NULL,
`to_qq` varchar(255) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=latin1;
-- ----------------------------
-- Table structure for qq_good
-- ----------------------------
-- Written by addGood(): one row per like (post id, QQ number of the liker).
DROP TABLE IF EXISTS `qq_good`;
CREATE TABLE `qq_good` (
`tid` varchar(255) NOT NULL,
`qq` varchar(255) NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=latin1;
-- ----------------------------
-- Table structure for qq_isfind
-- ----------------------------
-- Written by overFind() and read by isFind(): QQ numbers that were already crawled.
DROP TABLE IF EXISTS `qq_isfind`;
CREATE TABLE `qq_isfind` (
`qq` varchar(255) NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=latin1;
-- ----------------------------
-- Table structure for qq_msg
-- ----------------------------
-- Written by addQQMsg(): QQ number and its display name; `qq` is the primary
-- key, so each number is stored once.
DROP TABLE IF EXISTS `qq_msg`;
CREATE TABLE `qq_msg` (
`qq` bigint(20) NOT NULL,
`name` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,
PRIMARY KEY (`qq`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1;
-- ----------------------------
-- Table structure for qq_say
-- ----------------------------
-- Written by addSay(): one row per post (post id, QQ number, text, creation
-- time); `tid` is the primary key.
DROP TABLE IF EXISTS `qq_say`;
CREATE TABLE `qq_say` (
`tid` varchar(255) NOT NULL,
`qq` varchar(255) NOT NULL,
`content` varchar(255) CHARACTER SET utf8 NOT NULL,
`time` bigint(20) NOT NULL,
PRIMARY KEY (`tid`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1;
文章评论