php爬虫爬取数据并存储至数据库

准备:php环境,phpspider请自行下载

下载地址:https://github.com/owner888/phpspider

文档:https://doc.phpspider.org/configs-members.html

在 phpspider 的 demo 目录下新建一个 PHP 文件 my_spider.php,把下面的代码复制进去,然后在命令行下执行 php my_spider.php,即可看到打印信息。数据库表根据自己的需要建一两个字段测试一下即可,这里就不给出数据库 SQL 文件了。

直接上代码:

 <?php
    require_once __DIR__ . '/../autoloader.php';
    use phpspider\core\requests;
    use phpspider\core\selector;
    use \phpspider\core\db;
    use \phpspider\core\queue;
    /* Do NOT delete this comment */
    /* 不要删除这段注释 */
    // ---- MySQL -------------------------------------------------------------
    // Connection parameters for the "news" database on the local server.
    $db_config = [
        'host' => '127.0.0.1',
        'port' => 3306,
        'user' => 'root',
        'pass' => 'root',
        'name' => 'news',
    ];

    // Register the parameters above under the connection name "default".
    $db = db::set_connect('default', $db_config);

    // Explicit connection initialisation was left disabled by the author:
    //db::init_mysql();

    // ---- Redis -------------------------------------------------------------
    // Connection parameters for the local Redis server.
    $redis_config = [
        'host'    => '127.0.0.1',
        'port'    => 6379,
        'pass'    => '',
        'db'      => 5,
        'prefix'  => 'phpspider',
        'timeout' => 30,
    ];

    // Register the Redis connection, then switch to the database index that
    // is configured above (5).
    queue::set_connect('redis', $redis_config);
    queue::select($redis_config['db']);
    
    // Scrape the Hupu NBA news listing and build one record per article.
    //
    // For every article link found on the listing page the detail page is
    // fetched and a row is assembled with: url, title, cover, imgs (JSON),
    // content, create_time, update_time, source, channel_id, status, type,
    // admin_id and tag_id. The collected rows are dumped and the script exits.
    $html = requests::get('https://voice.hupu.com/nba');

    // Collect the article links from the listing page.
    $url = selector::select($html, "//div[@class='voice-main']//li//h4//a/@href");
    // The 'imgs' handling below already treats a select() result as "array or
    // single value", so normalise here too: iterating a scalar or an empty
    // result with foreach would only raise a warning and scrape nothing.
    if (!is_array($url)) {
        $url = ($url === null || $url === false || $url === '') ? [] : [$url];
    }
    //$url = array_reverse($url);

    // Loop-invariant lookup data, defined once instead of on every iteration.
    // Author ids to pick from at random.
    $admin_arr = [23, 24, 25, 26];
    // Fallback tag ids, used when no keyword rule matches.
    $tag_arr = [5, 13];
    // Keyword rules, evaluated in this order; the first rule with a keyword
    // that occurs in the article content decides the tag id.
    $tag_rules = [
        103 => ['采访', '记者', '报道'],
        3   => ['伤'],
        1   => ['签下', '签约', '引援', '转会'],
    ];

    // Result rows, keyed like the link list.
    $spider_data = [];
    foreach ($url as $key => $v) {
        // Fetch the article page behind the link.
        $detail_html = requests::get($v);

        $row = [];
        // Source address of the article.
        $row['url'] = $v;
        // Headline.
        $row['title'] = selector::select($detail_html, "/html/body/div[4]/div[1]/div[1]/h1");
        // Cover image.
        $row['cover'] = selector::select($detail_html, "/html/body/div[4]/div[1]/div[2]/div/div[1]/img/@src");

        // All images of the article, always stored as a JSON array.
        $imgs = selector::select($detail_html, "/html/body/div[4]/div[1]//img/@src");
        $row['imgs'] = json_encode(is_array($imgs) ? $imgs : [$imgs]);

        // Article body.
        $row['content'] = selector::select($detail_html, "/html/body/div[4]/div[1]/div[2]/div");

        // Publication time.
        // NOTE(review): the original expressions here were `//*[@pubtime_baidu"]`
        // and `//*[@source_baidu"]/a`, which are not valid XPath (dangling quote,
        // no attribute comparison). `@id=` is the presumed intended attribute --
        // confirm against the live page markup.
        $pub_time = selector::select($detail_html, "//*[@id='pubtime_baidu']");
        // strtotime() only accepts a string; anything else yields false, which is
        // also what a failed parse returns.
        $timestamp = is_string($pub_time) ? strtotime(trim($pub_time)) : false;
        $row['create_time'] = $timestamp;
        $row['update_time'] = $timestamp;

        // News source.
        $row['source'] = selector::select($detail_html, "//*[@id='source_baidu']/a");

        // Fixed channel id, status and type for every scraped article.
        $row['channel_id'] = 2;
        $row['status'] = 1;
        $row['type'] = 1;

        // Random author id.
        $row['admin_id'] = $admin_arr[array_rand($admin_arr)];

        // Flatten the content to one string for keyword matching only; the stored
        // 'content' value is left exactly as the selector returned it.
        if (is_array($row['content'])) {
            $content_text = implode('', $row['content']);
        } elseif (is_string($row['content'])) {
            $content_text = $row['content'];
        } else {
            $content_text = '';
        }

        // Tag id: first matching keyword rule wins. Every strpos() result is
        // compared with `!== false` so a keyword at offset 0 counts as a match.
        $tag_id = null;
        foreach ($tag_rules as $candidate_tag => $keywords) {
            foreach ($keywords as $keyword) {
                if (strpos($content_text, $keyword) !== false) {
                    $tag_id = $candidate_tag;
                    break 2;
                }
            }
        }
        if ($tag_id === null) {
            // No rule matched: pick one of the fallback tags at random.
            $tag_id = $tag_arr[array_rand($tag_arr)];
        }
        $row['tag_id'] = $tag_id;

        $spider_data[$key] = $row;
    }
    var_dump($spider_data);exit;

本博客爬取的是虎扑资讯信息,仅供技术学习与交流;如因爬取导致目标网站瘫痪,与本博客无关,本人不承担任何责任。