使用WebMagic 编写 java 网络爬虫

admin 发布于 2020-03-18T15:07:57 评论(0) 阅读(50)

分类:Java

标签:

写这个的目的是为了爬歌词,因为喜欢听歌,遇到喜欢的歌就喜欢把歌词下载下来。

WebMagic 教程地址

http://webmagic.io/docs/zh/posts/ch1-overview/

使用 IDEA 创建 maven工程

下面为工程目录结构


下面为源代码

package bean;
 
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl;
 
import java.sql.Timestamp;
import java.util.Date;
import java.util.List;
 
/**
 * @author zhaoshenjiao
 * @Date 2017-04-18 23:12:34
 */
@TargetUrl("http://www.kuwo.cn/yinyue/*")
public class KuWoMusic {
    /**
     * 歌名
     */
//    @ExtractBy(value="div.tit em.f-ff2",type = ExtractBy.Type.Css)
    @ExtractBy("//p[@id='lrcName']/text()")
    private String name;
    /**
     * 歌手
     */
 
//    @ExtractBy(value="p.des span a",type = ExtractBy.Type.Css)
    @ExtractBy("//p[@class='artist']/span/a/text()")
    private String singer;
    /**
     * 歌词
     */
//    @ExtractBy(value="div.mCSB_container p",type = ExtractByactBy.Type.Css)
    @ExtractBy("//p[@class='lrcItem']")
    private List<String>  lyrics;
 
 
    private String  lyric;
    /**
     * 所属专辑
     */
//    @ExtractBy(value="p.des a",type = ExtractBy.Type.Css)
    @ExtractBy("//p[@class='album']/span/a/text()")
    private String album;
 
    private Timestamp recordTime;
    /**
     * 所属专辑
     */
//    @ExtractBy(value="body",type = ExtractBy.Type.Css)
//    private String body;
 
    public String getName() {
        return name;
    }
 
    public void setName(String name) {
        this.name = name;
    }
 
    public String getSinger() {
        return singer;
    }
 
    public void setSinger(String singer) {
        this.singer = singer;
    }
 
    public List<String> getLyrics() {
        return lyrics;
    }
 
    public void setLyrics(List<String> lyrics) {
        this.lyrics = lyrics;
    }
 
    public String getLyric() {
        StringBuilder sb = new StringBuilder();
        for ( String str: lyrics ) {
            sb.append(str);
        }
        return sb.toString();
    }
 
    public void setLyric(String lyric) {
        StringBuilder sb = new StringBuilder();
        for ( String str: lyrics ) {
            sb.append(str);
        }
        this.lyric = sb.toString();
    }
 
    public String getAlbum() {
        return album;
    }
 
    public void setAlbum(String album) {
        this.album = album;
    }
 
    public Timestamp getRecordTime() {
        return new Timestamp( new Date().getTime());
    }
 
    public void setRecordTime(Timestamp recordTime) {
        this.recordTime = recordTime;
    }
 
    @Override
    public String toString() {
        return "[name:"+name +",singer="+singer+",album="+album+",lyric="+lyric+"]";
    }
}


package dao;
 
import bean.KuWoMusic;
import org.apache.ibatis.annotations.Insert;
 
/**
 * @author zhaoshenjiao
 * @Date 2017-04-19 00:37:57
 */
public interface KuWoMusicDao {
    @Insert("insert into lyric (`title`,`content`,`source`,`singer`,`album`,`recorder`,`recordTime`,`curStatus`) " +
            "values (#{name},#{lyric},'酷我',#{singer},#{album},'admin',#{recordTime},'2')")
    int add(KuWoMusic kuWoMusic);
}
package dao.pipeline;
 
import dao.KuWoMusicDao;
import bean.KuWoMusic;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.PageModelPipeline;
 
/**
 * @author zhaoshenjiao
 * @Date 2017-04-19 00:42:41
 */
@Component("KuWoMusicDaoPipeline")
public class KuWoMusicDaoPipeline implements PageModelPipeline<KuWoMusic> {
 
    ApplicationContext context = new ClassPathXmlApplicationContext("root-context.xml");
    KuWoMusicDao kuWoMusicDao = (KuWoMusicDao)context.getBean("kuWoMusicDao");
//    @Resource
//    private KuWoMusicDao kuWoMusicDao;
 
    @Override
    public void process(KuWoMusic kuWoMusic, Task task) {
        //输出歌词信息
        System.out.println(kuWoMusic.toString());
        kuWoMusicDao.add(kuWoMusic);
    }
}


package execute;
 
import dao.pipeline.KuWoMusicDaoPipeline;
import bean.KuWoMusic;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.ConsolePageModelPipeline;
import us.codecraft.webmagic.model.OOSpider;
 
/**
 * 爬虫执行类
 * @author zhaoshenjiao
 * @Date 2017-04-18 23:23:43
 */
public class LyricCrawlerExecutor {
    public static void main(String[] args) {
        //保存到数据库
        OOSpider.create(
                Site.me(),
                new KuWoMusicDaoPipeline(), KuWoMusic.class)
                .addUrl("http://www.kuwo.cn/yinyue/492211?catalog=yueku2016")
                .thread(2)
                .run();
        //输出到控制台
//        OOSpider.create(
//                Site.me(),
//                new ConsolePageModelPipeline(), KuWoMusic.class)
//                .addUrl("http://www.kuwo.cn/yinyue/492211?catalog=yueku2016")
//                .thread(2)
//                .run();
 
        //测试获取bean
//        ApplicationContext context = new ClassPathXmlApplicationContext("root-context.xml");
//        KuWoMusicDao kuWoMusicDao = (KuWoMusicDao)context.getBean("kuWoMusicDao");
//
//        //包名(或者是保的完整路径)/配置文件名字(也就是xml文件)
//        ClassPathXmlApplicationContext cpx=new ClassPathXmlApplicationContext ("root-context.xml");
//
//        System.out.println(cpx.getBean("kuWoMusicDao"));
    }
}
# Root logger: one level followed by appender names.
# The previous value "INFO,DEBUG,stdout" made log4j treat DEBUG as an
# appender name, which is not defined anywhere in this file.
log4j.rootLogger=INFO,stdout

# Console appender used by the root logger.
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] -%m%n


# Uncomment to raise iBATIS / JDBC loggers to debug.
#log4j.logger.com.ibatis=debug
#log4j.logger.com.ibatis.common.jdbc.SimpleDataSource=debug
#log4j.logger.com.ibatis.common.jdbc.ScriptRunner=debug
#log4j.logger.com.ibatis.sqlmap.engine.impl.SqlMapClientDelegate=debug
#log4j.logger.java.sql.Connection=debug
#log4j.logger.java.sql.Statement=debug
#log4j.logger.java.sql.PreparedStatement=debug,stdout
————————————————
版权声明:本文为CSDN博主「静叶01」的原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接及本声明。
原文链接:https://blog.csdn.net/airujingye/article/details/70249094
<?xml version="1.0" encoding="UTF-8"?>
<!-- Spring context: data source, MyBatis session factory, transaction
     manager, the kuWoMusicDao mapper bean, and annotation-driven tx/task. -->
<beans xmlns="http://www.springframework.org/schema/beans"
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:tx="http://www.springframework.org/schema/tx"
	xmlns:task="http://www.springframework.org/schema/task"
	xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-3.2.xsd  
                http://www.springframework.org/schema/tx http://www.springframework.org/schema/tx/spring-tx-3.2.xsd
                http://www.springframework.org/schema/task http://www.springframework.org/schema/task/spring-task.xsd ">
 
	<!-- Configure the DataSource (fill in url, username and password) -->
	<bean id="dataSource" class="org.apache.commons.dbcp.BasicDataSource" destroy-method="close">
		<property name="driverClassName" value="com.mysql.jdbc.Driver" />
		<property name="url" value="jdbc:mysql://localhost:3306/dbname?characterEncoding=utf-8" />
		<property name="username" value="" />
		<property name="password" value="" />
		<property name="maxActive" value="5" />
		<property name="maxIdle" value="3" />
		<property name="maxWait" value="1000" />
		<property name="defaultAutoCommit" value="true" />
		<property name="removeAbandoned" value="true" />
		<property name="removeAbandonedTimeout" value="60" />
	</bean>
 
	<!-- Create the SqlSessionFactory and point it at the data source -->
	<bean id="sqlSessionFactory" class="org.mybatis.spring.SqlSessionFactoryBean">
		<property name="dataSource" ref="dataSource" />
	</bean>
 
	<!-- Configure Spring's transaction manager -->
	<bean id="transactionManager"
		class="org.springframework.jdbc.datasource.DataSourceTransactionManager">
		<property name="dataSource" ref="dataSource" />
	</bean>
 
	<!-- Mapper bean for dao.KuWoMusicDao; looked up by id "kuWoMusicDao" -->
	<bean id="kuWoMusicDao" class="org.mybatis.spring.mapper.MapperFactoryBean">
		<property name="mapperInterface" value="dao.KuWoMusicDao" />
		<property name="sqlSessionFactory" ref="sqlSessionFactory" />
	</bean>
 
	<!-- Annotation-driven transactions using the manager above -->
	<tx:annotation-driven transaction-manager="transactionManager" />
	<!-- Recognise @Scheduled annotations, with a scheduler pool size of 5 -->
	<task:annotation-driven scheduler="qbScheduler"	mode="proxy" />
	<task:scheduler id="qbScheduler" pool-size="5" />
</beans>

pom.xml文件

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
 
    <groupId>lyric.crawler</groupId>
    <artifactId>lyric-crawler</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <!-- Sources contain non-ASCII string literals; pin the encoding so
             the build does not depend on the platform default. -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <!-- Spring version -->
        <spring.version>4.2.0.RELEASE</spring.version>
        <!-- MyBatis version -->
        <mybatis.version>3.3.0</mybatis.version>
        <!-- MySQL connector version -->
        <mysql.version>5.1.29</mysql.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.6.1</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.6.1</version>
        </dependency>
        <!-- Spring core modules -->
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-core</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-context-support</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-oxm</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-tx</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-jdbc</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <!-- MyBatis core -->
        <dependency>
            <groupId>org.mybatis</groupId>
            <artifactId>mybatis</artifactId>
            <version>${mybatis.version}</version>
        </dependency>
        <!-- MyBatis-Spring integration -->
        <dependency>
            <groupId>org.mybatis</groupId>
            <artifactId>mybatis-spring</artifactId>
            <version>1.2.3</version>
        </dependency>
        <!-- MySQL JDBC driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>${mysql.version}</version>
        </dependency>
        <!-- Commons DBCP connection pool -->
        <dependency>
            <groupId>commons-dbcp</groupId>
            <artifactId>commons-dbcp</artifactId>
            <version>1.4</version>
        </dependency>
    </dependencies>
    <build>
        <finalName>lyriccrawler</finalName>
        <resources>
            <!-- Declaring <resources> replaces Maven's default, so list
                 src/main/resources explicitly to keep it on the classpath. -->
            <resource>
                <directory>src/main/resources</directory>
            </resource>
            <!-- Also pick up config files kept next to the Java sources. -->
            <resource>
                <directory>src/main/java</directory>
                <includes>
                    <include>*.xml</include>
                    <include>*.properties</include>
                    <include>*.tld</include>
                    <include>*.txt</include>
                    <include>*.cfg</include>
                    <include>**/**/**/*.xml</include>
                    <include>**/**/**/**/*.xml</include>
                </includes>
            </resource>
        </resources>
    </build>
 
</project>

工程源代码下载地址

https://github.com/airujingye/lyriccrawler

————————————————

版权声明:本文为CSDN博主「静叶01」的原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接及本声明。

原文链接:https://blog.csdn.net/airujingye/article/details/70249094