文档章节

LagouHtmlParser

 小猪皮杰
发布于 2017/02/07 23:03
字数 686
阅读 6
收藏 0
package htmlpaser;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.*;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.TextExtractingVisitor;

import java.io.*;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * Extracts a handful of fields (link, title, company, requirements, publish time,
 * description, address) from saved Lagou job pages and writes them out as plain text.
 *
 * All extraction methods are static and take the full HTML of one page as a string.
 * Methods that cannot find their target return an empty string instead of throwing.
 *
 * Created by 林志杰 on 2017/2/7.
 */
public class LagouHtmlParser {

    /** Canonical job URL; group 1 is the whole URL, group 2 the numeric job id. */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<link rel=\"canonical\" href=\"(https?://www\\.lagou\\.com/jobs/(\\d{5,7})\\.html)\">");

    /** Company block; reluctant so that it stops at the first closing div on the line. */
    private static final Pattern COMPANY_PATTERN = Pattern.compile("<div class=\"company\">(.*?)</div>");

    /** Labels prepended, in order, to the spans collected by {@link #getRequest(String)}. */
    private static final String[] REQUEST_LABELS = { "薪酬:","城市:","经验:","学历:","性质:"};

    /** Text that follows the publish time inside the publish_time paragraph. */
    private static final String PUBLISH_SUFFIX = "发布于拉勾网";

    /** Heading text that precedes the job description body. */
    private static final String DESCRIPTION_HEADING = "职位描述";

    /** Link text that follows the address inside the work_addr block. */
    private static final String MAP_LINK_TEXT = "查看地图";

    private static final String HTML_SUFFIX = ".html";
    private static final String DEFAULT_HTML_DIR = "F:\\Javaweb\\LuceneTest\\html";
    private static final String DEFAULT_TXT_DIR = "F:\\Javaweb\\LuceneTest\\txt";

    /**
     * Returns the canonical job URL declared in the page's {@code <link rel="canonical">} tag.
     *
     * @param content full HTML of the page
     * @return the URL, or an empty string when no matching tag is present
     */
    public static String getLink(String content){
        Matcher matcher = LINK_PATTERN.matcher(content);
        return matcher.find() ? matcher.group(1) : "";
    }

    /**
     * Returns the text of the first {@code <title>} element.
     *
     * @param content full HTML of the page
     * @return the title text, or an empty string when the page has no title element
     * @throws ParserException if the HTML parser fails
     */
    public static String getTitle(String content) throws ParserException {
        NodeList titles = new Parser(content).parse(new TagNameFilter("title"));
        if (titles.size() == 0) {
            return "";
        }
        return extractText(titles.elementAt(0).toHtml());
    }

    /**
     * Returns whatever sits between {@code <div class="company">} and the next
     * {@code </div>} on the same line (markup inside is not stripped).
     *
     * @param content full HTML of the page
     * @return the captured text, or an empty string when there is no match
     */
    public static String getCompany(String content){
        Matcher matcher = COMPANY_PATTERN.matcher(content);
        return matcher.find() ? matcher.group(1) : "";
    }

    /**
     * Builds the labelled requirement lines from the first {@code <span>} elements
     * whose parent is a {@code <p>}. At most five spans are used, one per label;
     * every "/" is removed from the span text.
     *
     * @param content full HTML of the page
     * @return one "label + text + newline" entry per span found; possibly empty.
     *         If the parser fails part-way, the entries built so far are returned.
     */
    public static String getRequest(String content){
        StringBuilder request = new StringBuilder();
        try {
            NodeFilter spanInParagraph = new AndFilter(
                    new TagNameFilter("span"), new HasParentFilter(new TagNameFilter("p")));
            NodeList spans = new Parser(content).parse(spanInParagraph);
            // Never read past the spans that actually matched.
            int count = Math.min(REQUEST_LABELS.length, spans.size());
            for (int i = 0; i < count; i++) {
                String text = extractText(spans.elementAt(i).toHtml()).replace("/", "");
                request.append(REQUEST_LABELS[i]).append(text).append("\n");
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return request.toString();
    }

    /**
     * Returns the text of the first {@code <p class="publish_time">} element, cut off
     * before the last occurrence of the "发布于拉勾网" suffix when that suffix is present.
     *
     * @param content full HTML of the page
     * @return the publish-time text, or an empty string when the element is missing
     *         or the parser fails
     */
    public static String getPublishTime(String content){
        try {
            NodeFilter filter = new AndFilter(
                    new TagNameFilter("p"), new HasAttributeFilter("class","publish_time"));
            NodeList nodes = new Parser(content).parse(filter);
            if (nodes.size() == 0) {
                return "";
            }
            String text = extractText(nodes.elementAt(0).toHtml());
            int suffixAt = text.lastIndexOf(PUBLISH_SUFFIX);
            // Keep the whole text when the suffix is absent instead of failing on substring(0, -1).
            return suffixAt >= 0 ? text.substring(0, suffixAt) : text;
        } catch (ParserException e) {
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Returns the text of all {@code <dd class="job_bt">} elements, passed through
     * {@code StringUtils.replaceBlank}, with everything up to and including the last
     * "职位描述" heading removed when that heading is present.
     *
     * @param content full HTML of the page
     * @return the trimmed description, or an empty string when no element matches
     *         or the parser fails
     */
    public static String getDescription(String content){
        String description = "";
        try {
            NodeFilter filter = new AndFilter(
                    new TagNameFilter("dd"), new HasAttributeFilter("class","job_bt"));
            NodeList nodes = new Parser(content).parse(filter);
            if (nodes.size() > 0) {
                // NOTE(review): replaceBlank is a project helper not visible here; presumably strips whitespace.
                description = StringUtils.replaceBlank(extractText(nodes.toHtml()));
                int headingAt = description.lastIndexOf(DESCRIPTION_HEADING);
                // Only skip the heading when it was actually found.
                if (headingAt >= 0) {
                    description = description.substring(headingAt + DESCRIPTION_HEADING.length());
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return description.trim();
    }

    /**
     * Returns the text of all {@code <div class="work_addr">} elements, passed through
     * {@code StringUtils.replaceBlank}, cut off before the first "查看地图" when present.
     * The resulting address is also printed to standard output.
     *
     * @param content full HTML of the page
     * @return the address, or an empty string when no element matches or the parser fails
     */
    public static String getAddress(String content){
        String address = "";
        try {
            NodeFilter filter = new AndFilter(
                    new TagNameFilter("div"), new HasAttributeFilter("class","work_addr"));
            NodeList nodes = new Parser(content).parse(filter);
            if (nodes.size() > 0) {
                address = StringUtils.replaceBlank(extractText(nodes.toHtml()));
                int mapLinkAt = address.indexOf(MAP_LINK_TEXT);
                if (mapLinkAt >= 0) {
                    address = address.substring(0, mapLinkAt);
                }
                System.out.println(address);
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return address;
    }

    /**
     * Reads one saved page and assembles the labelled plain-text record for it.
     * The extracted link line is also printed to standard output.
     *
     * @param f the HTML file to read (decoded with the platform default charset)
     * @return the record: source, link, title, company, requirement lines,
     *         publish time, description and address, each terminated by a newline
     * @throws IOException if the file cannot be read
     * @throws ParserException if extracting the title fails
     */
    public static String parse(File f) throws IOException, ParserException {
        // Load the whole page; lines are re-joined with "\n".
        // NOTE(review): FileReader uses the platform charset — confirm it matches the saved pages.
        StringBuilder buffer = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(new FileReader(f))) {
            String line;
            while ((line = reader.readLine()) != null) {
                buffer.append(line).append("\n");
            }
        }
        String content = buffer.toString();

        String source = "来源:拉勾网";
        String link = "链接:" + getLink(content);
        String title = "标题:" + getTitle(content);
        String company = "公司:" + getCompany(content);
        String request = getRequest(content);
        String publish_time = "发布时间:" + getPublishTime(content);
        String description = "要求描述:" + getDescription(content);
        String address = "地址:" + getAddress(content);

        System.out.println(link);

        // request already carries its own trailing newline per entry.
        return source + "\n" + link + "\n" + title + "\n" + company + "\n"
                + request + publish_time + "\n" + description + "\n" + address + "\n";
    }

    /**
     * Converts every {@code *.html} file in the input directory into a same-named
     * {@code .txt} file in the output directory.
     *
     * @param args optional: {@code args[0]} input directory, {@code args[1]} output
     *             directory; each falls back to the original hard-coded path
     * @throws ParserException if extracting a page title fails
     */
    public static void main(String[] args) throws ParserException {
        File inputDir = new File(args.length > 0 ? args[0] : DEFAULT_HTML_DIR);
        File outputDir = new File(args.length > 1 ? args[1] : DEFAULT_TXT_DIR);

        // listFiles() returns null when the path is not a directory or cannot be read.
        File[] pages = inputDir.listFiles();
        if (pages == null) {
            System.err.println("Not a readable directory: " + inputDir);
            return;
        }

        for (File page : pages) {
            String fullName = page.getName();
            int suffixAt = fullName.lastIndexOf(HTML_SUFFIX);
            // Skip sub-directories and names without ".html" rather than failing on them.
            if (!page.isFile() || suffixAt < 0) {
                continue;
            }
            try {
                System.out.println("+++++++++++++++++++++++++++++++++++");
                // Extract the record for this page.
                String content = parse(page);
                // File name with the ".html" part removed, reused for the TXT file.
                String baseName = fullName.substring(0, suffixAt);
                File target = new File(outputDir, baseName + ".txt");
                // Write the record; the writer is closed even if the write fails.
                try (FileWriter writer = new FileWriter(target)) {
                    writer.write(content);
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /** Parses an HTML fragment and returns the text collected by a TextExtractingVisitor. */
    private static String extractText(String html) throws ParserException {
        Parser parser = new Parser(html);
        TextExtractingVisitor visitor = new TextExtractingVisitor();
        parser.visitAllNodesWith(visitor);
        return visitor.getExtractedText();
    }

}

 

© 著作权归作者所有

粉丝 0
博文 9
码字总数 1020
作品 0
广州
私信 提问

暂无文章

PHP7安装开启mysqli扩展

先用phpinfo()看看有没有mysqli扩展,没有的话先安装, 1,进去PHP安装目录下的ext/mysqli文件夹  cd /usr/local/php/ext/mysqli 2, /usr/local/php/bin/phpize 3, ./configure --with-p...

菜鸟要飞啊
12分钟前
4
0
vue-cli3构建TS项目(基础篇)

https://blog.csdn.net/qq_30669833/article/details/90487700

小鱼蕾蕾
26分钟前
3
0
2019完结篇!一张图带你了解文档管理API套包Aspose.Total完整产品线!

时间一晃,2019年即将结束了,Aspose公司在这一年又不断更新和研发,发布了好几款全新的文件格式API以及适用于更多新的语言平台,满足了更多用户的需求。 Aspose是全球领先的图表控件开发商,...

mnrssj
31分钟前
4
0
vue中eventBus的使用

使用场景: 1、兄弟组件的通信,父子组件的通信 2、不同路由的通信 针对兄弟组件的通信,父子组件的通信 新建bus.js文件 import Vue from 'vue' var bus = new Vue() export default bus 在需...

tianyawhl
42分钟前
5
0
C# DBHelper

using System;using System.Collections.Generic;using System.Linq;using System.Text;using System.Data;using System.Data.SqlClient;using System.Windows.Forms;namesp......

vga
45分钟前
5
0

没有更多内容

加载失败,请刷新页面

加载更多

返回顶部
顶部