Linux下Java语言实现简陋Web爬虫


Linux环境下Java语言实现简陋Web爬虫:

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.Writer;
import java.net.Socket;
import java.net.UnknownHostException;
import java.nio.charset.StandardCharsets;

/**
 * A minimal web crawler: opens a raw TCP connection to a fixed host, issues a
 * single HTTP GET for the root page, prints the returned markup to standard
 * output and stores it in a local file.
 *
 * <p>Only the part of the response starting at the first {@code '<'} character
 * is kept, which drops the status line and headers. The response body is not
 * otherwise parsed (for example, a chunked transfer encoding is not decoded).
 */
public class WebCrawler {

    /**
     * File that receives the downloaded page. It is best to create the parent
     * directory before running; the file itself is created or overwritten.
     */
    private static final String TEXT_FILE_PATH = "/home/zms/htmldoc/htmldoc1.html";

    /** Host to download from; also sent as the HTTP {@code Host} header. */
    private static final String HOST = "ubuntuone.cn";

    /** TCP port of the web server. */
    private static final int PORT = 80;

    /**
     * Downloads the page, prints it to the console and writes it to
     * {@link #TEXT_FILE_PATH}. On failure an error message and stack trace are
     * printed and the JVM exits with status 1.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        try {
            // Download first so that a failed request does not truncate an
            // existing output file.
            String page = download(HOST, PORT);

            // Show the page text on the console.
            System.out.println(page);

            // try-with-resources closes the file even if the write fails.
            try (Writer fileWriter = new OutputStreamWriter(
                    new FileOutputStream(new File(TEXT_FILE_PATH)), StandardCharsets.UTF_8)) {
                fileWriter.write(page);
            }
        } catch (UnknownHostException e) {
            System.err.println("无法访问您指定的主机。");
            e.printStackTrace();
            System.exit(1);
        } catch (IOException e) {
            System.err.println("下载失败,请检查输入地址是否正确。");
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Sends {@code GET / HTTP/1.1} to the given server and returns the response
     * text starting at the first {@code '<'}.
     *
     * @param host server name to connect to and to send in the Host header
     * @param port TCP port to connect to
     * @return the response text from the first {@code '<'} onwards, or an empty
     *         string if the response contains no {@code '<'}
     * @throws UnknownHostException if the host name cannot be resolved
     * @throws IOException if connecting, sending or receiving fails
     */
    private static String download(String host, int port) throws IOException {
        // Closing the socket also closes both of its streams.
        try (Socket socket = new Socket(host, port)) {
            Writer request = new OutputStreamWriter(
                    socket.getOutputStream(), StandardCharsets.US_ASCII);
            // HTTP lines must end with CRLF regardless of the platform line
            // separator, and the Host header must name the server contacted.
            request.write("GET / HTTP/1.1\r\n"
                    + "Host: " + host + "\r\n"
                    + "Connection: Close\r\n"
                    + "\r\n");
            request.flush();

            // Assumes the page is UTF-8 encoded; the charset declared by the
            // server is not inspected.
            Reader response = new BufferedReader(
                    new InputStreamReader(socket.getInputStream(), StandardCharsets.UTF_8));
            return readFromFirstTag(response);
        }
    }

    /**
     * Reads the stream to its end and returns everything from the first
     * {@code '<'} character onwards.
     *
     * @param in character stream to consume
     * @return the text from the first {@code '<'} to end of stream, or an empty
     *         string if no {@code '<'} occurs
     * @throws IOException if reading fails
     */
    private static String readFromFirstTag(Reader in) throws IOException {
        StringBuilder content = new StringBuilder(8096);

        // Blocking reads: read() waits for data and returns -1 at end of
        // stream, so no polling of ready() is needed.
        int ch = in.read();
        while (ch != -1 && ch != '<') {
            ch = in.read();
        }
        while (ch != -1) {
            content.append((char) ch);
            ch = in.read();
        }
        return content.toString();
    }

}

相关内容