今天偶然间看到一篇关于 Java 爬虫入门的博客,想到以前就学过一点爬虫,于是乎就在博客的基础上写了一个 demo,用来爬取慕课网的实战课程。
首先需要向目标网页发送 HTTP 请求并读取返回的页面内容,这里用到了 HttpURLConnection 类,具体如下:
package util;
import java.io.BufferedReader; import java.io.InputStream; import java.io.InputStreamReader; import java.net.HttpURLConnection; import java.net.URL;
/**
 * Minimal HTTP helper that downloads the body of a page as a single string.
 */
public class ConnectionUtil {

    /**
     * Fetches {@code address} over HTTP(S) and returns the response body.
     *
     * <p>Lines are concatenated <em>without</em> line separators, so callers can run
     * regular expressions that do not use DOTALL across what were originally several lines.
     *
     * @param address absolute URL of the page to download
     * @return the body read so far (never {@code null}); the empty string when the URL is
     *         {@code null}, malformed, not an HTTP URL, or the request fails before any
     *         data is read
     */
    public static String Connect(String address) {
        StringBuilder body = new StringBuilder();
        if (address == null) {
            return body.toString();
        }

        HttpURLConnection conn = null;
        try {
            conn = (HttpURLConnection) new URL(address).openConnection();
            conn.connect();
            // Decode explicitly as UTF-8: the platform default charset (e.g. GBK on some
            // Windows installs) would garble non-ASCII text in the page.
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                    conn.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    body.append(line);
                }
            }
        } catch (Exception e) {
            // Boundary catch: a bad URL, a non-HTTP protocol, or an I/O error (such as a 404)
            // must not crash the caller; report it and fall through with what we have.
            e.printStackTrace();
        } finally {
            // conn is still null when the URL itself could not be parsed or opened.
            if (conn != null) {
                conn.disconnect();
            }
        }
        return body.toString();
    }
}
|
接下来就是利用正则表达式解析 HTML,从页面中提取课程标题和售价:
package util;
import pojo.Course;
import java.util.regex.Matcher; import java.util.regex.Pattern;
/**
 * Downloads a course page and extracts the course title and sale price from its HTML
 * using regular expressions.
 */
public class Analyze {

    /** Matches from the "title-box" marker up to the first closing h1 tag. */
    private static final Pattern TITLE_PATTERN = Pattern.compile("title-box.*?</h1>");

    /** Matches from the "sale-price" marker up to the first closing div tag. */
    private static final Pattern SALE_PRICE_PATTERN = Pattern.compile("sale-price.*?</div>");

    /**
     * Fetches the page for the given course id and parses it.
     *
     * @param uid course id used to build the page URL; must be a decimal integer
     * @return the parsed course, or {@code null} when the page could not be downloaded or
     *         contains no parseable sale price
     * @throws NumberFormatException if a sale price was found but {@code uid} is not an integer
     */
    public Course regexMain(String uid) {
        String url = "https://coding.imooc.com/class/" + uid + ".html";
        String html = ConnectionUtil.Connect(url);
        if (html == null || html.isEmpty()) {
            return null;
        }
        return getCourseInfo(html, uid);
    }

    /**
     * Builds a {@code Course} from the page HTML.
     *
     * @param targetStr page HTML with line breaks already removed
     * @param uid       course id as text
     * @return the course, or {@code null} when no parseable sale price is present
     */
    private Course getCourseInfo(String targetStr, String uid) {
        Double salePrice = extractSalePrice(targetStr);
        if (salePrice == null) {
            // No usable price: treat the page as "not a course page".
            return null;
        }

        Course parsed = new Course();
        String title = extractTitle(targetStr);
        if (title != null) {
            parsed.setTitle(title);
        }
        parsed.setSalePrice(salePrice);
        parsed.setId(Integer.parseInt(uid));
        return parsed;
    }

    /** Returns the text of the h1 element inside the title box, or null if it is absent. */
    private static String extractTitle(String html) {
        Matcher matcher = TITLE_PATTERN.matcher(html);
        if (!matcher.find()) {
            return null;
        }
        String fragment = matcher.group();
        int open = fragment.indexOf("<h1");
        if (open < 0) {
            return null;
        }
        // Skip to the end of the opening tag so an h1 carrying attributes is handled too.
        int contentStart = fragment.indexOf('>', open);
        int contentEnd = fragment.indexOf("</h1>");
        if (contentStart < 0 || contentStart + 1 > contentEnd) {
            return null;
        }
        String text = fragment.substring(contentStart + 1, contentEnd).trim();
        return text.replace("<br/>", " ");
    }

    /**
     * Returns the number that follows the currency sign in the sale-price element, or null
     * when the element, the currency sign, or a parseable number is missing.
     */
    private static Double extractSalePrice(String html) {
        Matcher matcher = SALE_PRICE_PATTERN.matcher(html);
        if (!matcher.find()) {
            return null;
        }
        String fragment = matcher.group();
        int symbol = fragment.indexOf("¥");
        if (symbol < 0) {
            return null;
        }
        int end = fragment.indexOf("</div>", symbol);
        if (end < 0) {
            return null;
        }
        try {
            return Double.parseDouble(fragment.substring(symbol + 1, end).trim());
        } catch (NumberFormatException ignored) {
            // Price text is not a plain number (e.g. nested markup): report "no price"
            // instead of aborting the whole crawl.
            return null;
        }
    }
}
|
Course 类如下:
package pojo;
/**
 * Plain data holder describing one course: its id, title, prices and lecturer.
 */
public class Course {

    private int id;
    private String title;
    private double oriPrice;
    private double salePrice;
    private String lecturer;

    /** @return the course id */
    public int getId() {
        return this.id;
    }

    /** @param id the course id */
    public void setId(int id) {
        this.id = id;
    }

    /** @return the course title, or {@code null} if none was set */
    public String getTitle() {
        return this.title;
    }

    /** @param title the course title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the original price */
    public double getOriPrice() {
        return this.oriPrice;
    }

    /** @param oriPrice the original price */
    public void setOriPrice(double oriPrice) {
        this.oriPrice = oriPrice;
    }

    /** @return the sale price */
    public double getSalePrice() {
        return this.salePrice;
    }

    /** @param salePrice the sale price */
    public void setSalePrice(double salePrice) {
        this.salePrice = salePrice;
    }

    /** @return the lecturer, or {@code null} if none was set */
    public String getLecturer() {
        return this.lecturer;
    }

    /** @param lecturer the lecturer */
    public void setLecturer(String lecturer) {
        this.lecturer = lecturer;
    }

    /**
     * Renders the id, sale price and title; the original price and lecturer are not included.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Course{");
        text.append("id=").append(id);
        text.append(", salePrice=").append(salePrice).append("\t");
        text.append(", title='").append(title).append('\'');
        text.append('}');
        return text.toString();
    }
}
|
测试一下,依次抓取 id 为 1 到 400 的课程页面,并打印解析成功的课程:
import java.util.*;
public class Main { public static void main(String[] args) { List<Course> courses = new ArrayList<>(); Analyze analyze = new Analyze(); for (int i = 1; i <= 400; i++) { Course course = analyze.regexMain(String.valueOf(i)); if (course != null) { courses.add(course); System.out.println("发现课程\tid:" + course.getId() + "\ttitle:" + course.getTitle()); } } for (Course cours : courses) { System.out.println(cours); } } }
|
参考博客:Java爬虫入门笔记