Goal: grab the words from a target vocabulary-memorization site. I wrote a simple little crawler, using JDK 11.
With that, the plan is clear!
Step 1: open the fridge door... oops, wrong script, sorry!!
Step 1: call the login endpoint and get the sessionid!
Step 2: take the sessionid to the word-list page, grab the body, parse it into a Document, and start "borrowing" the words!
So easy, right?
package com.***;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.util.HashMap;

/**
 * @author jqw1122@foxmail.com
 * @description crawl, crawl, crawl
 * @date 2/23/2019 17:14
 */
public class Crawler {

    @Test
    public void crawler() {
        String loginUrl = "http://www.cikuang.me/login";
        String formBody = "username=jqw1122@foxamil.com&password=qweqwe123";
        String wordSetUrl = "http://www.cikuang.me/member/learningset?id=4573";

        HttpClient httpClient = HttpClient.newBuilder().build();
        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create(loginUrl))
                .header("Content-Type", "application/x-www-form-urlencoded")
                .POST(HttpRequest.BodyPublishers.ofString(formBody))
                .build();

        httpClient.sendAsync(request, HttpResponse.BodyHandlers.ofString())
                .thenApply(HttpResponse::headers)
                .thenAccept(headers -> {
                    // So many cookies... all I want is the sid, dammit!
                    var cookieMap = new HashMap<String, String>();
                    headers.map().get("set-cookie").forEach(c -> {
                        String[] split = c.split(";");
                        for (String s : split) {
                            String[] split1 = s.split("=");
                            if (split1.length == 2) cookieMap.put(split1[0], split1[1]);
                        }
                    });

                    // Take the sid over to the word-list page
                    String cookieSid = cookieMap.get("sid");
                    HttpRequest request2 = HttpRequest.newBuilder()
                            .uri(URI.create(wordSetUrl))
                            .header("Content-Type", "application/x-www-form-urlencoded")
                            .header("Cookie", "sid=" + cookieSid)
                            .GET()
                            .build();
                    httpClient.sendAsync(request2, HttpResponse.BodyHandlers.ofString())
                            .thenApply(HttpResponse::body)
                            .thenAccept(htmlString -> {
                                // Parse the body into a Document, the better to "borrow" from...
                                Document htmlDocument = Jsoup.parse(htmlString);
                                // Grab the word table by its id
                                Element wordListTable = htmlDocument.getElementById("wordListTable");
                                Elements trs = wordListTable.getElementsByTag("tr");
                                trs.forEach(t -> {
                                    Elements tds = t.children();
                                    String en = tds.get(0).child(0).text();
                                    String cn = tds.get(1).text();
                                    System.out.println("word---->>> " + en + ":" + cn);
                                });
                            }).join();
                }).join();
    }
}
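By the way, the manual Set-Cookie parsing above can be skipped entirely: JDK 11's HttpClient accepts a CookieHandler, and a plain java.net.CookieManager will store the sid from the login response and replay it on the next request by itself. A minimal sketch against the same endpoints (credentials elided, synchronous send for brevity), not part of the original post:

import java.net.CookieManager;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class CookieDemo {
    public static void main(String[] args) throws Exception {
        // The CookieManager stores whatever Set-Cookie headers the login
        // response carries (sid included) and replays them automatically.
        HttpClient client = HttpClient.newBuilder()
                .cookieHandler(new CookieManager())
                .build();

        HttpRequest login = HttpRequest.newBuilder()
                .uri(URI.create("http://www.cikuang.me/login"))
                .header("Content-Type", "application/x-www-form-urlencoded")
                .POST(HttpRequest.BodyPublishers.ofString("username=...&password=..."))
                .build();
        client.send(login, HttpResponse.BodyHandlers.ofString());

        // No manual Cookie header: the stored sid rides along on its own.
        HttpRequest wordPage = HttpRequest.newBuilder()
                .uri(URI.create("http://www.cikuang.me/member/learningset?id=4573"))
                .GET()
                .build();
        String html = client.send(wordPage, HttpResponse.BodyHandlers.ofString()).body();
        System.out.println(html.length());
    }
}

The trade-off is control: the manual version sends exactly one cookie, while CookieManager replays everything the server set.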
Crawler 2: Goal: download the audio for every TOEFL listening practice question on KMF.
package com.***;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.junit.Test;

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.util.ArrayList;
import java.util.List;

/**
 * @author jqw1122@foxmail.com
 * @description
 * @date 2/23/2019 17:14
 */
public class Crawler {

    @Test
    public void crawlerKMF() {
        String mainUrl = "https://toefl.kmf.com"; // site root, for resolving relative detail links
        String mainUrl1 = "https://toefl.kmf.com/listen/ets/order/";
        String localFilePath = "C:\\kmf_audio\\";

        HttpClient httpClient = HttpClient.newBuilder().build();
        List<String> detailUrlList = new ArrayList<>();

        // Walk the listing pages: i = 0..5, j = 1..4
        e:
        for (int i = 0; i <= 5; i++) {
            for (int j = 1; j <= 4; j++) {
                String url = mainUrl1 + i + "/0/" + j;
                HttpRequest request = HttpRequest.newBuilder()
                        .uri(URI.create(url))
                        .header("Content-Type", "application/x-www-form-urlencoded")
                        .GET()
                        .build();
                httpClient.sendAsync(request, HttpResponse.BodyHandlers.ofString())
                        .thenApply(HttpResponse::body)
                        .thenAccept(bodyString -> {
                            Document htmlDocument = Jsoup.parse(bodyString);
                            // Each practice item links to its detail page via an <a> with this class
                            Elements elements = htmlDocument.getElementsByAttributeValue("class", "check-links js-check-link");
                            elements.forEach(tagA -> detailUrlList.add(tagA.attr("href")));
                            System.out.println("page detail number: " + elements.size());
                        }).join();
                // test
                // if (1 == 1) break e;
            }
        }
        System.out.println("page/file number: " + detailUrlList.size());

        var fileList = new ArrayList<String>(); // to hold the audio file URLs (continued in the sketch below)
    }
}
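The code above stops right after fileList is declared, but the result below shows the downloads did go through. Here is a minimal sketch of what that last step could look like. It is a reconstruction, not the author's original code: the data-url attribute selector is a guess (KMF's real markup may name it differently), the downloadAudio helper is hypothetical, and it assumes the collected hrefs are relative paths under mainUrl.

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;

public class AudioDownloader {

    // Hypothetical reconstruction of the truncated download step.
    static void downloadAudio(HttpClient httpClient, List<String> detailUrlList,
                              String mainUrl, String localFilePath) throws Exception {
        var fileList = new ArrayList<String>();

        // Step 1: visit each detail page and pull out its audio URL.
        for (String href : detailUrlList) {
            HttpRequest request = HttpRequest.newBuilder()
                    .uri(URI.create(mainUrl + href)) // hrefs were collected as relative paths
                    .GET()
                    .build();
            String html = httpClient.send(request, HttpResponse.BodyHandlers.ofString()).body();
            Document doc = Jsoup.parse(html);
            // ASSUMPTION: the player element carries the mp3 URL in a data-url attribute.
            String audioUrl = doc.getElementsByAttribute("data-url").attr("data-url");
            if (!audioUrl.isEmpty()) fileList.add(audioUrl);
        }

        // Step 2: stream each mp3 straight to disk with the JDK's file body handler.
        Files.createDirectories(Path.of(localFilePath));
        for (String audioUrl : fileList) {
            String fileName = audioUrl.substring(audioUrl.lastIndexOf('/') + 1);
            HttpRequest request = HttpRequest.newBuilder().uri(URI.create(audioUrl)).GET().build();
            httpClient.send(request, HttpResponse.BodyHandlers.ofFile(Path.of(localFilePath, fileName)));
            System.out.println("saved: " + fileName);
        }
    }
}

One nice property of BodyHandlers.ofFile: the response body is written straight to disk, so even large audio files never have to fit in memory.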
The downloads succeeded....