diff --git a/src/main/java/fun/ticsmyc/crawler/Tools.java b/src/main/java/fun/ticsmyc/crawler/Tools.java
index c6f81953cccba1207177047f49b65b3584392c77..b824f48218806c1fffbe090fef9d4bdbee4be044 100644
--- a/src/main/java/fun/ticsmyc/crawler/Tools.java
+++ b/src/main/java/fun/ticsmyc/crawler/Tools.java
@@ -1,5 +1,7 @@
 package fun.ticsmyc.crawler;
 
+import com.gargoylesoftware.htmlunit.WebClient;
+import com.gargoylesoftware.htmlunit.html.HtmlPage;
 import org.apache.log4j.Logger;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
@@ -44,13 +46,28 @@ public class Tools {
     }
 
     /**
-     * Fetch the whole HTML page with Jsoup
+     * Run JavaScript with HtmlUnit, then parse the rendered page as XML into a Jsoup Document
      * @param url
      * @return
      */
    public static void getPageByJSoup(String url) {
        try {
-            page = Jsoup.connect(url).get();
+//            page = Jsoup.connect(url).get();
+            // Browser simulated by HtmlUnit; configure CSS/JS support and a few other basic options
+            WebClient browser = new WebClient();
+            browser.getOptions().setCssEnabled(false);
+            browser.getOptions().setJavaScriptEnabled(true);
+            browser.getOptions().setThrowExceptionOnScriptError(false);
+
+            // Fetch the page
+            HtmlPage htmlPage = browser.getPage(url);
+            // Wait for background JavaScript to finish loading
+            browser.waitForBackgroundJavaScript(600);
+
+            // Parse the rendered markup as XML to obtain a Jsoup Document
+            page = Jsoup.parse(htmlPage.asXml());
+            System.out.println(page);
+
        } catch (IOException e) {
            logger.error("Failed to fetch page with Jsoup");
        }
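
For context, here is a minimal standalone sketch of the same technique the patch introduces: let HtmlUnit load the page and execute its JavaScript, then hand the rendered markup to Jsoup. The class name `JsRenderedFetcher` and the choice to return the `Document` (instead of assigning to the `page` field used in `Tools`) are illustrative assumptions, as is closing the client with try-with-resources, which assumes an HtmlUnit release where `WebClient` implements `AutoCloseable` (roughly 2.21+; older versions use `closeAllWindows()`).

```java
package fun.ticsmyc.crawler;

import java.io.IOException;

import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class JsRenderedFetcher {

    /**
     * Loads a page with HtmlUnit so its JavaScript runs, then parses the
     * rendered markup with Jsoup.
     */
    public static Document fetchRendered(String url) throws IOException {
        // try-with-resources closes the simulated browser and its windows when done
        try (WebClient browser = new WebClient()) {
            browser.getOptions().setCssEnabled(false);                  // CSS is not needed for scraping
            browser.getOptions().setJavaScriptEnabled(true);            // execute the page's scripts
            browser.getOptions().setThrowExceptionOnScriptError(false); // tolerate broken third-party JS

            HtmlPage htmlPage = browser.getPage(url);
            // give background JavaScript up to 600 ms to finish, mirroring the patch
            browser.waitForBackgroundJavaScript(600);

            // serialize the rendered DOM as XML and re-parse it with Jsoup
            return Jsoup.parse(htmlPage.asXml());
        }
    }
}
```

Closing the `WebClient` keeps the simulated browser from accumulating open windows across repeated calls; the `System.out.println(page)` in the patch is only useful as a temporary debugging aid and could be replaced with a debug-level log.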