Java 爬虫读取网页时,我按照网页指定的字符集进行解析,为什么解析出的 HTML 源代码是乱码?
源文件如下:待解析的网址是:http://jingfengjiayuan2.soufun.com/ public class ParseUrl_Soufun { public sta...
源文件如下:
待解析的网址是:http://jingfengjiayuan2.soufun.com/
public class ParseUrl_Soufun {
public static String getPriceElement(String content) {
StringBuilder houseInfo = new StringBuilder();
String averagePriceRegex = "二 手 房</strong><strong>.*?</a> </strong>套";
String SecondHandRegex = "<span>二手房.*?</span>";
String hireRegex = "<span>租 房.*?</span>";
String PropertyCostsRegex = "费:.*?</li>";
houseInfo.append(furtherProcessing(matchPattern(averagePriceRegex,
content)) + "-");
houseInfo.append(furtherProcessing(matchPattern(SecondHandRegex,
content)) + "-");
houseInfo.append(furtherProcessing(matchPattern(hireRegex, content))
+ "-");
houseInfo.append(furtherProcessing(matchPattern(PropertyCostsRegex,
content)) + "-");
houseInfo.append(CurrentCalendar.getCurrentCalendar());
return houseInfo.toString();
}
public static String matchPattern(String regex, String sourceString) {
final Pattern pa = Pattern.compile(regex, Pattern.CANON_EQ);
final Matcher ma = pa.matcher(sourceString);
String target = "";
while (ma.find()) {
target = (ma.group());
}
return target;
}
public static String furtherProcessing(String patternString) {
if (patternString.contains("<span>租"))
return outTag(patternString).replace("租 房", "").replaceAll("[(|)]",
"");
else if (patternString.contains("均价"))
return patternString.replaceAll("<.*?>", "").replace("均价", "")
.replace("元/平米", "").trim();
else if (patternString.contains("费:"))
return outTag(patternString).replace("费:", "");
else
return patternString.replaceAll("<.*?>", "").replace("(", "")
.replace(")", "").replace("二手房", "");
}
public String getOneHtml(final String htmlurl) throws IOException {
URL url;
String temp;
final StringBuffer sb = new StringBuffer();
try {
url = new URL(htmlurl);
BufferedReader in = new BufferedReader(new InputStreamReader(
url.openStream(), "GB18030"));// 读取网页全部内容
while ((temp = in.readLine()) != null) {
System.out.println(temp);
//
}
in.close();
} catch (final MalformedURLException me) {
System.out.println("你输入的URL格式有问题!请仔细输入");
me.getMessage();
throw me;
} catch (final IOException e) {
e.printStackTrace();
throw e;
}
return sb.toString();
}
public static String outTag(String s) {
String regex = "<.*?>";
return s.replaceAll(regex, "");
}
public static void main(String[] args) throws IOException {
String url = "http://jingfengjiayuan2.soufun.com/";
// System.out.println(getWebContent(url));
// getPriceElement(getWebContent(url));
new ParseUrl_Soufun().getOneHtml(url);
}
} 展开
待解析的网址是:http://jingfengjiayuan2.soufun.com/
public class ParseUrl_Soufun {
public static String getPriceElement(String content) {
StringBuilder houseInfo = new StringBuilder();
String averagePriceRegex = "二 手 房</strong><strong>.*?</a> </strong>套";
String SecondHandRegex = "<span>二手房.*?</span>";
String hireRegex = "<span>租 房.*?</span>";
String PropertyCostsRegex = "费:.*?</li>";
houseInfo.append(furtherProcessing(matchPattern(averagePriceRegex,
content)) + "-");
houseInfo.append(furtherProcessing(matchPattern(SecondHandRegex,
content)) + "-");
houseInfo.append(furtherProcessing(matchPattern(hireRegex, content))
+ "-");
houseInfo.append(furtherProcessing(matchPattern(PropertyCostsRegex,
content)) + "-");
houseInfo.append(CurrentCalendar.getCurrentCalendar());
return houseInfo.toString();
}
public static String matchPattern(String regex, String sourceString) {
final Pattern pa = Pattern.compile(regex, Pattern.CANON_EQ);
final Matcher ma = pa.matcher(sourceString);
String target = "";
while (ma.find()) {
target = (ma.group());
}
return target;
}
public static String furtherProcessing(String patternString) {
if (patternString.contains("<span>租"))
return outTag(patternString).replace("租 房", "").replaceAll("[(|)]",
"");
else if (patternString.contains("均价"))
return patternString.replaceAll("<.*?>", "").replace("均价", "")
.replace("元/平米", "").trim();
else if (patternString.contains("费:"))
return outTag(patternString).replace("费:", "");
else
return patternString.replaceAll("<.*?>", "").replace("(", "")
.replace(")", "").replace("二手房", "");
}
public String getOneHtml(final String htmlurl) throws IOException {
URL url;
String temp;
final StringBuffer sb = new StringBuffer();
try {
url = new URL(htmlurl);
BufferedReader in = new BufferedReader(new InputStreamReader(
url.openStream(), "GB18030"));// 读取网页全部内容
while ((temp = in.readLine()) != null) {
System.out.println(temp);
//
}
in.close();
} catch (final MalformedURLException me) {
System.out.println("你输入的URL格式有问题!请仔细输入");
me.getMessage();
throw me;
} catch (final IOException e) {
e.printStackTrace();
throw e;
}
return sb.toString();
}
public static String outTag(String s) {
String regex = "<.*?>";
return s.replaceAll(regex, "");
}
public static void main(String[] args) throws IOException {
String url = "http://jingfengjiayuan2.soufun.com/";
// System.out.println(getWebContent(url));
// getPriceElement(getWebContent(url));
new ParseUrl_Soufun().getOneHtml(url);
}
} 展开
若以下回答无法解决问题,邀请你更新回答
2个回答
推荐律师服务:
若未解决您的问题,请您详细描述您的问题,通过百度律临进行免费专业咨询
广告 您可能关注的内容 |