用HTMLParser过滤掉html中所有标签,留下标题正文等内容,java
public class lable_del implements Runnable { private static String fileName; private static lable...
/**
 * Walks a directory tree and, for every regular file found, writes the text
 * that {@code StringBean} extracts from it into a numbered result file
 * ({@code E:\结果\<m>.txt}).
 *
 * <p>Each file is processed on its own thread, but {@link #getFile(File)}
 * joins that thread before moving on, so files are handled one at a time.
 */
public class lable_del implements Runnable {
    // Not referenced anywhere else in this class; kept for compatibility.
    private static String fileName;
    private static lable_del tst;
    // Assigned by the constructor; not read anywhere else in this class.
    private File file = null;
    /** Location handed to {@code StringBean.setURL} by {@link #getText()}. */
    public String k;
    /** Index used to name the next result file; advanced by {@link #getFile(File)}. */
    public static int m = 0;

    /**
     * Creates a task for the given file.
     *
     * @param file the file this task was created for
     */
    public lable_del(File file) {
        this.file = file;
    }

    /**
     * Writes the text returned by {@link #getText()} to {@code E:\结果\<m>.txt},
     * unless that file already exists. Any exception is printed and swallowed
     * so that one bad input does not stop the traversal.
     */
    @Override
    public void run() {
        try {
            File ff = new File("E:\\", "结果\\" + m + ".txt");
            System.out.print(m + "{}");
            if (!ff.exists()) {
                ff.createNewFile();
                // try-with-resources closes the whole writer chain even when
                // getText() or write() throws.
                try (BufferedWriter bw = new BufferedWriter(
                        new OutputStreamWriter(new FileOutputStream(ff)))) {
                    bw.write(getText());
                }
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Runs a {@code StringBean} over the location stored in {@link #k}, with
     * non-breaking-space replacement and collapsing switched on.
     *
     * @return whatever {@code StringBean.getStrings()} yields for {@link #k}
     * @throws ParserException declared by the original signature; presumably
     *         raised by the HTMLParser library on a parse failure
     */
    public String getText() throws ParserException {
        StringBean sb = new StringBean();
        sb.setReplaceNonBreakingSpaces(true);
        sb.setCollapse(true);
        sb.setURL(k);
        return sb.getStrings();
    }

    /**
     * Recursively processes every regular file under {@code f}.
     *
     * <p>Returns without doing anything when {@code f} cannot be listed, and
     * stops early (leaving the interrupt flag set) when the calling thread is
     * interrupted.
     *
     * @param f directory to traverse
     */
    public static void getFile(File f) {
        File[] name = f.listFiles();
        if (name == null) {
            // listFiles() returns null when f is not a directory or an I/O
            // error occurs; the original code would have thrown a
            // NullPointerException here.
            return;
        }
        for (File entry : name) {
            if (Thread.currentThread().isInterrupted()) {
                // A nested call was interrupted; do not start further work.
                return;
            }
            if (entry.isDirectory()) {
                String s = f.getPath() + "\\" + entry;
                System.out.println("123" + s);
                System.out.println(entry);
                getFile(entry);
            } else {
                System.out.println(entry.getAbsolutePath());
                lable_del task = new lable_del(entry);
                task.k = entry.getAbsolutePath();
                System.out.println(task.k);
                System.out.println(name.length);
                Thread t = new Thread(task);
                t.start();
                try {
                    // Wait for the worker so that m is stable while it runs.
                    t.join();
                    m = m + 1;
                } catch (InterruptedException ex) {
                    Logger.getLogger(lable_del.class.getName()).log(Level.SEVERE, null, ex);
                    // Restore the flag and stop: continuing would start another
                    // worker that reuses the same, un-advanced index m.
                    Thread.currentThread().interrupt();
                    return;
                }
            }
        }
    }

    /**
     * Entry point: processes everything under {@code E:\爬虫网页}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        File file = new File("E:\\爬虫网页");
        getFile(file);
    }
}
这个是我们的代码,但是处理结果只能去掉标签,还留下一大堆的东西,怎样修改才能只提取HTML里的标题和正文呢?我们针对的是新浪里的新闻,我知道理论上来说是提取<title>与</title>、<content>与</content>之间的内容,大神啊,具体的代码我不会写!!求帮助! 展开
// Not referenced by any of the methods visible below.
private static String fileName;
// Not referenced by the methods visible below; getFile declares its own local named tst.
private static lable_del tst;
// Set by the constructor; not read by any of the methods visible below.
private File file = null;
// Location passed to StringBean.setURL in getText(); assigned by getFile.
public String k;
// Index used in the result file name built by run(); incremented by getFile after each join.
public static int m = 0;
/**
 * Creates a task and remembers the file it was created for.
 *
 * @param file the file associated with this task
 */
public lable_del(final File file) { this.file = file; }
/**
 * Writes the text returned by getText() to E:\结果\<m>.txt, unless that file
 * already exists. Any exception is printed and swallowed so that one bad
 * input does not stop the caller.
 */
public void run() {
    try {
        File ff = new File("E:\\", "结果\\" + m + ".txt");
        System.out.print(m + "{}");
        if (!ff.exists()) {
            ff.createNewFile();
            // try-with-resources closes the whole writer chain even when
            // getText() or write() throws; the original leaked it in that case.
            try (BufferedWriter bw = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(ff)))) {
                bw.write(getText());
            }
        }
    } catch (Exception ex) {
        ex.printStackTrace();
    }
}
/**
 * Runs a StringBean over the location stored in k, with non-breaking-space
 * replacement and collapsing switched on, and returns the bean's strings.
 *
 * @return the value of StringBean.getStrings() for k
 * @throws ParserException declared by the signature; presumably raised by
 *         the HTMLParser library on a parse failure
 */
public String getText() throws ParserException {
    final StringBean extractor = new StringBean();
    extractor.setReplaceNonBreakingSpaces(true);
    extractor.setCollapse(true);
    extractor.setURL(k);
    final String extracted = extractor.getStrings();
    return extracted;
}
/**
 * Recursively processes every regular file under f: each file is handed to
 * a lable_del task that runs on its own thread, and the thread is joined
 * before the next file is started.
 *
 * Returns without doing anything when f cannot be listed, and stops early
 * (leaving the interrupt flag set) when the calling thread is interrupted.
 *
 * @param f directory to traverse
 */
public static void getFile(File f)
{
    File[] name = f.listFiles();
    if (name == null) {
        // listFiles() returns null when f is not a directory or an I/O error
        // occurs; the original code would have thrown a NullPointerException.
        return;
    }
    for (File entry : name) {
        if (Thread.currentThread().isInterrupted()) {
            // A nested call was interrupted; do not start further work.
            return;
        }
        if (entry.isDirectory()) {
            String s = f.getPath() + "\\" + entry;
            System.out.println("123" + s);
            System.out.println(entry);
            getFile(entry);
        } else {
            System.out.println(entry.getAbsolutePath());
            lable_del task = new lable_del(entry);
            task.k = entry.getAbsolutePath();
            System.out.println(task.k);
            System.out.println(name.length);
            Thread t = new Thread(task);
            t.start();
            try {
                // Wait for the worker so that m is stable while it runs.
                t.join();
                m = m + 1;
            } catch (InterruptedException ex) {
                Logger.getLogger(lable_del.class.getName()).log(Level.SEVERE, null, ex);
                // Restore the flag and stop: continuing would start another
                // worker that reuses the same, un-advanced index m.
                Thread.currentThread().interrupt();
                return;
            }
        }
    }
}
/**
 * Entry point: hands the directory E:\爬虫网页 to getFile.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    getFile(new File("E:\\爬虫网页"));
}
}
这个是我们的代码,但是处理结果只能去掉标签,还留下一大堆的东西,怎样修改才能只提取HTML里的标题和正文呢?我们针对的是新浪里的新闻,我知道理论上来说是提取<title>与</title>、<content>与</content>之间的内容,大神啊,具体的代码我不会写!!求帮助! 展开
推荐律师服务:
若未解决您的问题,请您详细描述您的问题,通过百度律临进行免费专业咨询