新人求助:评论文本乱码问题,大众点评的一段评论文本
使用Jsoap在网络上爬了一段大众点评评论文本,大众点评是UTF-8编码格式,可是为什么出现了许多????[local]1[/local]其中一段文本:
点的外卖在单位吃?? 强烈推荐乾隆白菜 和凉?? 但是烧麦的皮太干了我不喜??
这样的文本完全不能用啊...
求助怎么才能得到正常的文本呢
代码的话:
public class Main{
public static void main(String[] args) throws IOException {
String html,mainUrl;
System.out.println("input URL:");
Scanner scanner = new Scanner(System.in);
mainUrl = scanner.next();
if(mainUrl.contains("http://www.)){
Pattern pattern =("[^0-9]");
Matcher matcher =pattern.matcher(mainUrl);
String tstr = matcher.replaceAll("");
//mainUrl="http://www.
System.out.println(mainUrl);
html = WebReader.get(mainUrl);
Shop shop1= new Shop(html);
String content=shop1.getInfo()+"\r\n"+shop1.getComment();
// content =new String(content.getBytes("UTF-8"),"GBK");
System.out.println(content);
File file =new File(tstr+".txt");
file.createNewFile();
FileOutputStream fileOutputStream =new FileOutputStream(file);
OutputStreamWriter outputStreamWriter = new OutputStreamWriter(fileOutputStream,"GB2312");
outputStreamWriter.write(content);
outputStreamWriter.close();
}
}
}
public class WebReader {
public static String get(String urlStr){
String html="";
try {
URL url = new URL(urlStr);
URLConnection conn = url.openConnection();
conn.setRequestProperty("accept", "*/*");
conn.setRequestProperty("connection", "Keep-Alive");
conn.setRequestProperty("user-agent", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; SV1)");
conn.connect();
BufferedReader in = new BufferedReader(
new InputStreamReader(
conn.getInputStream()));
String inputLine;
while ((inputLine = in.readLine()) != null) html = html + inputLine;
}
catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return html;
}
}
public class Shop{
private Document doc =null;
public Shop(String urlStr){
doc= Jsoup.parse(urlStr);
}
public String getInfo() throws UnsupportedEncodingException {
Elements title = doc.select("h1.shop-title");
String str = title.html()+"\r\n";
Elements elements = doc.select("span.rst");
Iterator iterator = elements.iterator();
while(iterator.hasNext()){
Element elements1 = (Element) iterator.next();
if ("strong".equalsIgnoreCase(elements1.child(0).nodeName())) {
str = str+ elements1.ownText()+elements1.child(0).html()+"\r\n";
}
}
str=new String (str.getBytes(),"UTF-8");
return str;
}
public String getComment() throws UnsupportedEncodingException {
String str="";
Elements body = doc.select("div.content");
Iterator iterator = body.iterator();
Iterator iteratorOfUser = body.iterator();
String tempstr;
while(iterator.hasNext()) {
Element elementOfUser = (Element) iteratorOfUser.next();
String tstr = elementOfUser.child(0).child(0).className();
tempstr = new String(tstr.substring(tstr.indexOf("irr-star") + 8, tstr.indexOf("irr-star") + 9).getBytes(), "UTF-8");
tstr = "commentlevel"+ tempstr + "\r\n";
str = str + tstr;
Element elements1 = (Element) iterator.next();
tstr = elements1.child(1).child(0).ownText() + "\r\n";
str += tstr;
}
//str=new String(str.getBytes(),"UTF-8");
return str;
}
}