| 网站首页 | 业界新闻 | 小组 | 威客 | 人才 | 下载频道 | 博客 | 代码贴 | 在线编程 | 编程论坛
欢迎加入我们,一同切磋技术
用户名:   
 
密 码:  
共有 2951 人关注过本帖
标题:大神帮我看下为什么这样子我采集不到文章呢? 一运行就报错
只看楼主 加入收藏
asd7298183
Rank: 1
等 级:新手上路
帖 子:5
专家分:0
注 册:2014-4-30
结帖率:0
收藏
 问题点数:0 回复次数:2 
大神帮我看下为什么这样子我采集不到文章呢? 一运行就报错
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;

namespace code20
{
    class Program
    {
        /// <summary>
        /// Console entry point with two modes chosen by the line typed at the prompt.
        /// Input "1" (fetch): downloads a list page, collects every href value from it,
        /// and for each value containing "article" downloads that page, pulls out the
        /// title text and the first div with class "content", and writes both to
        /// data\&lt;loop index&gt;.txt under the current directory.
        /// Any other input (process): rewrites every file in the data directory, trimming
        /// the title, replacing href values and stripping the content div tags.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        static void Main(string[] args)
        {


            Console.Write("1:抓取,2:处理:");
            if (Console.ReadLine() == "1")
            {





                // NOTE(review): the literal on the next line has no closing quote or semicolon
                // in this copy; the rest of the URL appears to be missing and must be restored.
                string url = "http://www.
                // html is only referenced by the commented-out code below, so this download is otherwise unused.
                string html=  gethtml(url, Encoding.Default);

                //Regex r = new Regex("(?<=<title>).*?(?=</title>)");// instantiate a regex
                //MatchCollection co=r.Matches(html);// match every occurrence and return a collection
                //Console.WriteLine("标题:" + co[0].Value);// read the text via index plus Value


                //Regex rcontent = new Regex("<div class=\"content\">[\\s\\S]*?</div>");
                //MatchCollection cocontent = rcontent.Matches(html);
                //Console.WriteLine("内容:"+cocontent[0].Value);


                // NOTE(review): this literal is also unterminated in this copy; the list-page URL is missing.
                string listurl = "http://www.



                string listhtml = gethtml(listurl, Encoding.Default);
                //http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?
                // Captures the text between href=" and the next double quote.
                Regex rlist = new Regex("(?<=href=\").*?(?=\")");

                MatchCollection co = rlist.Matches(listhtml);

                for (int i = 0; i < co.Count; i++)
                {

                    // Only href values containing "article" are fetched; all others are skipped.
                    if (co[i].Value.ToString().Contains("article"))
                    {


                        // NOTE(review): the literals on the next line and on the gethtml call below
                        // are unterminated in this copy; presumably a site prefix was concatenated
                        // with the href value - confirm against the original code.
                        Console.WriteLine("http://www. + co[i].Value);
                        Console.WriteLine("抓取内容....");
                        string contenthtml = gethtml("http://www. + co[i].Value, Encoding.Default);

                        Regex r = new Regex("(?<=<title>).*?(?=</title>)");// instantiate a regex
                        MatchCollection cotitlt = r.Matches(contenthtml);// match every occurrence and return a collection

                        // Console.WriteLine("标题:" + cotitlt[0].Value);// read the text via index plus Value


                        Regex rcontent = new Regex("<div class=\"content\">[\\s\\S]*?</div>");
                        MatchCollection cocontent = rcontent.Matches(contenthtml);
                        // Console.WriteLine("内容:" + cocontent[0].Value);
                        // NOTE(review): both [0] lookups throw when the page has no match
                        // (empty MatchCollection); there is no Count check before indexing.
                        string title = cotitlt[0].Value;
                        string content = cocontent[0].Value;
                        Console.WriteLine("保存数据...");
                        string appdir = Directory.GetCurrentDirectory();
                        if (!Directory.Exists(appdir + "\\data"))
                        {
                            Directory.CreateDirectory(appdir + "\\data");

                        }


                        // The file is named after the loop index i, so numbering has gaps
                        // wherever a link was skipped. Title goes on the first line, content after it.
                        File.WriteAllText(appdir + "\\data" + "\\" + i + ".txt", title + "\r\n" + content);
                        Console.WriteLine("保存成功!");



                    }

                }

                // Wait for Enter before the program exits.
                Console.ReadLine();

            }
            else
            {

                string appdir = Directory.GetCurrentDirectory();

                string [] files= Directory.GetFiles(appdir+"\\data");// get every file in the data directory

                foreach(string filename in files)// iterate over every file name
                {
                    Console.WriteLine(filename);

                    string html = File.ReadAllText(filename,Encoding.UTF8);// read the file contents
                    // NOTE(review): IndexOf returns -1 when the file has no '\n', which makes Remove throw.
                    string title = html.Remove(html.IndexOf('\n')); // extract the title (text before the first '\n')
                    string content = html.Replace(title, "");// remove every occurrence of the title text, leaving the content
                    // NOTE(review): LastIndexOf returns -1 when the title has no '-', which makes Remove throw.
                     title = title.Remove(title.LastIndexOf('-'));// post-process the title: drop everything from the last '-' onward
                    Console.Write(title);
                  

                    Regex r = new Regex("(?<=href=\").*?(?=\")");

                   MatchCollection con= r.Matches(content);
                   for (int i = 0; i < con.Count;i++ )
                   {

                       string url = con[i].Value;
                       // NOTE(review): the Replace arguments on the next line are garbled in this copy
                       // (missing quotes and text); the intended old/new values cannot be read from here.
                      string newurl =url.Replace( "www.,"www.);
                      content=   content.Replace(url,newurl);
                     
                  
                   }
           // Strip the opening content div tag and every closing div tag from the content.
           content = content.Replace("<div class=\"content\">","").Replace("</div>","");
                  







                    Console.WriteLine("保存...");
                    // Delete the old file, then write the processed title and content back to the same path.
                    File.Delete(filename);
                    File.WriteAllText(filename, title + "\r\n" + content, Encoding.UTF8);

                    // Pauses for Enter after each processed file.
                    Console.ReadLine();
                }

                Console.ReadLine();
            
            }


        }
        /// <summary>
        /// 根据url和编码获取html内容
        /// </summary>
        /// <param name="url">完整链接带http</param>
        /// <param name="enc">编码</param>
        /// <returns>字符串,html代码</returns>
        public static string gethtml(string url,Encoding enc)
        {

            WebClient myweb = new WebClient();//实例化一个WebClient连接

            Stream stream = myweb.OpenRead(url);//根据指定的url获取流

            StreamReader sr = new StreamReader(stream,enc);//从流中用utf8编码实例化一个读取器

            string html = sr.ReadToEnd();//从流中读取数据得到字符串

            return html;
        
        }
    }
}
搜索更多相关主题的帖子: 文章 
2014-05-02 13:54
asmdaydream
Rank: 16Rank: 16Rank: 16Rank: 16
来 自:中原
等 级:版主
威 望:13
帖 子:257
专家分:840
注 册:2009-5-10
收藏
得分:0 
表示用的是pascal,c好高端

常走夜路不怕黑 长沙PHP高薪招聘群6K+ 95926136
2014-05-05 09:15
volte
Rank: 10Rank: 10Rank: 10
等 级:贵宾
威 望:69
帖 子:1167
专家分:1316
注 册:2004-12-19
收藏
得分:0 
这个是.net写的吧

大家都是朋友,有空就来坐坐!
2014-05-12 11:01
快速回复:大神帮我看下为什么这样子我采集不到文章呢? 一运行就报错
数据加载中...
 
   



关于我们 | 广告合作 | 编程中国 | 清除Cookies | TOP | 手机版

编程中国 版权所有,并保留所有权利。
Powered by Discuz, Processed in 0.061260 second(s), 7 queries.
Copyright©2004-2024, BCCN.NET, All Rights Reserved