使用Html Agility Pack快速解析Html内容

Html Agility Pack 是一个开源的、面向 .NET 平台的 HTML 解析库。

开源地址:https://github.com/zzzprojects/html-agility-pack

用法:在 Visual Studio 中通过 NuGet 包管理器搜索 HtmlAgilityPack 并安装

示例代码1:

        /// <summary>
        /// Downloads the stock list page and returns its markup as a string.
        /// </summary>
        /// <remarks>
        /// The HTTP handler is configured for automatic gzip/deflate decompression, so the
        /// body is decoded according to the Content-Encoding the server actually sent
        /// (including no compression at all) instead of always assuming gzip.
        /// The method is synchronous, so the asynchronous HttpClient calls are blocked on.
        /// </remarks>
        /// <returns>The page HTML decoded with the gb2312 encoding.</returns>
        /// <exception cref="HttpRequestException">The server answered with a non-success status code.</exception>
        /// <exception cref="AggregateException">The request itself failed (wraps the underlying exception).</exception>
        private static string GetHtml()
        {
            string url = "http://quote.eastmoney.com/stocklist.html";

            // Key point 1: let the handler negotiate and undo gzip/deflate compression.
            // It sends the matching Accept-Encoding header and picks the right decoder
            // from the response's Content-Encoding header.
            var handler = new HttpClientHandler
            {
                AutomaticDecompression = System.Net.DecompressionMethods.GZip | System.Net.DecompressionMethods.Deflate
            };

            // HttpClient disposes the handler it was constructed with.
            using (var client = new HttpClient(handler))
            using (var response = client.GetAsync(url).Result)
            {
                // Fail loudly rather than returning an error page as if it were the stock list.
                response.EnsureSuccessStatusCode();

                // Key point 2: the stream handed back here is already decompressed.
                // NOTE(review): on .NET Core / .NET 5+ the "gb2312" code page is only available
                // after registering CodePagesEncodingProvider - confirm the target runtime.
                using (var body = response.Content.ReadAsStreamAsync().Result)
                using (var reader = new StreamReader(body, Encoding.GetEncoding("gb2312")))
                {
                    return reader.ReadToEnd();
                }
            }
        }

示例代码2:

        /// <summary>
        /// Entry point: downloads the stock list page, selects the stock links with an
        /// XPath query and prints each stock's name and code to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            string html = GetHtml();

            var doc = new HtmlDocument();
            doc.LoadHtml(html);

            // Select every <a> under ul/li inside the div whose id is "quotesearch".
            var nodes = doc.DocumentNode.SelectNodes("//div[@id='quotesearch']/ul/li/a");

            // SelectNodes returns null (not an empty collection) when nothing matches.
            if (nodes != null)
            {
                foreach (var node in nodes)
                {
                    // The link text is split at '(' into a name part and a code part;
                    // entries without '(' have no code part and are skipped.
                    var parts = node.InnerText.Split('(');
                    if (parts.Length < 2)
                    {
                        continue;
                    }

                    Console.WriteLine("股票名称:{0},股票代码:{1}", parts[0], parts[1].Replace(")", ""));
                }
            }

            // Keep the console window open until a key is pressed.
            Console.ReadKey();
        }