thanks!

解决方案 »

  1.   


    /// <summary>
        /// 去掉html标记
        /// </summary>
        /// <param name="str"></param>
        /// <returns></returns>
        protected static string ConvertGettext(string str)
        {
            Regex regex = new Regex(@"\<(.*?)\>", RegexOptions.IgnoreCase);        return regex.Replace(str, "").Replace("&nbsp;", "").Replace("\n", "").Replace("\r", "");
        } 
    具体情况你可以发个示例来看看
      

  2.   

    http://topic.csdn.net/u/20110508/21/b00b79a9-c90b-4a01-8ab6-225243547d04.html
      

  3.   

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Net;
    using System.IO;
    using System.IO.Compression;
    using System.Text.RegularExpressions;namespace WikiPageCreater.Common
    {
        public class PageHelper
        {
            /// <summary>
            /// 根据 url 获取网页编码
            /// </summary>
            /// <param name="url"></param>
            /// <returns></returns>
            public static string GetEncoding(string url)
            {
                HttpWebRequest request = null;
                HttpWebResponse response = null;
                StreamReader reader = null;
                try
                {
                    request = (HttpWebRequest)WebRequest.Create(url);
                    request.Timeout = 20000;
                    request.AllowAutoRedirect = false;                response = (HttpWebResponse)request.GetResponse();
                    if (response.StatusCode == HttpStatusCode.OK && response.ContentLength < 1024 * 1024)
                    {
                        if (response.ContentEncoding != null && response.ContentEncoding.Equals("gzip", StringComparison.InvariantCultureIgnoreCase))
                            reader = new StreamReader(new GZipStream(response.GetResponseStream(), CompressionMode.Decompress));
                        else
                            reader = new StreamReader(response.GetResponseStream(), Encoding.ASCII);                    string html = reader.ReadToEnd();                    Regex reg_charset = new Regex(@"charset\b\s*=\s*(?<charset>[^""]*)");
                        if (reg_charset.IsMatch(html))
                        {
                            return reg_charset.Match(html).Groups["charset"].Value;
                        }
                        else if (response.CharacterSet != string.Empty)
                        {
                            return response.CharacterSet;
                        }
                        else
                            return Encoding.Default.BodyName;
                    }
                }
                catch
                {
                }
                finally
                {                if (response != null)
                    {
                        response.Close();
                        response = null;
                    }
                    if (reader != null)
                        reader.Close();                if (request != null)
                        request = null;            }            return Encoding.Default.BodyName;
            }        /// <summary>
            /// 根据 url 和 encoding 获取当前url页面的 html 源代码        
           /// </summary>
            /// <param name="url"></param>
            /// <param name="encoding"></param>
            /// <returns></returns>
            public static string GetHtml(string url, Encoding encoding)
            {
                HttpWebRequest request = null;
                HttpWebResponse response = null;
                StreamReader reader = null;
                try
                {
                    request = (HttpWebRequest)WebRequest.Create(url);
                    request.Timeout = 20000;
                    request.AllowAutoRedirect = false;                response = (HttpWebResponse)request.GetResponse();
                    if (response.StatusCode == HttpStatusCode.OK && response.ContentLength < 1024 * 1024)
                    {
                        if (response.ContentEncoding != null && response.ContentEncoding.Equals("gzip", StringComparison.InvariantCultureIgnoreCase))
                            reader = new StreamReader(new GZipStream(response.GetResponseStream(), CompressionMode.Decompress), encoding);
                        else
                            reader = new StreamReader(response.GetResponseStream(), encoding);
                        string html = reader.ReadToEnd();                    return html;
                    }
                }
                catch
                {
                }
                finally
                {                if (response != null)
                    {
                        response.Close();
                        response = null;
                    }
                    if (reader != null)
                        reader.Close();                if (request != null)
                        request = null;            }            return string.Empty;
            }
        }
    }然后正则取
      

  4.   

    -----------------
    具体情况是这样的 想调用微软的翻译实现多国语言
    一次传输太多内容会翻译不了,想把需要翻译的内容,分成几段传输过去
    想把网页生成的html中需要翻译的内容提取出来,然后再翻译 
      

  5.   

    protected static string ConvertGettext(string str)
        {
            Regex regex = new Regex(@"\<(.*?)\>", RegexOptions.IgnoreCase);        return regex.Replace(str, "").Replace("&nbsp;", "").Replace("\n", "").Replace("\r", "");
        }
      

  6.   


     public static string DropHTML(string Htmlstring)
            {
                //删除脚本  
                Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase);
                //删除HTML  
                Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);            Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);
                Htmlstring.Replace("<", "");
                Htmlstring.Replace(">", "");
                Htmlstring.Replace("\r\n", "");
                Htmlstring = HttpContext.Current.Server.HtmlEncode(Htmlstring).Trim();
                return Htmlstring;
            }