欢迎来到飞鸟慕鱼博客,开始您的技术之旅!
当前位置: 首页 > 知识笔记 > 正文

Title: [C#] Extract hyperlinks from a web page (C#)

终极管理员 知识笔记 128阅读

using System;
using System.Collections;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml;

/// <summary>
/// Console tool that downloads a web page, extracts the distinct
/// <c>http://</c> links found in its HTML and writes them to
/// <c>HyperLinks.xml</c> in the current directory.
/// </summary>
public class App
{
    /// <summary>
    /// Entry point: prompts for an address, fetches the page, extracts its
    /// hyperlinks and writes them to the XML file.
    /// </summary>
    public static void Main()
    {
        Console.Write("Please enter a web address:");
        string strURL = Console.ReadLine();

        // ReadLine returns null when standard input is closed; an empty
        // address cannot be fetched either, so there is nothing to do.
        if (strURL == null || strURL.Trim().Length == 0)
        {
            return;
        }

        strURL = strURL.Trim();

        // StartsWith (unlike Substring(0, 7)) is safe for inputs shorter than
        // the prefix. An explicit https:// address is left untouched.
        bool hasScheme =
            strURL.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
            strURL.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
        if (!hasScheme)
        {
            strURL = "http://" + strURL;
        }

        Console.WriteLine("Getting page code, please wait");
        string strCode = GetPageSource(strURL);

        Console.WriteLine("正在提取超链接,请稍侯");
        ArrayList alLinks = GetHyperLinks(strCode);

        Console.WriteLine("正在写入文件,请稍侯");
        WriteToXml(strURL, alLinks);
    }

    /// <summary>
    /// Downloads the HTML source of the given web page.
    /// </summary>
    /// <param name="URL">Absolute http/https address of the page.</param>
    /// <returns>The response body decoded as GB2312 text.</returns>
    static string GetPageSource(string URL)
    {
        Uri uri = new Uri(URL);

        // The request must be fully configured before GetResponse() sends it.
        HttpWebRequest hwReq = (HttpWebRequest)WebRequest.Create(uri);
        hwReq.Method = "GET";
        hwReq.KeepAlive = false;

        // Dispose the response and the reader so the connection is released
        // even when reading fails.
        // NOTE(review): the body is always decoded as GB2312, so pages served
        // in another charset will not decode correctly - confirm this is intended.
        using (HttpWebResponse hwRes = (HttpWebResponse)hwReq.GetResponse())
        using (StreamReader reader = new StreamReader(
            hwRes.GetResponseStream(), Encoding.GetEncoding("GB2312")))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Extracts the distinct <c>http://</c> URLs that appear in a piece of HTML.
    /// </summary>
    /// <param name="htmlCode">HTML text to scan; null or empty yields an empty list.</param>
    /// <returns>The unique URLs as strings, sorted with the default comparer.</returns>
    static ArrayList GetHyperLinks(string htmlCode)
    {
        ArrayList al = new ArrayList();
        if (htmlCode == null || htmlCode.Length == 0)
        {
            return al;
        }

        string strRegex = @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";
        Regex r = new Regex(strRegex, RegexOptions.IgnoreCase);

        foreach (Match m in r.Matches(htmlCode))
        {
            string strNew = m.Value;

            // Skip URLs that have already been collected.
            if (!al.Contains(strNew))
            {
                al.Add(strNew);
            }
        }

        al.Sort();
        return al;
    }

    /// <summary>
    /// Writes the extracted URLs to <c>HyperLinks.xml</c> (UTF-8, indented).
    /// Each URL becomes an element named after its domain suffix, as returned
    /// by <see cref="GetDomain"/>.
    /// </summary>
    /// <param name="strURL">Address of the page the links were taken from; used in the header comment.</param>
    /// <param name="alHyperLinks">URL strings to write; null is treated as an empty list.</param>
    static void WriteToXml(string strURL, ArrayList alHyperLinks)
    {
        // using guarantees the file is flushed and closed even if writing throws.
        using (XmlTextWriter writer = new XmlTextWriter("HyperLinks.xml", Encoding.UTF8))
        {
            writer.Formatting = Formatting.Indented;
            writer.WriteStartDocument(false);
            writer.WriteDocType("HyperLinks", null, "urls.dtd", null);
            writer.WriteComment("提取自" + strURL + "的超链接");

            // Two nested HyperLinks elements; the inner one carries the timestamp.
            writer.WriteStartElement("HyperLinks");
            writer.WriteStartElement("HyperLinks", null);
            writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

            if (alHyperLinks != null)
            {
                foreach (string str in alHyperLinks)
                {
                    string title = GetDomain(str);
                    writer.WriteElementString(title, null, str);
                }
            }

            writer.WriteEndElement();
            writer.WriteEndElement();
            writer.Flush();
        }
    }

    /// <summary>
    /// Returns the domain suffix of a URL's host: one of <c>com</c>,
    /// <c>net</c>, <c>cn</c>, <c>org</c> or <c>gov</c>.
    /// </summary>
    /// <param name="strURL">URL to classify.</param>
    /// <returns>
    /// The lower-case suffix, or <c>"other"</c> when the URL is null or empty,
    /// or its host does not end in one of the recognised suffixes.
    /// </returns>
    static string GetDomain(string strURL)
    {
        const string fallback = "other";
        if (strURL == null || strURL.Length == 0)
        {
            return fallback;
        }

        // Only the host part is inspected: [^/?#:]* cannot cross into the
        // port, path, query or fragment, and the look-ahead requires the
        // suffix to be the last host label. This also accepts URLs that have
        // no trailing slash after the host.
        Match m = Regex.Match(
            strURL,
            @"^https?://[^/?#:]*\.(com|net|cn|org|gov)(?=[/:?#]|$)",
            RegexOptions.IgnoreCase);

        if (!m.Success)
        {
            return fallback;
        }

        // Normalise case so the same suffix always yields the same name.
        return m.Groups[1].Value.ToLowerInvariant();
    }
}
 

标签:
声明:无特别说明,转载请标明本文来源!