C# -- HTML Agility Pack을 사용한 HTML 파싱
참고 : http://www.c-sharpcorner.com/UploadFile/9b86d4/getting-started-with-html-agility-pack/
http://blog.olussier.net/2010/03/30/easily-parse-html-documents-in-csharp/
http://stackoverflow.com/questions/21236359/get-a-value-of-an-attribute-by-htmlagilitypack
http://www.codeproject.com/Tips/804660/How-to-Parse-HTML-using-Csharp
http://www.w3schools.com/xsl/xpath_syntax.asp ==> xpath 사용법
** HTML Agility Pack 에서는 XPath 문법을 지원한다.
1. Manage NuGet Packages... 를 통해서 Html Agility Pack 을 설치한다.
2. 사용하기
** 사용 예
// Create the web loader first; HtmlWeb downloads the page and parses it into an HtmlDocument.
HtmlAgilityPack.HtmlWeb htmlWeb = new HtmlAgilityPack.HtmlWeb();
HtmlAgilityPack.HtmlDocument document = htmlWeb.Load("http://www.somewebsite.com");
// GetElementbyId returns null when no element carries the given id, so check before using someNode.
HtmlNode someNode = document.GetElementbyId("mynode");
3. 출력결과
<< 소스 >>
using System;
using HtmlAgilityPack;

namespace HtmlParsing
{
    /// <summary>
    /// Console sample that parses an in-memory HTML fragment with HTML Agility Pack
    /// and prints every &lt;img&gt; element followed by the value of its src attribute.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Loads a hard-coded HTML string, selects all img elements with the XPath
        /// expression "//img", and writes each element's outer HTML and src value
        /// to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Verbatim string: doubled quotes ("") are literal quotes inside the HTML.
            String myHtml = @"<body>
<p>It is better to use the style attribute (instead of the width and height attributes), because it prevents
internal or external styles sheets to change the original size of an image:</p>
<img src=""html51.gif"" alt=""HTML5 Icon"" style=""width:128px;height:128px;"">
<img src=""html52.gif"" alt=""HTML5 Icon"" width=""128"" height=""128"">
</body>";

            HtmlAgilityPack.HtmlDocument mydoc = new HtmlAgilityPack.HtmlDocument();
            mydoc.LoadHtml(myHtml);

            // SelectNodes returns null (not an empty collection) when the XPath
            // matches nothing, so guard before enumerating.
            HtmlAgilityPack.HtmlNodeCollection nodeCol = mydoc.DocumentNode.SelectNodes("//img");
            if (nodeCol != null)
            {
                foreach (HtmlAgilityPack.HtmlNode node in nodeCol)
                {
                    Console.WriteLine(node.OuterHtml);
                    // GetAttributeValue falls back to the default instead of throwing
                    // when an <img> has no src attribute.
                    Console.WriteLine(node.GetAttributeValue("src", string.Empty));
                }
            }

            // Block until a line is entered; presumably here to keep the console window open.
            Console.ReadLine();
        }
    }
}
'C# Web Scraping' 카테고리의 다른 글
C# -- IE automation #7 -- iframe 접근하기 (2) | 2016.02.09 |
---|---|
C# -- IE automation #6 -- 구글 검색창 제어하기 (0) | 2016.02.09 |
C# -- IE automation #5 -- Attribute 제어하기 (0) | 2016.02.09 |
C# -- IE automation #4 -- html 구조 파헤치기 (0) | 2016.02.09 |
C# -- IE automation #3 -- internet explorer 제어하여 html 소스 가져오기 (4) | 2016.02.03 |