ABOUT ME

-

Today
-
Yesterday
-
Total
-
  • 002. HTML 파싱하기 - 태그읽기
    Java 2015. 7. 29. 15:51
    1. getElementsByTag(String tag)를 이용한 태그 읽기

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
     import java.io.File;
    import java.io.IOException;
     
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
     
    // 파일에 있는 HTML 파싱하기
    public class Jsoup05 {
        public static void main(String[] args) {
            String fileName = "Jsoup05.html";
            File file = new File(fileName);
            Document document = null;
            try {
                document = Jsoup.parse(file, "UTF-8");
                
                // 태그 읽기
                Elements links1 = document.getElementsByTag("h1");
                for (Element link : links1) {
                    System.out.println(link.text());
                }
                System.out.println();
                
                // 태그 읽기
                Elements links2 = document.getElementsByTag("h2");
                for (Element link : links2) {
                    System.out.println(link.text());
                }
                
            } catch (IOException e) {
                e.printStackTrace();
            }
     
        }
    }
    cs

    Jsoup05.html

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    <!doctype html>
    <!-- Practice document read by the Jsoup05 and Jsoup06 examples. -->
    <html lang="ko">
    <head>
    <meta charset="UTF-8">
    <title>연습용 HTML문서</title>
    </head>
    <body>
        <!-- Four h1 headings interleaved with four h2 headings; the examples
             collect each tag name separately and print the texts in document order. -->
        <h1>파일로부터 h1 tag 읽기 1</h1>
        <h2>파일로부터 h2 tag 읽기 1</h2>
        <h1>파일로부터 h1 tag 읽기 2</h1>
        <h2>파일로부터 h2 tag 읽기 2</h2>
        <h1>파일로부터 h1 tag 읽기 3</h1>
        <h2>파일로부터 h2 tag 읽기 3</h2>
        <h1>파일로부터 h1 tag 읽기 4</h1>
        <h2>파일로부터 h2 tag 읽기 4</h2>
    </body>
    </html>
    cs

    결과

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    파일로부터 h1 tag 읽기 1
    파일로부터 h1 tag 읽기 2
    파일로부터 h1 tag 읽기 3
    파일로부터 h1 tag 읽기 4
     
    파일로부터 h2 tag 읽기 1
    파일로부터 h2 tag 읽기 2
    파일로부터 h2 tag 읽기 3
    파일로부터 h2 tag 읽기 4
     
    cs

    2. select(tagName)으로 태그 읽기

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    import java.io.File;
    import java.io.IOException;
     
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
     
    // 파일에 있는 HTML 파싱하기
    public class Jsoup06 {
        public static void main(String[] args) {
            String fileName = "Jsoup05.html";
            File file = new File(fileName);
            Document document = null;
            try {
                document = Jsoup.parse(file, "UTF-8");
                
                // select로 태그 읽기
                Elements links1 = document.select("h1");
                for (Element link : links1) {
                    System.out.println(link.text());
                }
                System.out.println();
                
                Elements links2 = document.select("h2");
                for (Element link : links2) {
                    System.out.println(link.text());
                }
                
            } catch (IOException e) {
                e.printStackTrace();
            }
     
        }
    }
     
    cs

    결과

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    파일로부터 h1 tag 읽기 1
    파일로부터 h1 tag 읽기 2
    파일로부터 h1 tag 읽기 3
    파일로부터 h1 tag 읽기 4
     
    파일로부터 h2 tag 읽기 1
    파일로부터 h2 tag 읽기 2
    파일로부터 h2 tag 읽기 3
    파일로부터 h2 tag 읽기 4
     
    cs

    3. 자식 태그를 검색하기

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    import java.io.IOException;
     
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
     
    // 뉴스 RSS에서 읽기
    public class Jsoup07 {
        public static void main(String[] args) {
            Document document = null;
            try {
                String url = "http://rss.hankyung.com/new/news_main.xml";
                document = Jsoup.connect(url).get();
                Elements elements = document.getElementsByTag("item");
                for(Element el : elements){
                    Elements titles = el.getElementsByTag("title");
                    for(Element title : titles){
                     String temp = title.text().replace("<![CDATA[","").replace("]]>","");
                     System.out.println(temp);
                    }
                }
                System.out.println();
                
                elements.clear();
                elements = document.select("channel item title");
                for(Element el : elements){
                    String temp = el.text().replace("<![CDATA[","").replace("]]>","");
                    System.out.println(temp);
                }
                
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
     
     
    cs

    4. id, class로 검색하기

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    import java.io.File;
    import java.io.IOException;
     
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
     
     
    public class Jsoup08 {
        public static void main(String[] args) {
            String fileName = "Jsoup08.html";
            File file = new File(fileName);
            Document document = null;
            try {
                document = Jsoup.parse(file, "UTF-8");
                
                // getElementById로 태그 읽기
                // 리턴값이 Element타입이다.
                Element box1 = document.getElementById("box1");
                System.out.println(box1.text());
                System.out.println();
                
                // select("#아이디")로 읽기
                // 리턴값이 Elements타입이다.
                Elements box2 = document.select("#box2");
                for (Element box : box2) {
                    System.out.println(box.text());
                }
                System.out.println();
     
                // getElementsByClass로 태그 읽기
                Elements redbox = document.getElementsByClass("redbox");
                System.out.println("개수 : " + redbox.size() + "개");
                for (Element box : redbox) {
                    System.out.println(box.text());
                }
                System.out.println();
     
                // select(".클래스")로 태그 읽기
                Elements greenbox = document.select(".greenbox");
                System.out.println("개수 : " + greenbox.size() + "개");
                for (Element box : greenbox) {
                    System.out.println(box.text());
                }
                System.out.println();
                
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
     
     
     
     
    cs

    Jsoup08.html

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    <!doctype html>
    <!-- Practice document read by the Jsoup08 example (lookup by id and class). -->
    <html lang="ko">
    <head>
    <meta charset="UTF-8">
    <title>연습용 HTML문서</title>
    <style type="text/css">
    /* Non-zero lengths need a unit, hence 5px rather than a bare 5. */
    .redbox {
        border: 1px solid red;
        margin: 5px;
    }
    .greenbox {
        border: 1px solid green;
        margin: 5px;
    }
    </style>
    </head>
    <body>
        <div id="box1" class="greenbox">
            나는 id가 box1인 영역입니다. class는 greenbox
        </div>
        <!-- NOTE(review): id "box2" appears on two elements although ids are meant
             to be unique; presumably deliberate so that select("#box2") yields
             several elements. Confirm before changing. -->
        <div id="box2" class="redbox">
            나는 id가 box2인 영역입니다. class는 redbox
        </div>
        <div id="box2" class="greenbox">
            나는 id가 box2인 영역입니다. class는 greenbox
        </div>
    </body>
    </html>
     
     
     
    cs



    댓글

Designed by Tistory.