Home>

How to select only the parent td elements, without the nested child td elements, in BeautifulSoup 4 (bs4) when each page can have a different number of parent elements

<tbody>  <tr>    <td>Some text</td>    <td>Some text2</td>    <td>Some text3</td>    <td>Some text3</td>    <td>       <tbody>          <td>Some text that does not need to be parsed1</td>          <td>Some text that does not need to be parsed2</td>          <td>Some text that does not need to be parsed3</td>       </tbody>    </td>    <td>Some text4</td>    <td>Some text5</td>  </tr></tbody>
  • Answer # 1

    It's not clear from the question which option you expect, but try this:

    from bs4 import BeautifulSoup as Soup

    # Sample markup: an outer <tbody> whose row holds the wanted <td> cells,
    # plus one cell that wraps a nested <tbody> whose cells must be ignored.
    # The opening tags must be <tbody> (not <body>) so that they pair with the
    # closing </tbody> tags and soup.find('tbody') below has something to match.
    txt = """<tbody>  <tr>    <td>Some text</td>    <td>Some text2</td>    <td>Some text3</td>    <td>Some text3</td>    <td>       <tbody>          <td>Some text that does not need to be parsed1</td>          <td>Some text that does not need to be parsed2</td>          <td>Some text that does not need to be parsed3</td>       </tbody>    </td>    <td>Some text4</td>    <td>Some text5</td>  </tr></tbody>"""
    soup = Soup(txt, 'html.parser')
    tbody = soup.find('tbody')
    if tbody is None:
        # find() yields None when nothing matches; fail with a clear message
        # instead of an AttributeError on the next line.
        raise ValueError('no <tbody> element found in the markup')
    # Remove every nested tbody/table so that only the outer cells keep text.
    for elem in tbody.find_all(['tbody', 'table']):
        elem.replace_with('')
    print(
        *[
            item.get_text(strip=True)
            for item in tbody.find_all('td')
            # The cell that wrapped the nested tbody is now empty; skip it.
            if item.get_text(strip=True)
        ],
        sep='\n'
    )
    # Output:
    #Some text
    #Some text2
    #Some text3
    #Some text3
    #Some text4
    #Some text5
    
  • Answer # 2

    It's not clear from the question which option you expect, but try this:

    from bs4 import BeautifulSoup as Soup

    # Sample markup: the outer <tbody> row contains the cells to keep and one
    # cell holding a nested <tbody> that should be excluded from the result.
    # Opening tags are <tbody> (not <body>) so they match the closing </tbody>
    # tags and the soup.find('tbody') lookup below.
    txt = """<tbody>  <tr>    <td>Some text</td>    <td>Some text2</td>    <td>Some text3</td>    <td>Some text3</td>    <td>       <tbody>          <td>Some text that does not need to be parsed1</td>          <td>Some text that does not need to be parsed2</td>          <td>Some text that does not need to be parsed3</td>       </tbody>    </td>    <td>Some text4</td>    <td>Some text5</td>  </tr></tbody>"""
    soup = Soup(txt, 'html.parser')
    tbody = soup.find('tbody')
    if tbody is None:
        # Without this guard a missing <tbody> surfaces as an AttributeError.
        raise ValueError('no <tbody> element found in the markup')
    # Strip nested tbody/table elements before collecting the cell texts.
    for elem in tbody.find_all(['tbody', 'table']):
        elem.replace_with('')
    # Compute each cell's stripped text once; filter(None, ...) then drops the
    # empty string left by the cell that used to wrap the nested tbody.
    cell_texts = (cell.get_text(strip=True) for cell in tbody.find_all('td'))
    print(*filter(None, cell_texts), sep='\n')
    # Output:
    #Some text
    #Some text2
    #Some text3
    #Some text3
    #Some text4
    #Some text5