Home>

There is a data frame with 3 columns of "Name", "Category 1" and "Category 2".
In the "Category 2" column, the values ​​of "002, 003, 004" are collectively written as "002 to 004".

Name Category 1Category 2
A A-1 001
A A-1 002
B B-1 001
B B-1002 ~ 004

I have a question.
Above, "002 ~ 004" of category 2 keeps the value of "name" and "category 1"
Could you please teach me how to split it into separate lines as shown below?

Name Category 1 Category 2
A A-1 001
A A-1 002
B B-1 001
B B-1 002
B B-1 003
B B-1 004

  • Answer # 1

    print (df)
    # Name Category 1 Category 2
    # 0 A A-1 001
    # 1 A A-1 002
    # 2 B B-1 001
    # 3 B B-1 002 ~ 004
    def f (x):
        l = x.split ('~')
        temp = []
        for i in range (int (l [0]), int (l [-1]) + 1):
            temp.append (f'{i}'. zfill (3))
        return temp
    df ['Category 2'] = df ['Category 2'] .apply (f)
    df = df.explode ('Category 2')
    print (df)
    # Name Category 1 Category 2
    # 0 A A-1 001
    # 1 A A-1 002
    # 2 B B-1 001
    # 3 B B-1 002
    # 3 B B-1 003
    # 3 B B-1 004

  • Answer # 2

    import pandas as pd
    import io
    import re
    txt = "" "
    Name, category 1, category 2
    A, A-1,001
    A, A-1,002
    B, B-1,001
    B, B-1,002 ~ 004
    "" "
    def make_list (s):
        if re.fullmatch (r'\ d {3}', s):
            return [s]
        if re.fullmatch (r'\ d {3} ~ \ d {3}', s):
            return [f "{i: 03}" for i in range (int (s [0: 3]), int (s [4: 7]) + 1)]
        return None
    df = pd.read_csv (io.StringIO (txt))
    df ['Category 2'] = df ['Category 2'] .apply (make_list)
    # print (df)
    df = df.set_index (['Name','Category 1']) ['Category 2'] .apply (pd.Series) .stack (). Reset_index ()
    df.drop (columns = df.columns [2], inplace = True)
    df.columns = ['Name','Category 1','Category 2']
    print (df)
    Name Category 1 Category 2
    0 A A-1 001
    1 A A-1 002
    2 B B-1 001
    3 B B-1 002
    4 B B-1 003
    5 B B-1 004