根据条件从 python 字典中选择特定范围的元素-解网

问：

我有以下词典：

# Sample input: document id -> image id -> (page marker, text).
# The marker is "FP" (first page), "LP" (last page) or "Others"; the
# extraction code further down relies on the order of the entries.
ip_dict = {
    "doc_1": {
        "img_1": ("FP", "some long text"),
        "img_2": ("LP", "another long text"),
        "img_3": ("Others", "long text"),
        "img_4": ("Others", "some loong text"),
        "img_5": ("FP", "one more text"),
        "img_6": ("FP", "another one"),
        "img_7": ("LP", "ANOTHER ONE"),
        "img_8": ("Others", "some text"),
        "img_9": ("Others", "some moretext"),
        "img_10": ("FP", "more text"),
        "img_11": ("Others", "whatever"),
        "img_12": ("Others", "more whatever"),
        "img_13": ("LP", "SoMe TeXt"),
        "img_14": ("Others", "some moretext"),
        "img_15": ("FP", "whatever"),
        "img_16": ("Others", "whatever"),
        "img_17": ("LP", "whateverrr"),
    },
    "doc_2": {
        "img_1": ("FP", "text"),
        "img_2": ("FP", "more text"),
        "img_3": ("LP", "more more text"),
        "img_4": ("Others", "some more"),
        "img_5": ("Others", "text text"),
        "img_6": ("FP", "more more text"),
        "img_7": ("Others", "lot of text"),
        "img_8": ("LP", "still more text"),
    },
}

这里 `FP` 代表第一页，`LP` 代表最后一页。对于所有 `docs`，我只想提取 `FP` 和 `LP`。对于 `Others`，只有当它们位于 `FP` 和 `LP` 之间时才提取，因为它们表示 `FP` 与 `LP` 之间的页面；如果它们位于 `FP` 和 `LP` 的范围之外，则忽略它们。此外，对于后面没有跟着 `LP` 的 `FP`，将其视为单独的一页并提取。所以我期望的输出字典是这样的：

# Expected result for the sample input: document id -> list of page ranges,
# where each range is a dict holding the images that belong to it.
op_dict = {
    "doc_1": [
        {
            "img_1": ("FP", "some long text"),
            "img_2": ("LP", "another long text"),
        },
        {
            "img_5": ("FP", "one more text"),
        },
        {
            "img_6": ("FP", "another one"),
            "img_7": ("LP", "ANOTHER ONE"),
        },
        {
            "img_10": ("FP", "more text"),
            "img_11": ("Others", "whatever"),
            "img_12": ("Others", "more whatever"),
            "img_13": ("LP", "SoMe TeXt"),
        },
        {
            "img_15": ("FP", "whatever"),
            "img_16": ("Others", "whatever"),
            "img_17": ("LP", "whateverrr"),
        },
    ],
    "doc_2": [
        {
            "img_1": ("FP", "text"),
        },
        {
            "img_2": ("FP", "more text"),
            "img_3": ("LP", "more more text"),
        },
        {
            "img_6": ("FP", "more more text"),
            "img_7": ("Others", "lot of text"),
            "img_8": ("LP", "still more text"),
        },
    ],
}

如您所见，所有 `FP` 和 `LP` 都已被提取，位于 `FP` 和 `LP` 之间的 `Others` 也被提取并存储在同一个字典中。此外，那些后面没有跟着 `LP` 的 `FP` 也被提取出来。

附言（边界情况：前面没有 `FP` 的 `LP` 也应作为单独的一页提取）：

# Edge-case input: the document starts with an "LP" that has no "FP"
# in front of it.
ip_dict = {
    "doc_1": {
        "img_1": ("LP", "some long text"),
        "img_2": ("Others", "another long text"),
        "img_3": ("Others", "long text"),
        "img_4": ("FP", "long text"),
        "img_5": ("Others", "long text"),
        "img_6": ("LP", "long text"),
    },
}

# Expected result for the edge case: the leading "LP" forms a range of its
# own and the "Others" entries that follow it are left out.
op_dict = {
    "doc_1": [
        {
            "img_1": ("LP", "some long text"),
        },
        {
            "img_4": ("FP", "long text"),
            "img_5": ("Others", "long text"),
            "img_6": ("LP", "long text"),
        },
    ],
}

任何帮助都是值得赞赏的！

python-3.x 字典数据

# For every document, print each image from an 'FP' entry up to and
# including the next 'LP' entry; entries outside such a window are skipped.
for doc_name, images in ip_dict.items():
    print('\n', doc_name, '\n')

    # True while the loop is between an 'FP' and its closing 'LP'.
    inside = False

    for img_name in images:
        entry = images[img_name]
        kind = entry[0]   # page marker, e.g. 'FP' or 'LP'
        text = entry[1]   # the text stored for the image

        # An 'FP' opens the window before the entry itself is printed ...
        if kind == 'FP':
            inside = True

        if inside:
            print(img_name, ' :\t', kind, '/', text)

        # ... and an 'LP' closes it only after it has been printed.
        if kind == 'LP':
            inside = False

结果：

doc_1 

img_1  :     FP / some long text
img_2  :     LP / another long text
img_5  :     FP / one more text
img_6  :     FP / another one
img_7  :     LP / ANOTHER ONE
img_10  :    FP / more text
img_11  :    Others / whatever
img_12  :    Others / more whatever
img_13  :    LP / SoMe TeXt
img_15  :    FP / whatever
img_16  :    Others / whatever
img_17  :    LP / whateverrr

doc_2 

img_1  :     FP / text
img_2  :     FP / more text
img_3  :     LP / more more text
img_6  :     FP / more more text
img_7  :     Others / lot of text
img_8  :     LP / still more text


def process(ip_dict):
    """Group every document's images into page ranges.

    Args:
        ip_dict: Mapping ``{doc: {img: (marker, text)}}`` where ``marker``
            is ``"FP"`` (first page), ``"LP"`` (last page) or anything
            else (treated as an in-between page). The images are visited
            in the mapping's iteration order.

    Returns:
        ``{doc: [range, ...]}`` where each range is a dict of the images
        that belong together:

        * an ``"FP"`` up to and including the next ``"LP"``, with every
          entry in between;
        * an ``"FP"`` alone when another ``"FP"`` or the end of the
          document comes before an ``"LP"`` (entries collected after it
          are dropped);
        * an ``"LP"`` alone when no ``"FP"`` is waiting for it.

        Entries outside any range are ignored.
    """
    op_dict = {}
    for doc_key, images in ip_dict.items():
        ranges = []
        current = {}      # entries of the range that is still open
        start_key = None  # key of the FP that opened ``current``

        for img_key, entry in images.items():
            marker = entry[0]
            if marker == "FP":
                if start_key is not None:
                    # The previous FP never met its LP: keep it as a
                    # single page and drop what was collected after it.
                    ranges.append({start_key: current[start_key]})
                start_key = img_key
                current = {img_key: entry}
            elif marker == "LP":
                # ``current`` is empty when no FP is open, so a stray LP
                # becomes a one-entry range.
                current[img_key] = entry
                ranges.append(current)
                start_key = None
                current = {}
            elif start_key is not None:
                current[img_key] = entry

        if start_key is not None:
            # The document ended while an FP was still waiting for its LP.
            ranges.append({start_key: current[start_key]})
        op_dict[doc_key] = ranges
    return op_dict

# Run the grouping on the sample input and print the resulting dictionary.
print(process(ip_dict))

1赞 John Collins 8/2/2023 #3

一种可能的方法：

# Build op_dict: document id -> list of page ranges taken from ip_dict.
# A range is an "FP" through the next "LP" (everything in between included),
# a lone "FP" that never reaches an "LP", or a lone "LP" with no open "FP".
op_dict = {}
for doc, imgs in ip_dict.items():
    op_dict[doc] = []
    # Key of the FP whose range is still open. It is reset for every
    # document so an unfinished range cannot leak into the next one.
    first_page = None
    new = {}
    for k, v in imgs.items():
        if v[0] == "FP":
            if first_page is not None:
                # The previous FP had no LP: store it as a single page,
                # without the entries collected after it.
                op_dict[doc].append({first_page: new[first_page]})
            first_page = k
            new = {k: v}
        elif v[0] == "LP":
            # ``new`` is empty when no FP is open, which turns a stray LP
            # into a range of its own.
            new[k] = v
            op_dict[doc].append(new)
            first_page = None
            new = {}
        elif first_page is not None:
            new[k] = v
    if first_page is not None:
        # The document ended before the LP: keep only the opening FP.
        op_dict[doc].append({first_page: new[first_page]})

这给出了：

{'doc_1': [{'img_1': ('FP', 'some long text'),
   'img_2': ('LP', 'another long text')},
  {'img_5': ('FP', 'one more text')},
  {'img_6': ('FP', 'another one'), 'img_7': ('LP', 'ANOTHER ONE')},
  {'img_61': ('FP', 'another one'), 'img_71': ('LP', 'ANOTHER ONE')},
  {'img_62': ('FP', 'another one'), 'img_72': ('LP', 'ANOTHER ONE')},
  {'img_54': ('FP', 'one more text')},
  {'img_540': ('FP', 'one more text')},
  {'img_541': ('FP', 'one more text')},
  {'img_13': ('FP', 'more text'),
   'img_14': ('Others', 'whatever'),
   'img_140': ('Others', 'whatever'),
   'img_141': ('Others', 'whatever'),
   'img_142': ('Others', 'whatever'),
   'img_15': ('Others', 'more whatever'),
   'img_16': ('LP', 'SoMe TeXt')},
  {'img_18': ('FP', 'whatever'),
   'img_19': ('Others', 'whatever'),
   'img_20': ('LP', 'whateverrr')}],
 'doc_2': [{'img_1': ('FP', 'text')},
  {'img_2': ('FP', 'more text'), 'img_3': ('LP', 'more more text')},
  {'img_6': ('FP', 'more more text'),
   'img_7': ('Others', 'lot of text'),
   'img_8': ('LP', 'still more text')},
  {'img_69': ('FP', 'more more text')}]}

def _page_ranges(pages):
    """Split one document's ``{img: (mark, text)}`` mapping into page ranges."""
    ranges = []
    open_key = None  # key of the FP whose range (ranges[-1]) is still open
    for key, item in pages.items():
        mark = item[0]
        if mark == 'FP':
            if open_key is not None:
                # Previous range was never closed: cut it back to its FP.
                ranges[-1] = {open_key: ranges[-1][open_key]}
            ranges.append({key: item})
            open_key = key
        elif mark == 'LP':
            if open_key is None:
                # No FP is waiting for this LP, so it is a page of its own.
                ranges.append({key: item})
            else:
                ranges[-1][key] = item
                open_key = None
        elif open_key is not None:
            ranges[-1][key] = item
    if open_key is not None:
        # The document ended inside an open range: keep only its FP.
        ranges[-1] = {open_key: ranges[-1][open_key]}
    return ranges


def select_page_ranges(d: dict) -> dict:
    """Return the page ranges of every document in ``d``.

    Args:
        d: Mapping ``{doc: {img: (mark, text)}}``; ``mark`` is ``'FP'``
            (first page), ``'LP'`` (last page) or any other value for an
            in-between page. Images are read in iteration order.

    Returns:
        ``{doc: [range, ...]}``. Each range is a dict that holds either an
        ``'FP'`` through the next ``'LP'`` with everything in between, a
        single ``'FP'`` that was not followed by an ``'LP'``, or a single
        ``'LP'`` that had no open ``'FP'``. All other entries are left out.
    """
    return {doc: _page_ranges(pages) for doc, pages in d.items()}

# Run the range selection on the sample input and print the result.
print(select_page_ranges(ip_dict))

{'doc_1': [{'img_1': ('FP', 'some long text'),
            'img_2': ('LP', 'another long text')},
           {'img_5': ('FP', 'one more text')},
           {'img_6': ('FP', 'another one'), 'img_7': ('LP', 'ANOTHER ONE')},
           {'img_61': ('FP', 'another one'), 'img_71': ('LP', 'ANOTHER ONE')},
           {'img_62': ('FP', 'another one'), 'img_72': ('LP', 'ANOTHER ONE')},
           {'img_54': ('FP', 'one more text')},
           {'img_540': ('FP', 'one more text')},
           {'img_541': ('FP', 'one more text')},
           {'img_13': ('FP', 'more text'),
            'img_14': ('Others', 'whatever'),
            'img_140': ('Others', 'whatever'),
            'img_141': ('Others', 'whatever'),
            'img_142': ('Others', 'whatever'),
            'img_15': ('Others', 'more whatever'),
            'img_16': ('LP', 'SoMe TeXt')},
           {'img_18': ('FP', 'whatever'),
            'img_19': ('Others', 'whatever'),
            'img_20': ('LP', 'whateverrr')}],
 'doc_2': [{'img_1': ('FP', 'text')},
           {'img_2': ('FP', 'more text'), 'img_3': ('LP', 'more more text')},
           {'img_6': ('FP', 'more more text'),
            'img_7': ('Others', 'lot of text'),
            'img_8': ('LP', 'still more text')},
           {'img_69': ('FP', 'more more text')}]}

根据条件从 python 字典中选择特定范围的元素

Select specific range of elements from a python dictionary based on condition

评论

评论

评论

评论