Security || AI

2. 악성코드 PE 추출 with pickle, pefile 본문

프로젝트/RL_Malware

2. 악성코드 PE 추출 with pickle, pefile

보안&인공지능 2021. 2. 15. 06:38

1. Dos Header 추출

def Dos_Header(pe):
    """Return the numeric DOS-header fields of a parsed PE file as a list.

    Args:
        pe: Object exposing a ``DOS_HEADER`` attribute with the ``e_*``
            fields read below (presumably a ``pefile.PE`` instance --
            confirm against the caller).

    Returns:
        A list of 15 values, in the fixed order of ``field_names``.
    """
    field_names = (
        "e_cblp",
        "e_cp",
        "e_crlc",
        "e_cparhdr",
        "e_minalloc",
        "e_maxalloc",
        "e_ss",
        "e_sp",
        "e_csum",
        "e_ip",
        "e_cs",
        "e_lfarlc",
        "e_oemid",
        "e_oeminfo",
        "e_lfanew",
    )
    header = pe.DOS_HEADER
    # The original snippet built the list but never returned it, so callers
    # always received None; return it like the sibling extractors do.
    return [getattr(header, name) for name in field_names]

2. File Header 추출

def File_Header(pe):
    """Return the COFF file-header fields plus a section-count sanity flag.

    Args:
        pe: Object exposing ``FILE_HEADER`` and ``sections`` (presumably a
            ``pefile.PE`` instance -- confirm against the caller).

    Returns:
        A list of 7 values: the six header fields named in ``field_names``
        followed by 1 if ``NumberOfSections`` equals ``len(pe.sections)``,
        otherwise 0.
    """
    field_names = (
        "Machine",
        "NumberOfSections",
        "PointerToSymbolTable",
        "NumberOfSymbols",
        "SizeOfOptionalHeader",
        "Characteristics",
    )
    header = pe.FILE_HEADER
    values = [getattr(header, name) for name in field_names]

    # Flag whether the declared section count matches the parsed sections.
    declared_matches_parsed = header.NumberOfSections == len(pe.sections)
    values.append(1 if declared_matches_parsed else 0)
    return values

3. Optional Header 추출

def Optional_Header(pe):
    """Return the scalar optional-header fields of a parsed PE file.

    Args:
        pe: Object exposing an ``OPTIONAL_HEADER`` attribute with the fields
            named in ``field_names`` (presumably a ``pefile.PE`` instance --
            confirm against the caller).

    Returns:
        A list of 29 values, in the fixed order of ``field_names``.
    """
    field_names = (
        "Magic",
        "MajorLinkerVersion",
        "MinorLinkerVersion",
        "SizeOfCode",
        "SizeOfInitializedData",
        "SizeOfUninitializedData",
        "AddressOfEntryPoint",
        "BaseOfCode",
        "ImageBase",
        "SectionAlignment",
        "FileAlignment",
        "MajorOperatingSystemVersion",
        "MinorOperatingSystemVersion",
        "MajorImageVersion",
        "MinorImageVersion",
        "MajorSubsystemVersion",
        "MinorSubsystemVersion",
        "Reserved1",
        "SizeOfImage",
        "SizeOfHeaders",
        "CheckSum",
        "Subsystem",
        "DllCharacteristics",
        "SizeOfStackReserve",
        "SizeOfStackCommit",
        "SizeOfHeapReserve",
        "SizeOfHeapCommit",
        "LoaderFlags",
        "NumberOfRvaAndSizes",
    )
    header = pe.OPTIONAL_HEADER
    return [getattr(header, name) for name in field_names]

4. Data Directory 추출

def Data_Directory(pe):
    """Flatten the data-directory table into (VirtualAddress, Size) pairs.

    Args:
        pe: Object exposing ``OPTIONAL_HEADER.DATA_DIRECTORY``, an iterable
            of entries with ``VirtualAddress`` and ``Size`` attributes.

    Returns:
        A flat list ``[va0, size0, va1, size1, ...]``. When fewer than 16
        entries are present the list is zero-padded up to 16 pairs
        (32 values); extra entries beyond 16 are kept as-is.
    """
    flat = []
    seen = 0
    for seen, entry in enumerate(pe.OPTIONAL_HEADER.DATA_DIRECTORY, start=1):
        flat.extend((entry.VirtualAddress, entry.Size))

    # Pad with zero pairs so short tables still yield 16 slots.
    missing_pairs = max(0, 16 - seen)
    flat.extend([0] * (2 * missing_pairs))
    return flat

5. Section 영역 추출

def Sections(pe):
    """Return per-section header fields for five well-known section names.

    For each of ``.text``, ``.data``, ``.rsrc``, ``.rdata`` and ``.reloc``
    (in that output order) nine header fields are emitted; a section that is
    absent contributes nine zeros. If a name occurs more than once, the last
    occurrence wins. A final element counts all sections with any other name.

    Args:
        pe: Object exposing ``sections``, an iterable whose items carry a
            bytes ``Name`` and the header fields listed in ``field_names``
            (presumably a ``pefile.PE`` instance -- confirm against caller).

    Returns:
        A list of 46 values: 5 sections x 9 fields, then the count of
        sections whose name is not one of the five tracked names.
    """
    tracked = ('.text', '.data', '.rsrc', '.rdata', '.reloc')
    field_names = (
        "Misc_VirtualSize",
        "VirtualAddress",
        "SizeOfRawData",
        "PointerToRawData",
        "PointerToRelocations",
        "PointerToLinenumbers",
        "NumberOfRelocations",
        "NumberOfLinenumbers",
        "Characteristics",
    )
    per_section = {name: [0] * len(field_names) for name in tracked}
    other_section_count = 0

    for section in pe.sections:
        # Section names are raw bytes and need not be valid UTF-8; decode
        # leniently so an odd name is counted as "other" instead of raising
        # UnicodeDecodeError and aborting the whole extraction.
        name = str(section.Name, encoding="utf8", errors="replace").strip('\x00')
        if name in per_section:
            per_section[name] = [getattr(section, field) for field in field_names]
        else:
            other_section_count += 1

    features = []
    for name in tracked:
        features.extend(per_section[name])
    features.append(other_section_count)
    return features

6. Resources 영역 추출

def Resources(pe):
    """Return resource-directory entry counts for a fixed set of type ids.

    Args:
        pe: Object that may expose ``DIRECTORY_ENTRY_RESOURCE`` with a
            ``struct`` (``NumberOfNamedEntries``/``NumberOfIdEntries``) and
            an ``entries`` iterable whose items have ``id`` and
            ``directory.struct`` (presumably a ``pefile.PE`` instance --
            confirm against the caller).

    Returns:
        A list of 22 values: the total number of top-level entries, followed
        by, for each id in ``types``, the entry count of the first top-level
        entry with that id (0 when no such entry exists). All zeros when the
        object has no ``DIRECTORY_ENTRY_RESOURCE`` attribute.
    """
    types = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 17, 19, 20, 21, 22, 23, 24]
    try:
        root = pe.DIRECTORY_ENTRY_RESOURCE
    except AttributeError:
        # Only a missing attribute means "no resource directory"; a bare
        # except here would also hide KeyboardInterrupt and real errors.
        return [0] * (len(types) + 1)

    features = [root.struct.NumberOfNamedEntries + root.struct.NumberOfIdEntries]

    # Index entries by id, keeping the first occurrence of each id.
    first_by_id = {}
    for entry in root.entries:
        first_by_id.setdefault(entry.id, entry)

    for type_id in types:
        entry = first_by_id.get(type_id)
        if entry is None:
            features.append(0)
        else:
            sub = entry.directory.struct
            features.append(sub.NumberOfNamedEntries + sub.NumberOfIdEntries)
    return features

7. DLL, API 영역 추출

 

def Imported_DLL_and_API(pe):
    """Return presence flags for known DLLs/APIs plus import counts.

    Relies on two module-level mappings defined outside this block,
    ``dll_dict`` and ``api_dict``; only their keys are used here. The keys
    are compared against upper-cased import names, so they are presumably
    upper-case strings -- confirm where the dicts are built.

    Args:
        pe: Object that may expose ``DIRECTORY_ENTRY_IMPORT``, an iterable of
            entries with a bytes ``dll`` name and an ``imports`` iterable
            whose items have a bytes ``name`` (presumably a ``pefile.PE``
            instance -- confirm against the caller).

    Returns:
        A list of ``len(dll_dict) + len(api_dict) + 2`` values: one 0/1 flag
        per ``dll_dict`` key, one 0/1 flag per ``api_dict`` key, then the
        number of imported DLL names and the number of imported API names.
        All zeros (same length) when the object has no
        ``DIRECTORY_ENTRY_IMPORT`` attribute.
    """
    try:
        import_entries = pe.DIRECTORY_ENTRY_IMPORT
    except AttributeError:
        # The original returned a hard-coded 53 zeros here, which only
        # matches the normal path if the two dicts hold 51 keys in total.
        # Derive the length from the dicts so every file yields a feature
        # vector of the same size.
        return [0] * (len(dll_dict) + len(api_dict) + 2)

    dlls = []
    apis = []
    for entry in import_entries:
        # Import names are raw bytes and may not be valid UTF-8; decode
        # leniently so one odd name does not abort the whole extraction.
        if entry.dll:
            dlls.append(str(entry.dll.upper(), encoding="utf8", errors="replace"))
        for imported in entry.imports:
            if imported.name:
                apis.append(str(imported.name.upper(), encoding="utf8", errors="replace"))

    # Set membership replaces the nested linear scans of the original.
    present_dlls = set(dlls)
    present_apis = set(apis)

    result = [1 if key in present_dlls else 0 for key in dll_dict]
    result.extend(1 if key in present_apis else 0 for key in api_dict)
    result.append(len(dlls))
    result.append(len(apis))
    return result

결과

하나의 파일당 604개의 feature를 추출하였다.

총 1만 개의 악성코드에서 feature를 추출하는 데 1740초(29분)가 소요됐다.

반응형