프로젝트/RL_Malware
2. 악성코드 PE 추출 with pickle, pefile
보안&인공지능
2021. 2. 15. 06:38
1. Dos Header 추출
def Dos_Header(pe):
    """Collect the numeric DOS header fields of a parsed PE file.

    Args:
        pe: Parsed PE object exposing a ``DOS_HEADER`` attribute whose
            fields are readable as attributes (e.g. a ``pefile.PE``).

    Returns:
        list: The 15 DOS header values, in the order of ``field_names``.
    """
    field_names = (
        "e_cblp",
        "e_cp",
        "e_crlc",
        "e_cparhdr",
        "e_minalloc",
        "e_maxalloc",
        "e_ss",
        "e_sp",
        "e_csum",
        "e_ip",
        "e_cs",
        "e_lfarlc",
        "e_oemid",
        "e_oeminfo",
        "e_lfanew",
    )
    header = pe.DOS_HEADER
    # Return the collected values explicitly; without a return statement
    # the caller would receive None instead of the feature list.
    return [getattr(header, name) for name in field_names]
2. File Header 추출
def File_Header(pe):
    """Collect COFF file header fields plus a section-count consistency flag.

    Args:
        pe: Parsed PE object exposing ``FILE_HEADER`` and ``sections``
            (e.g. a ``pefile.PE``).

    Returns:
        list: Six header values (Machine, NumberOfSections,
        PointerToSymbolTable, NumberOfSymbols, SizeOfOptionalHeader,
        Characteristics) followed by 1 if the declared NumberOfSections
        equals ``len(pe.sections)``, otherwise 0.
    """
    header = pe.FILE_HEADER
    features = [
        header.Machine,
        header.NumberOfSections,
        header.PointerToSymbolTable,
        header.NumberOfSymbols,
        header.SizeOfOptionalHeader,
        header.Characteristics,
    ]
    # Flag whether the header's declared section count agrees with the
    # number of section objects actually present on the PE object.
    features.append(1 if header.NumberOfSections == len(pe.sections) else 0)
    return features
3. Optional Header 추출
def Optional_Header(pe):
    """Collect the scalar optional-header fields of a parsed PE file.

    Args:
        pe: Parsed PE object exposing an ``OPTIONAL_HEADER`` attribute whose
            fields are readable as attributes (e.g. a ``pefile.PE``).

    Returns:
        list: The 29 optional-header values, in the order of ``field_names``.
    """
    field_names = (
        "Magic",
        "MajorLinkerVersion",
        "MinorLinkerVersion",
        "SizeOfCode",
        "SizeOfInitializedData",
        "SizeOfUninitializedData",
        "AddressOfEntryPoint",
        "BaseOfCode",
        "ImageBase",
        "SectionAlignment",
        "FileAlignment",
        "MajorOperatingSystemVersion",
        "MinorOperatingSystemVersion",
        "MajorImageVersion",
        "MinorImageVersion",
        "MajorSubsystemVersion",
        "MinorSubsystemVersion",
        "Reserved1",
        "SizeOfImage",
        "SizeOfHeaders",
        "CheckSum",
        "Subsystem",
        "DllCharacteristics",
        "SizeOfStackReserve",
        "SizeOfStackCommit",
        "SizeOfHeapReserve",
        "SizeOfHeapCommit",
        "LoaderFlags",
        "NumberOfRvaAndSizes",
    )
    header = pe.OPTIONAL_HEADER
    return [getattr(header, name) for name in field_names]
4. Data Directory 추출
def Data_Directory(pe):
    """Collect (VirtualAddress, Size) pairs of the PE data directories.

    The result always has exactly 32 values (16 slots x 2): missing slots
    are zero-filled, and entries beyond the 16th are ignored so that the
    feature vector keeps a fixed length.

    Args:
        pe: Parsed PE object exposing ``OPTIONAL_HEADER.DATA_DIRECTORY`` as
            an iterable of entries with ``VirtualAddress`` and ``Size``.

    Returns:
        list: 32 integers, ``[va0, size0, va1, size1, ...]``.
    """
    slot_count = 16
    expected_len = 2 * slot_count
    features = []
    for entry in pe.OPTIONAL_HEADER.DATA_DIRECTORY:
        # Stop once all slots are filled; extra entries would otherwise
        # make the vector longer than the fixed layout.
        if len(features) >= expected_len:
            break
        features.extend((entry.VirtualAddress, entry.Size))
    # Zero-pad the slots that the file does not provide.
    features.extend([0] * (expected_len - len(features)))
    return features
5. Section 영역 추출
def Sections(pe):
    """Collect per-section header fields for five well-known section names.

    For each of ``.text``, ``.data``, ``.rsrc``, ``.rdata`` and ``.reloc``
    (in that order) nine header values are emitted; a section that is not
    present contributes nine zeros. If the same name occurs more than once,
    the last occurrence wins. The final value is the number of sections
    whose name is not one of the five.

    Args:
        pe: Parsed PE object exposing ``sections``; each section provides a
            bytes ``Name`` and the header fields read below.

    Returns:
        list: 46 integers (5 sections x 9 fields + other-section count).
    """
    known_names = ('.text', '.data', '.rsrc', '.rdata', '.reloc')
    fields_per_section = 9
    found = {}
    other_section_count = 0

    for section in pe.sections:
        # Section names are raw bytes and may not be valid UTF-8; decode
        # leniently so such a section is counted as "other" instead of
        # aborting the whole extraction with UnicodeDecodeError.
        name = str(section.Name, encoding="utf8", errors="replace").strip('\x00')
        if name not in known_names:
            other_section_count += 1
            continue
        found[name] = [
            section.Misc_VirtualSize,
            section.VirtualAddress,
            section.SizeOfRawData,
            section.PointerToRawData,
            section.PointerToRelocations,
            section.PointerToLinenumbers,
            section.NumberOfRelocations,
            section.NumberOfLinenumbers,
            section.Characteristics,
        ]

    features = []
    for name in known_names:
        features.extend(found.get(name, [0] * fields_per_section))
    features.append(other_section_count)
    return features
6. Resources 영역 추출
def Resources(pe):
    """Count resource entries overall and per selected resource type id.

    Args:
        pe: Parsed PE object. When it has no ``DIRECTORY_ENTRY_RESOURCE``
            attribute, an all-zero vector is returned.

    Returns:
        list: 22 integers. The first is the number of named plus id entries
        in the root resource directory; the remaining 21 are, for each id in
        ``types``, the entry count of the first root entry with that id
        (0 when no root entry has that id).
    """
    types = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 17, 19, 20, 21, 22, 23, 24]
    try:
        root = pe.DIRECTORY_ENTRY_RESOURCE
    except AttributeError:
        # No resource directory on this PE object: total count plus one
        # zero per tracked type.
        return [0] * (len(types) + 1)

    features = [root.struct.NumberOfNamedEntries + root.struct.NumberOfIdEntries]

    # Single pass over the root entries; only the first entry for each
    # tracked id is used, and untracked entries are never inspected further.
    wanted = set(types)
    counts = {}
    for entry in root.entries:
        if entry.id in wanted and entry.id not in counts:
            info = entry.directory.struct
            counts[entry.id] = info.NumberOfNamedEntries + info.NumberOfIdEntries

    features.extend(counts.get(type_id, 0) for type_id in types)
    return features
7. DLL, API 영역 추출
def Imported_DLL_and_API(pe):
    """Build presence flags for tracked DLLs/APIs plus import counts.

    Relies on the module-level ``dll_dict`` and ``api_dict`` (defined
    outside this function); their keys are compared against the upper-cased
    imported DLL and API names.

    Args:
        pe: Parsed PE object. When it has no ``DIRECTORY_ENTRY_IMPORT``
            attribute, an all-zero vector is returned.

    Returns:
        list: One 0/1 flag per key of ``dll_dict``, then one 0/1 flag per
        key of ``api_dict``, then the number of imported DLL names and the
        number of imported API names. The length is always
        ``len(dll_dict) + len(api_dict) + 2``.
    """
    try:
        import_table = pe.DIRECTORY_ENTRY_IMPORT
    except AttributeError:
        # Keep the fallback the same length as the regular result so every
        # file yields a feature vector of identical size.
        return [0] * (len(dll_dict) + len(api_dict) + 2)

    dlls = []
    apis = []
    for entry in import_table:
        if entry.dll:
            # Names are raw bytes; decode leniently so malformed import
            # names do not abort the extraction.
            dlls.append(str(entry.dll.upper(), encoding="utf8", errors="replace"))
        for imported in entry.imports:
            if imported.name:
                apis.append(str(imported.name.upper(), encoding="utf8", errors="replace"))

    # Sets give O(1) membership tests for the flag vectors below.
    dll_names = set(dlls)
    api_names = set(apis)

    result = [1 if key in dll_names else 0 for key in dll_dict]
    result.extend(1 if key in api_names else 0 for key in api_dict)
    # Counts include duplicates, i.e. every imported name that was seen.
    result.append(len(dlls))
    result.append(len(apis))
    return result
결과
하나의 파일당 604개의 feature를 추출하였다.
총 1만 개 악성코드의 feature를 추출하는 데 1740초(29분)가 소요됐다.
반응형