[+] 약 100만 개의 데이터를 33초로 ...

import collections
import operator
import os
import pprint as ppr
import re
import time

import pefile
#_____________________________________________________________
# .text section
class NgramExtract:
    """Rank the byte 4-grams found in the ``.text`` section of PE files.

    Typical use: call :meth:`malware_file_list_extract` to collect the files
    under ``mal_path``, then :meth:`malware_ngram_extract` to print the
    n-grams ordered from most to least frequent.
    """

    # Default malware workspace (kept from the original hard-coded value).
    DEFAULT_MAL_PATH = 'C:\\Users\\kitri\\Desktop\\target_dir'
    # The section is scanned in rows of this many bytes.
    ROW_SIZE = 0x10
    # Length, in bytes, of each n-gram.
    NGRAM_SIZE = 4
    # A full row consisting only of one of these byte values is treated as
    # meaningless filler and dropped before n-grams are built.
    ROW_FILLERS = (0x00, 0xFF)
    # An n-gram consisting only of one of these byte values is not counted.
    NGRAM_FILLERS = (0x00, 0xFF, 0xCC)

    def __init__(self, mal_path=DEFAULT_MAL_PATH, normal_path=''):
        """Set up the working directories and the (initially empty) file lists.

        Args:
            mal_path: directory that holds the malware samples.
            normal_path: directory that holds benign samples (not used by the
                methods of this class yet).
        """
        # Malware workspace.
        self.mal_path = mal_path
        self.mal_file_list = list()     # absolute paths of malware files

        # Benign-file workspace.
        self.normal_path = normal_path
        self.normal_path_list = list()  # absolute paths of benign files

    # Malware file discovery ____________________________________________
    def malware_file_list_extract(self):
        """Append the absolute path of every regular file in ``mal_path``.

        Side effect: the process working directory is changed to
        ``mal_path`` (as the original code did). Sub-directories are skipped
        because they cannot be parsed as PE files, and names are visited in
        sorted order so the result is deterministic.
        """
        os.chdir(self.mal_path)
        # os.getcwd() reports the real directory; os.curdir is always ".".
        print("[+] 현재 악성코드 작업 공간 : {}".format(os.getcwd()))
        for name in sorted(os.listdir()):
            full_path = os.path.abspath(name)
            if os.path.isfile(full_path):
                self.mal_file_list.append(full_path)

    # Malware n-gram extraction _________________________________________
    def malware_ngram_extract(self):
        """Print the ``.text`` n-grams of a malware file, most frequent first.

        For the processed file this prints the file path, the start row and
        row count of each matching section (in units of ``ROW_SIZE`` bytes),
        the elapsed time in seconds, and finally the ranked list of n-grams
        (tuples of ``hex()`` strings).
        """
        for path in self.mal_file_list:
            print("malware file: {}".format(path))
            pe_data = pefile.PE(path)

            started = time.perf_counter()
            kept = bytearray()
            try:
                with open(path, 'rb') as bin_file:
                    for sec in pe_data.sections:
                        if not self._is_text_section(sec.Name):
                            continue
                        start_row = sec.PointerToRawData // self.ROW_SIZE
                        row_count = sec.SizeOfRawData // self.ROW_SIZE
                        print(".text 시작: {0} .text 사이즈: {1}".format(start_row, row_count))
                        kept += self._read_rows(bin_file, start_row, row_count)
            finally:
                # Release the handle/mapping pefile keeps on the sample.
                pe_data.close()

            n_gram_data = self._rank_ngrams(bytes(kept))
            print(time.perf_counter() - started)
            print(n_gram_data)
            # NOTE(review): the original code breaks here, so only the first
            # listed file is analysed — presumably a work-in-progress limit;
            # confirm before removing.
            break

    # Helpers ____________________________________________________________
    @staticmethod
    def _is_text_section(name):
        """Return True when the raw (bytes) section name contains ``.text``."""
        # Compared as bytes: section names are not guaranteed to be UTF-8.
        return b".text" in name

    @classmethod
    def _read_rows(cls, bin_file, start_row, row_count):
        """Read ``row_count`` rows starting at ``start_row`` and drop filler rows.

        Args:
            bin_file: seekable binary file object.
            start_row: index of the first row, in units of ``ROW_SIZE`` bytes
                from the start of the file.
            row_count: number of rows to read.

        Returns:
            The concatenated bytes of every row that is not made entirely of
            one of ``ROW_FILLERS``.
        """
        size = cls.ROW_SIZE
        # Absolute seek, so the result does not depend on earlier reads.
        bin_file.seek(start_row * size)
        raw = bin_file.read(row_count * size)

        filler_rows = {bytes([value]) * size for value in cls.ROW_FILLERS}
        kept = bytearray()
        for pos in range(0, len(raw), size):
            row = raw[pos:pos + size]
            if row not in filler_rows:
                kept += row
        return bytes(kept)

    @classmethod
    def _rank_ngrams(cls, data):
        """Return the distinct n-grams of ``data`` ordered by frequency.

        Args:
            data: bytes to slide the ``NGRAM_SIZE``-byte window over.

        Returns:
            A list of tuples of ``hex()`` strings (e.g. ``('0x55', '0x8b',
            '0xec', '0x83')``), most frequent first; n-grams with the same
            count are ordered by descending tuple. N-grams made entirely of
            one of ``NGRAM_FILLERS`` are excluded.
        """
        n = cls.NGRAM_SIZE
        filler_grams = {(value,) * n for value in cls.NGRAM_FILLERS}

        # zip over n shifted views yields every overlapping window exactly
        # once and stops at the last complete one.
        windows = zip(*(data[shift:] for shift in range(n)))
        counts = collections.Counter(
            gram for gram in windows if gram not in filler_grams
        )

        hex_of = [hex(value) for value in range(256)]
        ranked = sorted(
            (
                (count, tuple(hex_of[value] for value in gram))
                for gram, count in counts.items()
            ),
            reverse=True,
        )
        return [gram for _count, gram in ranked]


def main():
    """Create an extractor, collect the malware file paths, then run the n-gram pass."""
    extractor = NgramExtract()
    extractor.malware_file_list_extract()
    extractor.malware_ngram_extract()
# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


'언어 > python' 카테고리의 다른 글

프로젝트 코드  (0) 2018.07.10
machine "svm"  (0) 2018.07.09
text 섹션 ngram 추출 코드 작업 중 ...  (0) 2018.07.04
selenium  (0) 2018.07.03
크롤링  (0) 2018.07.03