ngram 프로젝트 코드

언어/python2018. 7. 5. 14:08

뷰어
댓글로
이전글
다음글

[+] 약 100만 개의 데이터를 33초 만에 처리 ...

import collections
import operator
import os
import pprint as ppr
import re
import time

import pefile
#_____________________________________________________________
# .text section
class NgramExtract:
    """Collect sample files from a directory and rank the byte 4-grams of
    each file's ``.text`` section by frequency.

    Typical use: call :meth:`malware_file_list_extract` to fill
    ``mal_file_list``, then :meth:`malware_ngram_extract` to print the
    ranked n-grams of every listed file.
    """

    # The section is scanned in rows of this many bytes; rows made only of
    # 0x00 or only of 0xff are treated as padding and dropped.
    _ROW_SIZE = 0x10
    # Number of bytes in one n-gram.
    _NGRAM_SIZE = 0x4
    # Raw (NUL-stripped) name of the section to analyse.
    _TEXT_SECTION = b'.text'

    def __init__(self):
        # Malware workspace: directory scanned by malware_file_list_extract().
        self.mal_path = 'C:\\Users\\kitri\\Desktop\\target_dir'
        self.mal_file_list = list()  # absolute paths of the malware samples

        # Benign-file workspace (not read by any method of this class).
        self.normal_path = ''
        self.normal_path_list = list()  # paths of the benign samples

    # Malware file listing ____________________________________
    def malware_file_list_extract(self):
        """Append every regular file found in ``self.mal_path`` to
        ``self.mal_file_list`` as an absolute path.

        Side effect: the process working directory is changed to
        ``self.mal_path`` (kept so existing callers see the same state).
        Sub-directories are skipped because they cannot be parsed as PE
        files. Entries are added in sorted order.

        Raises:
            OSError: if ``self.mal_path`` does not exist or is not accessible.
        """
        os.chdir(self.mal_path)
        work_dir = os.getcwd()
        # os.curdir is the constant '.', so report the resolved directory.
        print("[+] 현재 악성코드 작업 공간 : {}".format(work_dir))
        for name in sorted(os.listdir(work_dir)):
            full_path = os.path.join(work_dir, name)
            if os.path.isfile(full_path):
                self.mal_file_list.append(full_path)

    # Byte n-gram ranking _____________________________________
    @staticmethod
    def ngrams_from_bytes(data, row_size=_ROW_SIZE, ngram_size=_NGRAM_SIZE):
        """Return the distinct byte n-grams of ``data``, most frequent first.

        Args:
            data: raw bytes to analyse.
            row_size: ``data`` is cut into rows of this many bytes; a row
                consisting only of 0x00 or only of 0xff is discarded before
                n-grams are formed.
            ngram_size: number of bytes per n-gram.

        Returns:
            A list of tuples of ``hex()`` strings (e.g. ``('0x55', '0x8b',
            '0xec', '0x83')``), ordered by descending occurrence count;
            n-grams with equal counts are ordered by descending tuple value.
            N-grams made only of 0x00, 0xff or 0xcc are excluded.
        """
        padding_rows = (b'\x00' * row_size, b'\xff' * row_size)
        kept = bytearray()
        for offset in range(0, len(data), row_size):
            row = data[offset:offset + row_size]
            if row not in padding_rows:
                kept += row
        kept = bytes(kept)

        # Count every sliding window in one pass.
        counts = collections.Counter(
            kept[i:i + ngram_size]
            for i in range(len(kept) - ngram_size + 1)
        )
        # Windows of pure filler (zero, 0xff, int3 padding) carry no signal.
        for filler in (0x00, 0xff, 0xcc):
            counts.pop(bytes([filler]) * ngram_size, None)

        ranked = sorted(
            ((tuple(hex(b) for b in gram), hits)
             for gram, hits in counts.items()),
            key=lambda item: (item[1], item[0]),
            reverse=True,
        )
        return [gram for gram, _ in ranked]

    # Malware n-gram extraction _______________________________
    def malware_ngram_extract(self):
        """Print the ranked ``.text`` 4-grams of every file in
        ``self.mal_file_list``.

        For each file this prints the file path, the start offset and size of
        the ``.text`` section (both in 16-byte rows), the elapsed time in
        seconds and the ranked n-gram list. Files that are not valid PE
        images are reported and skipped; files without a ``.text`` section
        produce no n-gram output.
        """
        row = self._ROW_SIZE
        for path in self.mal_file_list:
            print("malware file: {}".format(path))
            try:
                pe_image = pefile.PE(path)
            except pefile.PEFormatError as err:
                print("[-] skipped (not a valid PE file): {} ({})".format(path, err))
                continue

            start_time = time.time()
            try:
                # Compare the raw name bytes: section names are NUL-padded and
                # need not be valid UTF-8, and an exact match avoids picking
                # up look-alikes such as '.textbss'.
                text_section = next(
                    (sec for sec in pe_image.sections
                     if sec.Name.rstrip(b'\x00') == self._TEXT_SECTION),
                    None,
                )
                if text_section is None:
                    continue
                start_row = text_section.PointerToRawData // row
                row_count = text_section.SizeOfRawData // row
            finally:
                # Release the file mapping held by pefile.
                pe_image.close()

            print(".text 시작: {0} .text 사이즈: {1}".format(start_row, row_count))
            with open(path, 'rb') as bin_file:
                # Jump straight to the section instead of reading up to it.
                bin_file.seek(start_row * row)
                raw = bin_file.read(row_count * row)

            n_gram_data = self.ngrams_from_bytes(raw)
            print(time.time() - start_time)
            print(n_gram_data)


def main():
    """Build the sample file list, then print the n-grams of each sample."""
    extractor = NgramExtract()
    extractor.malware_file_list_extract()
    extractor.malware_ngram_extract()
# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

저작자표시 비영리 변경금지 (새창열림)

'언어 > python' 카테고리의 다른 글

프로젝트 코드 (0)	2018.07.10
machine "svm" (0)	2018.07.09
text 섹션 ngram 추출 코드 작업 중 ... (0)	2018.07.04
selenium (0)	2018.07.03
크롤링 (0)	2018.07.03

일	월	화	수	목	금	토
		1	2	3	4	5
6	7	8	9	10	11	12
13	14	15	16	17	18	19
20	21	22	23	24	25	26
27	28	29	30	31

길

ngram 프로젝트 코드

'언어 > python' 카테고리의 다른 글

최근에 올라온 글

최근에 달린 댓글

공지사항

글 보관함

링크

티스토리툴바