In [1]:
# Path to the training data (a .jsonl file that later cells parse line by line).
# Set the MALWARE_TRAIN_PATH environment variable to point at another copy;
# the default is the location originally hard-coded in this notebook.
import os
train_path = os.environ.get("MALWARE_TRAIN_PATH", "/mnt/c/Users/yuki.mogi/Downloads/train.jsonl")

In [2]:
import dask.bag as db
import json

In [64]:
# 注意
# train.jsonlには、マルウェアとクリーンウェアが同数含まれる
# よって、ある性質Xを持つ検体の、マルウェアにおける割合とクリーンウェアにおける割合の比は、
# 性質Xを持つマルウェアの数と性質Xを持つクリーンウェアの数の比で計算できる

In [65]:
# Problem 1.1: x (not 3x or more)
def _mean_file_size(label):
    """Return the mean "file_size" over the train.jsonl records with the given label."""
    records = db.read_text(train_path).map(json.loads)
    same_label = records.filter(lambda rec: rec["label"] == label)
    return same_label.pluck("file_size").mean().compute()

mal = _mean_file_size(1)
cln = _mean_file_size(0)
ratio = cln / mal
print(f"比率: {ratio}")

比率: 2.7625019879296944


In [66]:
# Problem 1.2: x (not 9x or more)
def _count_bss_larger_than_file(label):
    """Count records with the given label whose sizeof_uninitialized_data exceeds file_size."""
    records = db.read_text(train_path).map(json.loads)
    same_label = records.filter(lambda rec: rec["label"] == label)
    oversized = same_label.filter(
        lambda rec: rec["lief"]["optional_header"]["sizeof_uninitialized_data"] > rec["file_size"]
    )
    return oversized.count().compute()

mal = _count_bss_larger_than_file(1)
cln = _count_bss_larger_than_file(0)
ratio = mal / cln
print(f"比率: {ratio}")

比率: 8.95


In [67]:
# Problem 1.3: x (not 3x or more)
def _count_with_debug_info(label):
    """Count records with the given label that have a symbol-table pointer or a "debug" entry."""
    def has_debug_info(rec):
        lief = rec["lief"]
        return lief["header"]["pointerto_symbol_table"] != 0 or "debug" in lief

    records = db.read_text(train_path).map(json.loads)
    return records.filter(lambda rec: rec["label"] == label).filter(has_debug_info).count().compute()

mal = _count_with_debug_info(1)
cln = _count_with_debug_info(0)
print(f"比率: {cln / mal}")

比率: 2.7158836689038033


In [68]:
# Problem 2: IMAGE_FILE_RELOCS_STRIPPED
IMAGE_FILE_RELOCS_STRIPPED = 1
IMAGE_FILE_LINE_NUMS_STRIPPED = 4
IMAGE_FILE_LOCAL_SYMS_STRIPPED = 8


def _with_characteristic(mask):
    """Return the (lazy) bag of train.jsonl records whose header characteristics have `mask` set."""
    records = db.read_text(train_path).map(json.loads)
    return records.filter(lambda rec: rec["lief"]["header"]["characteristics"] & mask)


def _malware_share(bag):
    """Return the fraction of records in `bag` whose label is 1."""
    malware_count = bag.filter(lambda rec: rec["label"] == 1).count().compute()
    return malware_count / bag.count().compute()


relocs_stripped = _with_characteristic(IMAGE_FILE_RELOCS_STRIPPED)
line_nums_stripped = _with_characteristic(IMAGE_FILE_LINE_NUMS_STRIPPED)
local_syms_stripped = _with_characteristic(IMAGE_FILE_LOCAL_SYMS_STRIPPED)
print(f'IMAGE_FILE_RELOCS_STRIPPED: {_malware_share(relocs_stripped)}')
print(f'IMAGE_FILE_LINE_NUMS_STRIPPED: {_malware_share(line_nums_stripped)}')
print(f'IMAGE_FILE_LOCAL_SYMS_STRIPPED: {_malware_share(local_syms_stripped)}')

IMAGE_FILE_RELOCS_STRIPPED: 0.809820193637621
IMAGE_FILE_LINE_NUMS_STRIPPED: 0.7354392892398816
IMAGE_FILE_LOCAL_SYMS_STRIPPED: 0.7485317672183662


In [69]:
# 問題3.1 x (7倍以上ではない)
def has_high_entropy_section(sections, threshold=7):
    """Tell whether any section's entropy is at or above `threshold`.

    Args:
        sections: Iterable of mappings, each with an "entropy" entry.
        threshold: Inclusive lower bound for "high" entropy. Defaults to 7,
            the value this notebook uses everywhere.

    Returns:
        True if at least one section has entropy >= threshold, else False
        (including when `sections` is empty).
    """
    return any(section["entropy"] >= threshold for section in sections)
def _count_high_entropy(label):
    """Count records with the given label for which has_high_entropy_section is true."""
    records = db.read_text(train_path).map(json.loads)
    same_label = records.filter(lambda rec: rec["label"] == label)
    flagged = same_label.filter(lambda rec: has_high_entropy_section(rec["lief"]["sections"]))
    return flagged.count().compute()

mal = _count_high_entropy(1)
cln = _count_high_entropy(0)
print(f"比率: {mal / cln}")

比率: 6.256983240223463


In [70]:
# Problem 3.2.1: x (Obsidium is not found among the malware)
_high_entropy_malware = (
    db.read_text(train_path)
    .map(json.loads)
    .filter(lambda rec: rec["label"] == 1)
    .filter(lambda rec: has_high_entropy_section(rec["lief"]["sections"]))
)
# The PEiD names are joined into one string before the substring test.
_obsidium_hits = _high_entropy_malware.filter(lambda rec: "Obsidium" in ", ".join(rec["peid"]["PEiD"]))
num = _obsidium_hits.count().compute()
print(f"性質S2を持つマルウェアで、Obsidium が検出されている数: {num}")

性質S2を持つマルウェアで、Obsidium が検出されている数: 0


In [71]:
# Problem 3.2.2: o
# Malware records (label == 1) that have a high-entropy section ("property S2").
mal = db.read_text(train_path).map(json.loads).filter(lambda x: x["label"] == 1).filter(lambda x: has_high_entropy_section(x["lief"]["sections"]))
mal_num_with_s2 = mal.count().compute()
print(f"性質S2を持つマルウェアの数: {mal_num_with_s2}")
# Occurrence count of each impfuzzy hash among those records.
impfuzzy = dict(mal.pluck("hashes").pluck("impfuzzy").frequencies())
# "More than 10%" is derived from the count just computed rather than a
# hand-computed constant, so it stays right if the data changes.
# Integer comparison avoids floating-point division.
list_ = [k for k, count in impfuzzy.items() if count * 10 > mal_num_with_s2]
print(f"1割を超えるimpfuzzy: {list_}")
target_impfuzzy = list_[0] # only one impfuzzy value qualifies
# Number of cleanware records (label == 0) sharing that impfuzzy hash.
num = db.read_text(train_path).map(json.loads).filter(lambda x: x["label"] == 0).filter(lambda x: x["hashes"]["impfuzzy"] == target_impfuzzy).count().compute()
print(f"impfuzzy が {target_impfuzzy} であるクリーンウェアの数: {num}")

性質S2を持つマルウェアの数: 1120
1割を超えるimpfuzzy: ['48:YqtQg1E//Cy5OCECLvRjv04Sey0Qu38twt4xXp95GTXnBKl/i/1sjFRnvD:Dtt1E/6y57WX4DAFdD']
impfuzzy が 48:YqtQg1E//Cy5OCECLvRjv04Sey0Qu38twt4xXp95GTXnBKl/i/1sjFRnvD:Dtt1E/6y57WX4DAFdD であるクリーンウェアの数: 0


In [72]:
# 問題3.2.3 x (パッカーが検出されていない検体も存在する)
def has_low_entropy_section(sections, threshold=1):
    """Tell whether any section's entropy is at or below `threshold`.

    Args:
        sections: Iterable of mappings, each with an "entropy" entry.
        threshold: Inclusive upper bound for "low" entropy. Defaults to 1,
            the value this notebook uses everywhere.

    Returns:
        True if at least one section has entropy <= threshold, else False
        (including when `sections` is empty).
    """
    return any(section["entropy"] <= threshold for section in sections)
print("{PEiDの出力: その出力の、性質S2とS3を持つクリーンウェアの数}")
# Cleanware (label == 0) with both a high-entropy and a low-entropy section.
_clean_s2_s3 = (
    db.read_text(train_path)
    .map(json.loads)
    .filter(lambda rec: rec["label"] == 0)
    .filter(lambda rec: has_high_entropy_section(rec["lief"]["sections"]))
    .filter(lambda rec: has_low_entropy_section(rec["lief"]["sections"]))
)
# Last expression of the cell: frequency of each joined PEiD output.
dict(_clean_s2_s3.map(lambda rec: ", ".join(rec["peid"]["PEiD"])).frequencies())

{PEiDの出力: その出力の、性質S2とS3を持つクリーンウェアの数}


{'VC8_Microsoft_Corporation, Microsoft_Visual_Cpp_8, Armadillo_V6X_Minimum_Protection_Silicon_Realms_Toolworks_20081227': 1,
 '': 39,
 'Microsoft_Visual_Cpp_v4x, Microsoft_Visual_Cpp_30_old_crap, Microsoft_Visual_C_20_additional, Microsoft_Visual_Cpp_30_old_crap_additional, Microsoft_Visual_Cpp_v42_additional, Microsoft_Visual_Cpp_v4x_additional, Microsoft_Visual_Cpp_42, Microsoft_Visual_C_20, Microsoft_Visual_Cpp_42_additional': 3,
 'Microsoft_Visual_Basic_v50': 2,
 'UPX_v0896_v102_v105_v122_Delphi_stub_additional, UPX_v0896_v102_v105_v122_Delphi_stub_Laszlo_Markus, PackerUPX_CompresorGratuito_wwwupxsourceforgenet, UPX_wwwupxsourceforgenet_additional, MSLRH_V031_emadicius, yodas_Protector_v1033_dllocx_Ashkbiz_Danehkar_h, UPX_v0896_v102_v105_v122_Delphi_stub, UPX_wwwupxsourceforgenet': 1,
 'PackerUPX_CompresorGratuito_wwwupxsourceforgenet, UPX_wwwupxsourceforgenet_additional, yodas_Protector_v1033_dllocx_Ashkbiz_Danehkar_h, Netopsystems_FEAD_Optimizer_1, UPX_290_LZMA, UPX_290_LZMA_Mark

In [17]:
import pandas as pd

In [18]:
# 問題4 (分類問題)
# ライブラリを用いた特徴量作成
from ffeature_extractor import LiefFeatureExtractor
lfe = LiefFeatureExtractor()
def load(path):
    """Build the feature table for every record of a JSON Lines file.

    Each line is parsed with ``json.loads`` and its "lief" entry is passed to
    the module-level ``lfe.get_features``, which returns the column names and
    the feature vector for that record.

    Args:
        path: Path of the JSON Lines file to read.

    Returns:
        A tuple ``(df, labels, ids)``:
        df is a DataFrame with one row per record, whose columns are the
        names returned by the last ``get_features`` call. It is an empty
        DataFrame when the file has no lines.
        labels holds the "label" value of every record that has one, so it is
        empty for unlabeled data.
        ids holds the "id" value of every record.
    """
    feature_vectors = []
    labels = []
    ids = []
    # Stays None for an empty file. Previously `columns` was unbound in that
    # case and the assignment below raised UnboundLocalError.
    columns = None
    with open(path, "rb") as f:
        for line in f:
            raw_obj = json.loads(line)
            columns, feature_vector = lfe.get_features(raw_obj["lief"])
            feature_vectors.append(feature_vector)
            ids.append(raw_obj["id"])
            if "label" in raw_obj:
                labels.append(raw_obj["label"])
    df = pd.DataFrame(feature_vectors)
    if columns is not None:
        df.columns = columns
    return df, labels, ids

In [50]:
# Load the train data: the feature table and the labels (the ids are discarded).
train_features, labels, _ = load(train_path)
# Last expression of the cell: show the first rows of the feature table.
train_features.head()

Unnamed: 0,entrypoint,virtual_size,dos_header_addressof_new_exeheader,dos_header_addressof_relocation_table,dos_header_checksum,dos_header_file_size_in_pages,dos_header_header_size_in_paragraphs,dos_header_initial_ip,dos_header_initial_relative_cs,dos_header_initial_relative_ss,...,load_configuration_hybrid_metadata_pointer,load_configuration_guard_rf_failure_routine,load_configuration_guard_rf_failure_routine_function_pointer,load_configuration_dynamic_value_reloctable_offset,load_configuration_dynamic_value_reloctable_section,load_configuration_reserved2,load_configuration_guard_rf_verify_stackpointer_function_pointer,load_configuration_hotpatch_table_offset,load_configuration_reserved3,load_configuration_addressof_unicode_string
0,4669441.0,491520.0,256.0,64.0,0.0,2.0,4.0,0.0,0.0,0.0,...,,,,,,,,,,
1,4200236.0,45056.0,184.0,64.0,0.0,3.0,4.0,0.0,0.0,0.0,...,,,,,,,,,,
2,4204370.0,53248.0,232.0,64.0,0.0,3.0,4.0,0.0,0.0,0.0,...,,,,,,,,,,
3,4232017.0,495616.0,264.0,64.0,0.0,3.0,4.0,0.0,0.0,0.0,...,,,,,,,,,,
4,4575104.0,487424.0,256.0,64.0,0.0,2.0,4.0,0.0,0.0,0.0,...,,,,,,,,,,


In [20]:
# 分類器にはLightGBMを使用
import numpy as np
from sklearn.model_selection import cross_val_score
import lightgbm as lgb

In [21]:
# Baseline classifier: LGBMClassifier constructed with no arguments (library defaults).
clf1 = lgb.LGBMClassifier()

In [23]:
# Feature matrix as a plain array, and the labels returned by load().
X, y = train_features.values, labels

In [24]:
# Evaluate the F1 score with 5-fold cross-validation
print(cross_val_score(clf1, X, y, cv=5, scoring='f1'))

[0.97222222 0.965      0.96871089 0.97256858 0.97156984]


In [27]:
# Path to the test data (a .jsonl file parsed line by line by load()).
# Set the MALWARE_TEST_PATH environment variable to point at another copy;
# the default is the location originally hard-coded in this notebook.
import os
test_path = os.environ.get("MALWARE_TEST_PATH", "/mnt/c/Users/yuki.mogi/Downloads/test.jsonl")

In [28]:
# Build the same feature table for the test data; the labels slot is discarded, the ids are kept.
test_features, _, test_ids = load(test_path)

In [29]:
# Fit the baseline on all training rows, then predict labels for the test features.
clf1.fit(X, y)
pred1 = clf1.predict(test_features.values)

In [30]:
# Test Score: LB 0.7, PB 0.7560975609756098
# Write the baseline predictions to answer1.csv with "id" and "label" columns (no index column).
your_first_answer = pd.DataFrame({"id": test_ids, "label": pred1})
your_first_answer.to_csv("answer1.csv", index=False)

In [31]:
# 次に、問題で問われた特徴量を使用してみる

In [45]:
def internal_extract_field(dat):
    """Pick the raw fields the hand-made features need from one parsed record.

    Args:
        dat: One record of the JSON Lines file, already parsed into a dict.

    Returns:
        A flat dict with the optional-header and header values, the two
        debug-presence booleans, the PEiD names joined with ", ", the impfuzzy
        hash, and the two section-entropy booleans.
    """
    lief = dat["lief"]
    header = lief["header"]
    sections = lief["sections"]

    fields = {}
    fields["sizeof_uninitialized_data"] = lief["optional_header"]["sizeof_uninitialized_data"]
    fields["file_size"] = dat["file_size"]
    fields["characteristics"] = header["characteristics"]
    fields["has_debug"] = "debug" in lief
    fields["has_coff_debug"] = header["pointerto_symbol_table"] != 0
    fields["PEiD"] = ", ".join(dat["peid"]["PEiD"])
    fields["impfuzzy"] = dat["hashes"]["impfuzzy"]
    fields["entropy_larger_than_7"] = has_high_entropy_section(sections)
    fields["entropy_smaller_than_1"] = has_low_entropy_section(sections)
    return fields

def extract_field_for_train(line):
    """Parse one JSON line and return its label followed by the extracted fields.

    Args:
        line: One line of the train JSON Lines file.

    Returns:
        A dict whose first key is "label", followed by everything
        internal_extract_field returns for the record.
    """
    record = json.loads(line)
    return {"label": record["label"], **internal_extract_field(record)}

def extract_field_for_test(line):
    """Parse one JSON line and return the extracted fields (no label is read).

    Args:
        line: One line of the test JSON Lines file.

    Returns:
        The dict produced by internal_extract_field for the parsed record.
    """
    return internal_extract_field(json.loads(line))

# Reads the needed data out of a JSONL file
def read_nested_json(path, extract_field):
    """Apply `extract_field` to every line of a file and tabulate the results.

    Args:
        path: Path of the file to read; it is opened in binary mode.
        extract_field: Callable that takes one raw line and returns a dict.

    Returns:
        The DataFrame produced by pd.json_normalize over the per-line dicts,
        in file order.
    """
    with open(path, "rb") as handle:
        records = [extract_field(raw_line) for raw_line in handle]
    return pd.json_normalize(records)

In [46]:
# Adds the features asked about in the problems (for the train data)
def make_additional_features_for_train(path):
    """Build the hand-made 0/1 feature columns for the train file.

    Args:
        path: Path of the train JSON Lines file.

    Returns:
        A DataFrame with the raw numeric fields plus the derived indicator
        columns. The "label", "PEiD", "impfuzzy", "has_debug" and
        "has_coff_debug" columns are dropped.
    """
    # Bits tested in the header "characteristics" value.
    characteristic_bits = {
        "relocs_stripped": 1,      # IMAGE_FILE_RELOCS_STRIPPED
        "line_nums_stripped": 4,   # IMAGE_FILE_LINE_NUMS_STRIPPED
        "local_syms_stripped": 8,  # IMAGE_FILE_LOCAL_SYMS_STRIPPED
    }
    # Output column -> substring looked for in the joined PEiD string.
    # "inclues_Obsidium" (sic) is kept as-is because it is a runtime column name.
    packer_markers = {
        "includes_upx": "UPX",
        "includes_ASPack": "ASPack",
        "inclues_Obsidium": "Obsidium",
    }

    frame = read_nested_json(path, extract_field_for_train)
    # Multiplying a boolean column by 1 turns it into a 0/1 column.
    frame["bss_larger_than_file_size"] = (frame["sizeof_uninitialized_data"] > frame["file_size"]) * 1
    frame["has_debug_all"] = (frame["has_debug"] | frame["has_coff_debug"]) * 1
    for column in ("entropy_larger_than_7", "entropy_smaller_than_1"):
        frame[column] = frame[column] * 1
    for column, bit in characteristic_bits.items():
        frame[column] = ((frame["characteristics"] & bit) != 0) * 1
    for column, marker in packer_markers.items():
        # Bind `marker` as a default so each lambda keeps its own value.
        frame[column] = frame["PEiD"].map(lambda names, marker=marker: marker in names if names else False) * 1
    return frame.drop(["label", "PEiD", "impfuzzy", "has_debug", "has_coff_debug"], axis=1)

In [47]:
# Adds the features asked about in the problems (for the test data)
def make_additional_features_for_test(path):
    """Build the hand-made 0/1 feature columns for the test file.

    Args:
        path: Path of the test JSON Lines file.

    Returns:
        A DataFrame with the raw numeric fields plus the derived indicator
        columns. The "PEiD", "impfuzzy", "has_debug" and "has_coff_debug"
        columns are dropped. No "label" column is read.
    """
    # (output column, bit tested in the header "characteristics" value)
    characteristic_bits = (
        ("relocs_stripped", 1),      # IMAGE_FILE_RELOCS_STRIPPED
        ("line_nums_stripped", 4),   # IMAGE_FILE_LINE_NUMS_STRIPPED
        ("local_syms_stripped", 8),  # IMAGE_FILE_LOCAL_SYMS_STRIPPED
    )
    # (output column, substring looked for in the joined PEiD string)
    # "inclues_Obsidium" (sic) is kept as-is because it is a runtime column name.
    packer_markers = (
        ("includes_upx", "UPX"),
        ("includes_ASPack", "ASPack"),
        ("inclues_Obsidium", "Obsidium"),
    )

    table = read_nested_json(path, extract_field_for_test)
    # Multiplying a boolean column by 1 turns it into a 0/1 column.
    table["bss_larger_than_file_size"] = (table["sizeof_uninitialized_data"] > table["file_size"]) * 1
    table["has_debug_all"] = (table["has_debug"] | table["has_coff_debug"]) * 1
    for name in ("entropy_larger_than_7", "entropy_smaller_than_1"):
        table[name] = table[name] * 1
    for name, bit in characteristic_bits:
        table[name] = ((table["characteristics"] & bit) != 0) * 1
    for name, marker in packer_markers:
        # Bind `marker` as a default so each lambda keeps its own value.
        table[name] = table["PEiD"].map(lambda names, marker=marker: marker in names if names else False) * 1
    return table.drop(["PEiD", "impfuzzy", "has_debug", "has_coff_debug"], axis=1)

In [48]:
# Hand-made features for the train file (re-reads train_path).
additional_train_features = make_additional_features_for_train(train_path)

In [51]:
# Put the library features and the hand-made features side by side (column-wise concat).
# Both tables were built from train_path.
train_features_full = pd.concat([train_features, additional_train_features], axis=1)

In [52]:
# Second classifier, again LGBMClassifier with no arguments, for the extended feature set.
clf2 = lgb.LGBMClassifier()

In [53]:
# Rebind X to the extended feature matrix; y is the same label list as before.
X, y = train_features_full.values, labels

In [54]:
# slightly improved
# Same 5-fold F1 evaluation as for clf1, now on the extended features.
print(cross_val_score(clf2, X, y, cv=5, scoring='f1'))

[0.97481108 0.96612296 0.97128589 0.97755611 0.97277228]


In [57]:
# Hand-made features for the test file (re-reads test_path).
additional_test_features = make_additional_features_for_test(test_path)

In [58]:
# Column-wise concat of the library features and the hand-made features for the test data.
test_features_full = pd.concat([test_features, additional_test_features], axis=1)

In [59]:
# Fit on all training rows with the extended features, then predict test labels.
clf2.fit(X, y)
pred2 = clf2.predict(test_features_full.values)

In [60]:
# Test Score: LB 0.7259259259259259, PB 0.7763713080168776
# Write the clf2 predictions to answer2.csv with "id" and "label" columns (no index column).
# NOTE(review): the name your_first_answer is reused here for the second submission.
your_first_answer = pd.DataFrame({"id": test_ids, "label": pred2})
your_first_answer.to_csv("answer2.csv", index=False)

In [61]:
# 出力されたCSVを見ると、どうもテストデータでは、クリーンウェアの方がかなり多いようだ
# そこで、thresholdを高くして、FPを小さくすることを優先してみる

In [62]:
# Column 1 of predict_proba; label 1 is assigned only where it is not below 0.99.
_label1_proba = clf2.predict_proba(test_features_full.values)[:, 1]
pred3 = np.where(_label1_proba < 0.99, 0, 1)

In [63]:
# Test Score: LB 0.8910891089108911, PB 0.8820512820512821. Greatly improved!
# Write the thresholded predictions to answer3.csv with "id" and "label" columns (no index column).
# NOTE(review): the name your_first_answer is reused here for the third submission.
your_first_answer = pd.DataFrame({"id": test_ids, "label": pred3})
your_first_answer.to_csv("answer3.csv", index=False)