三日目　N-gramを作る。 - 機械学習メモ

前回まで

dango0dan5.hatenablog.com

N-gram

ようやく、書籍を手にすることができた。iQSPRの基本的な概念が日本語で記載されているので、コードの理解にも役に立つ。Xenonpyはいろいろと便利なクラスが用意されているし、ドキュメントもコード内にも細かく書かれている。新しいことをしようとすると、ちゃんと調べてやらないと、かなり難しい。

www.kyoritsu-pub.co.jp

次に、N-gramを作成する。

まずN-gramの概念については、下記を参照のこと。

mojitoba.com

qiita.com

SMILES文字列の場合は、単語ではなく、文字で区切りを設けている。
推奨文字数として5文字以上が経験的に推奨されている。最大で10文字が許されるが、以下のようにすると、最大が5文字として設定される。

# N-gram library in XenonPy-iQSPR
from xenonpy.inverse.iqspr import NGram

# initialize a new n-gram
n_gram = NGram()

# train the n-gram with SMILES of available molecules
n_gram.fit(data['SMILES'],train_order=5)

NGram(ngram_table=[[[ C ( O [N+] ! & N Cl =C =O ... \
['C'] 26254 11168 2899 58 3496 13361 2666 237 1251 215 ...
[')'] 11471 2204 5707 488 0 0 3750 1333 0 4 ...
['O'] 4430 2 40 22 3926 0 39 1 0 1 ...
['[N+]'] 0 589 0 0 0 14 0 0 0 0 ...
['N'] 1937 1537 61 6 1609 940 135 1 247 116 ...
... ... ... ... ... ... ... ... ... ... ... ...
['=[Se]'] 0 1 0 0 0 0 0 0 0 1 ...
['[Ti]'] 0 1 0 0 0 0 0 0 0 0 ...
['=[Si]'] 0 0 0 0 0 0 0 0 0 1 ...
['[Cu]'] 0 0 0 0 0 0 0 0 0 0 ...
['[O+]'] 0 0 0 0 0 0 0 0 0...
['C', '(', '=C', 'C', 2] 1 0 0 0 0 0
['(', '=C', 'C', 2, '=C'] 0 0 0 1 0 0
['=C', 'C', 2, '=C', 'C'] 0 0 0 0 0 1
['C', '=C', 'C', '=C', 0] 0 0 0 1 0 0
['=C', 'C', '=C', 0, 'C'] 0 1 0 0 0 0,
=C C 2 0
['&', '=C', '&', 'C', '('] 1 0 0 0
['=C', '&', 'C', '(', '=C'] 0 1 0 0
['&', 'C', '(', '=C', 'C'] 0 0 1 0
[1, 'C', '&', '=C', '('] 0 1 0 0
['C', '&', '=C', '(', 'C'] 1 0 0 0
['&', '=C', '(', 'C', '=C'] 0 1 0 0
['=C', '(', 'C', '=C', 'C'] 1 0 0 0
['(', 'C', '=C', 'C', '=C'] 0 0 0 1]]],
sample_order=(1, 5))

データの最初の５個目までの構造で、N-gramを使って構造発生すると、以下のようなものが得られる。

# perform pure iQSPR molecule generation starting with 5 initial molecules
n_loop = 5
tmp = data['SMILES'][:5]
for i in range(n_loop):
    tmp = n_gram.proposal(tmp)
    print('Round %i' % i,tmp)

Round 0 ['CC(=O)OC(CC(=O)[O-])C', 'CC(=O)OC(CC(=O)O)C[N+](C)(C)N', 'C1=CC(C(C(=C1)C)(C(=O)O)CBr)O', 'C1COCCN1S(=O)(=O)OCCCl', 'C(C(=O)COP(=O)(O)O)O']
Round 1 ['CC(=O)OC(CC(=O)[O-])C1=CC=C(C=C1)CC=C', 'CC(=O)OC(CC(=O)O)C[N+](C)(C)C', 'C1=CC(C(C(=C1)C)(C(=O)O)O)C(=O)C=CC(=O)C', 'C1COCCN1S(=O)N1CCC2=CC=CC=C21', 'C(C(=O)COP(=O)(O)OP(=O)(O)OC1(C(N(C2=CC3=C(CCC4=CC(=O)CCC4(C3(C(CCC2C1)O)O)C)O)C1=CC=CC=C1)O)C)NC1=C(C=CC(=C1)O)Cl']
Round 2 ['CC(=O)OC(CC(=O)[O-])C1=CC=C(C=C1)[N+](=O)[O-]', 'CC(=O)OC(CC(=O)O)C(C)C1=CC=CC=C1O', 'C1=CC(C(C(=C1)C)(C(=O)O)O)C(=O)O', 'C1COCCN1S(=O)N1CCC2=CC=CC=C21', 'C(C(=O)COP(=O)(O)OP(=O)(O)OC1(C(N(C2=CC3=C(CCC4=CC(=O)CCC4(C3(C(CCC2C1)O)O)C)O)C1=CC=CC=C1)O)C)NC1=C(C=CC(=C1)OP(=O)(O)O)OC(=O)C=C(C#N)N=CC1=CC=C(C=C1)N']
Round 3 ['CC(=O)OC(CC(=O)[O-])C1=CC=C(C=C1)[N+](=O)[O-]', 'CC(=O)OC(CC(=O)O)C(C)C1=CC=CC=C1OC', 'C1=CC(C(C(=C1)C)(C(=O)O)O)C(=O)OC', 'C1COCCN1S(=O)N1CCC2=CC=CC=C2S1', 'C(C(=O)COP(=O)(O)OP(=O)(O)OC1(C(N(C2=CC3=C(CCC4=CC(=O)CCC4(C3(C(CCC2C1)O)O)C)O)C1=CC=CC=C1)O)C)NC1=C(C=CC(=C1)OP(=O)(O)O)OC(=O)C=C(C#N)N=CC1=CC=C(C=C1)C(=O)NC1=CC=CS1']
Round 4 ['CC(=O)OC(CC(=O)[O-])C1=CC=C(C=C1)OCC=CC1=CC2=C(C1=O)C(=C2)C', 'CC(=O)OC(CC(=O)O)C(C)C1=CC=C(C=C1)Br', 'C1=CC(C(C(=C1)C)(C(=O)O)O)O', 'C1COCCN1S(=O)N1CCC2=C(C1=O)C=CC(=C2O)Br', 'C(C(=O)COP(=O)(O)OP(=O)(O)OC1(C(N(C2=CC3=C(CCC4=CC(=O)CCC4(C3(C(CCC2C1)O)O)C)O)C1=CC=CC=C1)O)C)NC1=C(C=CC(=C1)OP(=O)(O)O)OC(=O)C=C(C#N)N=CC1=CC=C(C=C1)C(=O)OCC(=O)N(C1=CC=C(C=C1)Cl)C1=CC=C(C=C1)C1=CC=CC2=CC3=CC=CC=C3C=C12']

N-gramの挙動を調べるために、パラメーターを与えて違いを見てみている。最大長さを500文字として、末尾から原子を削除する個数（ランダムに可変する）の範囲を1-10とし、並び替えが行われる確率を0.5としている。同じことをしているのにも関わらず、違う構造が得られているのがわかる。

# change internal parameters of n_gram generator
_ = n_gram.set_params(del_range=[1,10], max_len=500, reorder_prob=0.5)

# perform pure iQSPR molecule generation starting with 10 PG molecules
n_loop = 5
tmp = data['SMILES'][:5]
for i in range(n_loop):
    tmp = n_gram.proposal(tmp)
    print('Round %i' % i,tmp)

Round 0 ['C[N+](C)(C)CC(CC(=O)[O-])OC(C1=CC=CC=C1)CC', 'O(C(C)=O)C(CC(=O)O)C[N+](C)(C)O', 'C1=CC=C(C(=O)NC(=O)NC1=O)[N+](=O)[O-]', 'C1=NC2=C(C1=O)C(=O)C=CC2=O', 'C(C(=O)COP(=O)(O)OC1CCN(CC1)CN1CCOCC1)C(=O)N']
Round 1 ['C[N+](C)(C)CC(CC(=O)[O-])OC(C1=CC=CC=C1)COP(=O)(O)OP(=O)(OC)O', 'O=C(C)OC(CC(=O)O)CNCCCN', 'O=C1NC(=O)NC(=O)C=CC=C1C(=O)OCC', 'C1=NC2=C(C1=O)C(=O)C=CC2=O', 'N1(CN2CCOCC2)CCC(OP(=O)(O)OCC(=O)CC(O1)C1=CC=CC=C1)(C(=O)O)C1=CC=CC=C1O']
Round 2 ['C[N+](C)(C)CC(CC(=O)[O-])OC(C1=CC=CC=C1)COP(=O)(O)OP(=O)(O)OC', 'C(CCN)NCC(CC(=O)O)OC(CC(CCCO)O)C(C)N', 'N1C(=O)C=CC=C(C(=O)OCC)C(=O)NC(=O)N1C1=CC=CC=C1', 'C1=NC2=C(C1=O)C(=O)C=CC2=O', 'N1(CN2CCOCC2)CCC(OP(=O)(O)OCC(=O)CC(O1)C1=CC=CC=C1)(C(=O)O)C1=CC=CC=C1O']
Round 3 ['c1ccc(C(COP(=O)(O)OP(=O)(O)OC)OC(CC(=O)[O-])C[N+](C)(C)C)NC(=O)O1', 'OC(=O)CC(CNCCCN)OC(CC(O)CCCO)C(=O)C', 'c1(-n2[nH]c(=O)cccc(C(=O)OCC)c(=O)[nH]c2=O)N(C1=O)C', 'C1=NC2=C(C1=O)C(=O)C=CC2=O', 'C1COCCN1CN1CCC(C(=O)O)(c2ccccc2O)OP(=O)(O)OCC(=O)CC(c2ccccc2)C=CC=C1']
Round 4 ['O=P(O)(OC)OP(=O)(O)OCC(OC(CC(=O)[O-])C[N+](C)(C)C)C1=CC(=C(C=C1)C)C', 'C(NCCCN)C(CC(=O)O)OC(CC(O)(C)O)(C)C', 'c1(-n2[nH]c(=O)cccc(C(=O)OCC)c(=O)[nH]c2=O)N(C1=O)C', 'C1=NC2=C(C1=O)C13CCCCC1C2CC2=C3C=C(C=C2Cl)C=CC2C3C(CCC1C(C2CC(C3)O)O)CCOC1(C)C', 'C1COCCN1CN1CCC(C(=O)O)(c2ccccc2O)OP(=O)(O)OCC(=O)CC(c2ccccc2)C=CC=C1']

並び替えを行う方が、多彩な化学構造を得ることができるので、逆解析の際に有利となる。次の例では文字列の並び替えを行わない例を示す。デフォルトのtrain_order=10なので、10文字が現れている。

# Method 1: use canonical SMILES in RDKit with no reordering
cans = [Chem.MolToSmiles(Chem.MolFromSmiles(smi)) for smi in data['SMILES']]
n_gram_cans = NGram(reorder_prob=0)
n_gram_cans.fit(cans)

# save results
with open('ngram_cans.obj', 'wb') as f:
    pk.dump(n_gram_cans, f)

NGram
NGram(ngram_table=[[[ C ( O [N+] ! =C & N P =[N+] ... \
['C'] 24476 10218 3356 49 2126 1755 2752 2379 35 1 ...
[')'] 6594 1984 2590 68 0 284 0 1429 22 0 ...
['O'] 3240 0 34 21 2355 1782 0 64 302 277 ...
['[N+]'] 0 271 0 0 0 0 7 0 0 0 ...
['=C'] 1292 1760 45 6 46 8 550 35 1 0 ...
... ... ... ... ... ... ... ... ... ... ... ...
['[GeH4]'] 0 0 0 0 1 0 0 0 0 0 ...
['[Ti]'] 0 1 0 0 0 0 0 0 0 0 ...
['[SeH]'] 0 0 0 0 1 0 0 0 0 0 ...
['[Cu]'] 0 0 0 0 0 0 0 0 0 0 ...
['=[O+]'] 0 0 0 0 0 0 0 0 0 0 ....
['c', '(', '=O', ')', 'c', '&', 'c', 'c', 'c', ... 1 0 0 0
['(', '=O', ')', 'c', '&', 'c', 'c', 'c', 'c', ... 0 1 0 0
['c', 'c', 'c', '&', 'c', 'c', 'c', '(', 'c', '&'] 1 0 0 0
['c', 'c', '&', 'c', 'c', 'c', '(', 'c', '&', 'c'] 1 0 0 0
['c', '&', 'c', 'c', 'c', '(', 'c', '&', 'c', 'c'] 1 0 0 0
['&', 'c', 'c', 'c', '(', 'c', '&', 'c', 'c', 'c'] 0 0 1 0
['c', 'c', 'c', '(', 'c', '&', 'c', 'c', 'c', '('] 1 0 0 0
['c', 'c', '(', 'c', '&', 'c', 'c', 'c', '(', 'c'] 0 0 0 1]]])

以上は肩慣らしで、N-gram作成はここからが本番となる。まず、CanonicalなSMILESに変換しなおし、それをmolオブジェクトに変換する。SMILESへの逆変換の際に、SMILESの最初の原子を順繰りに変えた文字列を作成する。重複をなくしたリストを作成し、これをsmi_reorder_allに入れる。重複をなくしたリストの個数がn_reorder以下であれば、ランダムに並び替えて、sample_reorder
入れる。そして、生成したSMILESの文字列数に応じて、ヒストグラムを表示する。

# N-gram library in XenonPy-iQSPR
from xenonpy.inverse.iqspr import NGram

# Method 2: expand n-gram training set with randomly reordered SMILES
# (we show one of the many possible ways of doing it)
n_reorder = 15 # pick a fixed number of re-ordering

# convert the SMILES to canonical SMILES in RDKit (not necessary in general)
cans = []
for smi in data['SMILES']:
    # remove some molecules in the full SMILES list that may lead to error
    try:
        cans.append(Chem.MolToSmiles(Chem.MolFromSmiles(smi)))
    except:
        print(smi)
        pass

mols = [Chem.MolFromSmiles(smi) for smi in cans]
smi_reorder_all = []
smi_reorder = []
for mol in mols:
    idx = list(range(mol.GetNumAtoms()))
    tmp = [Chem.MolToSmiles(mol,rootedAtAtom=x) for x in range(len(idx))]
    smi_reorder_all.append(np.array(list(set(tmp))))
    smi_reorder.append(np.random.choice(smi_reorder_all[-1], n_reorder,
                                        replace=(len(smi_reorder_all[-1]) < n_reorder)))

n_uni = [len(x) for x in smi_reorder_all]
_ = plt.hist(n_uni, bins='auto')  # arguments are passed to np.histogram
_ = plt.title("Histogram for number of SMILES variants")
_ = plt.show()

次に、ここで改めて、新しいNGramクラスを定義する。ここでは、train_orderは10文字、並び替え確率は0.5としている。その後、20文字に拡張してモデルを作っている。このモデル作成にはかなり時間がかかる。M1proで数時間くらい。

# flatten out the list and train the N-gram
flat_list = [item for sublist in smi_reorder for item in sublist]

# first train up to order 10
n_gram_reorder1 = NGram(reorder_prob=0.5)
_ = n_gram_reorder1.fit(flat_list, train_order=10)
# save results
_ = joblib.dump(n_gram_reorder1, 'ngram_reO15_O10.xz')
# with open('ngram_pubchem_ikebata_reO15_O10.obj', 'wb') as f:
#     pk.dump(n_gram_reorder1, f)
    
# Then, train up from order 11 to 20
n_gram_reorder2 = NGram(reorder_prob=0.5, sample_order=(11, 20))
_ = n_gram_reorder2.fit(flat_list, train_order=(11, 20))
# save results
_ = joblib.dump(n_gram_reorder2, 'ngram_reO15_O11to20.xz')
# with open('ngram_pubchem_ikebata_reO15_O11to20.obj', 'wb') as f:
#     pk.dump(n_gram_reorder2, f)

データをロードするときには、以下のようにする。二つのN-gramをマージすることで一つにして、次のiQSPR解析に用いる。

# load a pre-trained n-gram from the pickle file
n_gram = joblib.load('ngram_reO15_O10.xz')
# with open('ngram_pubchem_ikebata_reO15_O10.obj', 'rb') as f:
#     n_gram = pk.load(f)
    
# load a pre-trained n-gram from the pickle file
n_gram2 = joblib.load('ngram_reO15_O11to20.xz')
# with open('ngram_pubchem_ikebata_reO15_O11to20.obj', 'rb') as f:
#     n_gram2 = pk.load(f)

_ = n_gram.merge_table(n_gram2)

まとめ

この作業自体はかなり長くかかるため、別解析として考えた方が良い。セーブするところまでを一つの区切りとする。

前述の書籍によるとそのほかの逆解析方法として遺伝的アルゴリズム、VAE、GANがある。Xenonpyが終わったら、VAEか、GANはここで、試してみたい。

さて、次のiQSPR部分は、表示が面倒なだけで、コーディングの面では、かなり楽だと思っている。

一番の難所は、
Descriptorの定義
学習モデルの準備
LogLikelihood関数の定義
である。

そのため、先にN-gram生成を行なっておくと、実際の研究には良いだろう。

run tools.ipynb

# load in-house data
data = pd.read_csv("./iQSPR_sample_data.csv", index_col=0)
data=data[ data["SMILES"] != "FBr(F)(F)(F)F"  ]

# N-gram library in XenonPy-iQSPR
from xenonpy.inverse.iqspr import NGram

# Method 2: expand n-gram training set with randomly reordered SMILES
# (we show one of the many possible ways of doing it)
n_reorder = 15 # pick a fixed number of re-ordering

# convert the SMILES to canonical SMILES in RDKit (not necessary in general)
cans = []
for smi in data['SMILES']:
    # remove some molecules in the full SMILES list that may lead to error
    try:
        cans.append(Chem.MolToSmiles(Chem.MolFromSmiles(smi)))
    except:
        print(smi)
        pass

mols = [Chem.MolFromSmiles(smi) for smi in cans]
smi_reorder_all = []
smi_reorder = []
for mol in mols:
    idx = list(range(mol.GetNumAtoms()))
    tmp = [Chem.MolToSmiles(mol,rootedAtAtom=x) for x in range(len(idx))]
    smi_reorder_all.append(np.array(list(set(tmp))))
    smi_reorder.append(np.random.choice(smi_reorder_all[-1], n_reorder,
                                        replace=(len(smi_reorder_all[-1]) < n_reorder)))

n_uni = [len(x) for x in smi_reorder_all]
_ = plt.hist(n_uni, bins='auto')  # arguments are passed to np.histogram
_ = plt.title("Histogram for number of SMILES variants")
_ = plt.show()


# flatten out the list and train the N-gram
flat_list = [item for sublist in smi_reorder for item in sublist]

# first train up to order 10
n_gram_reorder1 = NGram(reorder_prob=0.5)
_ = n_gram_reorder1.fit(flat_list, train_order=10)
# save results
_ = joblib.dump(n_gram_reorder1, 'ngram_reO15_O10.xz')
# with open('ngram_pubchem_ikebata_reO15_O10.obj', 'wb') as f:
#     pk.dump(n_gram_reorder1, f)
    
# Then, train up from order 11 to 20
n_gram_reorder2 = NGram(reorder_prob=0.5, sample_order=(11, 20))
_ = n_gram_reorder2.fit(flat_list, train_order=(11, 20))
# save results
_ = joblib.dump(n_gram_reorder2, 'ngram_reO15_O11to20.xz')
# with open('ngram_pubchem_ikebata_reO15_O11to20.obj', 'wb') as f:
#     pk.dump(n_gram_reorder2, f)