-
Notifications
You must be signed in to change notification settings - Fork 47
/
l0_attack.py
249 lines (207 loc) · 10.9 KB
/
l0_attack.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
## l0_attack.py -- attack a network optimizing for l_0 distance
##
## Copyright (C) IBM Corp, 2017-2018
## Copyright (C) 2016, Nicholas Carlini <[email protected]>.
##
## This program is licenced under the BSD 2-Clause licence,
## contained in the LICENCE file in this directory.
import sys
import tensorflow as tf
import numpy as np
MAX_ITERATIONS = 1000 # number of iterations to perform gradient descent
ABORT_EARLY = True # abort gradient descent upon first valid solution
LEARNING_RATE = 1e-2 # larger values converge faster to less accurate results
INITIAL_CONST = 1e-3 # the first value of c to start at
LARGEST_CONST = 2e6 # the largest value of c to go up to before giving up
REDUCE_CONST = False # try to lower c each iteration; faster to set to false
TARGETED = True # should we target one specific class? or just be wrong?
CONST_FACTOR = 2.0 # f>1, rate at which we increase constant, smaller better
class CarliniL0:
def __init__(self, sess, model,
targeted = TARGETED, learning_rate = LEARNING_RATE,
max_iterations = MAX_ITERATIONS, abort_early = ABORT_EARLY,
initial_const = INITIAL_CONST, largest_const = LARGEST_CONST,
reduce_const = REDUCE_CONST, const_factor = CONST_FACTOR,
independent_channels = False):
"""
The L_0 optimized attack.
Returns adversarial examples for the supplied model.
targeted: True if we should perform a targetted attack, False otherwise.
learning_rate: The learning rate for the attack algorithm. Smaller values
produce better results but are slower to converge.
max_iterations: The maximum number of iterations. Larger values are more
accurate; setting too small will require a large learning rate and will
produce poor results.
abort_early: If true, allows early aborts if gradient descent gets stuck.
initial_const: The initial tradeoff-constant to use to tune the relative
importance of distance and confidence. Should be set to a very small
value (but positive).
largest_const: The largest constant to use until we report failure. Should
be set to a very large value.
const_factor: The rate at which we should increase the constant, when the
previous constant failed. Should be greater than one, smaller is better.
independent_channels: set to false optimizes for number of pixels changed,
set to true (not recommended) returns number of channels changed.
"""
self.model = model
self.sess = sess
self.TARGETED = targeted
self.LEARNING_RATE = learning_rate
self.MAX_ITERATIONS = max_iterations
self.ABORT_EARLY = abort_early
self.INITIAL_CONST = initial_const
self.LARGEST_CONST = largest_const
self.REDUCE_CONST = reduce_const
self.const_factor = const_factor
self.independent_channels = independent_channels
self.grad = self.gradient_descent(sess, model)
def gradient_descent(self, sess, model):
def compare(x,y):
if self.TARGETED:
return x == y
else:
return x != y
shape = (1,model.image_size,model.image_size,model.num_channels)
# the variable to optimize over
modifier = tf.Variable(np.zeros(shape,dtype=np.float32))
# the variables we're going to hold, use for efficiency
canchange = tf.Variable(np.zeros(shape),dtype=np.float32)
simg = tf.Variable(np.zeros(shape,dtype=np.float32))
original = tf.Variable(np.zeros(shape,dtype=np.float32))
timg = tf.Variable(np.zeros(shape,dtype=np.float32))
tlab = tf.Variable(np.zeros((1,model.num_labels),dtype=np.float32))
const = tf.placeholder(tf.float32, [])
# and the assignment to set the variables
assign_modifier = tf.placeholder(np.float32,shape)
assign_canchange = tf.placeholder(np.float32,shape)
assign_simg = tf.placeholder(np.float32,shape)
assign_original = tf.placeholder(np.float32,shape)
assign_timg = tf.placeholder(np.float32,shape)
assign_tlab = tf.placeholder(np.float32,(1,self.model.num_labels))
# these are the variables to initialize when we run
set_modifier = tf.assign(modifier, assign_modifier)
setup = []
setup.append(tf.assign(canchange, assign_canchange))
setup.append(tf.assign(timg, assign_timg))
setup.append(tf.assign(original, assign_original))
setup.append(tf.assign(simg, assign_simg))
setup.append(tf.assign(tlab, assign_tlab))
newimg = (tf.tanh(modifier + simg)/2)*canchange+(1-canchange)*original
output = model.predict(newimg)
real = tf.reduce_sum((tlab)*output,1)
other = tf.reduce_max((1-tlab)*output - (tlab*10000),1)
if self.TARGETED:
# if targetted, optimize for making the other class most likely
loss1 = tf.maximum(0.0, other-real+.01)
else:
# if untargeted, optimize for making this class least likely.
loss1 = tf.maximum(0.0, real-other+.01)
# sum up the losses
loss2 = tf.reduce_sum(tf.square(newimg-tf.tanh(timg)/2))
loss = const*loss1+loss2
outgrad = tf.gradients(loss, [modifier])[0]
# setup the adam optimizer and keep track of variables we're creating
start_vars = set(x.name for x in tf.global_variables())
optimizer = tf.train.AdamOptimizer(self.LEARNING_RATE)
train = optimizer.minimize(loss, var_list=[modifier])
end_vars = tf.global_variables()
new_vars = [x for x in end_vars if x.name not in start_vars]
init = tf.variables_initializer(var_list=[modifier,canchange,simg,
original,timg,tlab]+new_vars)
def doit(oimgs, labs, starts, valid, CONST):
# convert to tanh-space
imgs = np.arctanh(np.array(oimgs)*1.999999)
starts = np.arctanh(np.array(starts)*1.999999)
# initialize the variables
sess.run(init)
sess.run(setup, {assign_timg: imgs,
assign_tlab:labs,
assign_simg: starts,
assign_original: oimgs,
assign_canchange: valid})
while CONST < self.LARGEST_CONST:
# try solving for each value of the constant
print('try const', CONST)
for step in range(self.MAX_ITERATIONS):
feed_dict={const: CONST}
# remember the old value
oldmodifier = self.sess.run(modifier)
if step%(self.MAX_ITERATIONS//10) == 0:
print(step,*sess.run((loss1,loss2),feed_dict=feed_dict))
# perform the update step
_, works = sess.run([train, loss1], feed_dict=feed_dict)
if works < .0001 and (self.ABORT_EARLY or step == CONST-1):
# it worked previously, restore the old value and finish
self.sess.run(set_modifier, {assign_modifier: oldmodifier})
grads, scores, nimg = sess.run((outgrad, output,newimg),
feed_dict=feed_dict)
l2s=np.square(nimg-np.tanh(imgs)/2).sum(axis=(1,2,3))
return grads, scores, nimg, CONST
# we didn't succeed, increase constant and try again
CONST *= self.const_factor
return doit
def attack(self, imgs, targets):
"""
Perform the L_0 attack on the given images for the given targets.
If self.targeted is true, then the targets represents the target labels.
If self.targeted is false, then targets are the original class labels.
"""
r = []
for i,(img,target) in enumerate(zip(imgs, targets)):
print("Attack iteration",i)
r.extend(self.attack_single(img, target))
return np.array(r)
def attack_single(self, img, target):
"""
Run the attack on a single image and label
"""
# the pixels we can change
valid = np.ones((1,self.model.image_size,self.model.image_size,self.model.num_channels))
# the previous image
prev = np.copy(img).reshape((1,self.model.image_size,self.model.image_size,
self.model.num_channels))
last_solution = None
const = self.INITIAL_CONST
while True:
# try to solve given this valid map
res = self.grad([np.copy(img)], [target], np.copy(prev),
valid, const)
if res == None:
# the attack failed, we return this as our final answer
print("Final answer",equal_count)
return last_solution
# the attack succeeded, now we pick new pixels to set to 0
restarted = False
gradientnorm, scores, nimg, const = res
if self.REDUCE_CONST: const /= 2
equal_count = self.model.image_size**2-np.sum(np.all(np.abs(img-nimg[0])<.0001,axis=2))
print("Forced equal:",np.sum(1-valid),
"Equal count:",equal_count)
if np.sum(valid) == 0:
# if no pixels changed, return
return [img]
if self.independent_channels:
# we are allowed to change each channel independently
valid = valid.flatten()
totalchange = abs(nimg[0]-img)*np.abs(gradientnorm[0])
else:
# we care only about which pixels change, not channels independently
# compute total change as sum of change for each channel
valid = valid.reshape((self.model.image_size**2,self.model.num_channels))
totalchange = abs(np.sum(nimg[0]-img,axis=2))*np.sum(np.abs(gradientnorm[0]),axis=2)
totalchange = totalchange.flatten()
# set some of the pixels to 0 depending on their total change
did = 0
for e in np.argsort(totalchange):
if np.all(valid[e]):
did += 1
valid[e] = 0
if totalchange[e] > .01:
# if this pixel changed a lot, skip
break
if did >= .3*equal_count**.5:
# if we changed too many pixels, skip
break
valid = np.reshape(valid,(1,self.model.image_size,self.model.image_size,-1))
print("Now forced equal:",np.sum(1-valid))
last_solution = prev = nimg