FGSM for Adversarial Attacks

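FGSM (the Fast Gradient Sign Method) is a simple algorithm that creates an adversarial example by taking a single step of size ϵ in the direction of the sign of the gradient of the loss with respect to the input: x_adv = x + ϵ · sign(∇ₓ loss(x, y)).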

First, we import the Adversarial module and access the MNIST data from Flux.

using Adversarial
using Flux
using Flux: update!
using Flux.Data.MNIST
using Statistics
# using Flux.Tracker: gradient, update!
# using CuArrays # comment out if a GPU is not available
using Plots

Next, we load the training data. For this example, we will only perform adversarial attacks on this data.

# MNIST training set as provided by Flux: the images and their matching labels.
train_images, train_labels = MNIST.images(), MNIST.labels();

A function to convert the images to arrays:

"""
    minibatch(i, batch_size = 32)

Assemble the batch of `batch_size` consecutive training samples starting at index `i`.

Returns a tuple `(x_batch, y_batch)`, both passed through `gpu`:
- `x_batch` is a `Float32` array of size `(size(train_images[1])..., 1, batch_size)`,
  i.e. one singleton channel dimension followed by the batch dimension.
- `y_batch` is the one-hot encoding of the corresponding labels over the classes `0:9`.

Reads the global `train_images` and `train_labels`.
"""
function minibatch(i, batch_size = 32)
    span = i:(i + batch_size - 1)
    x_batch = Array{Float32}(undef, size(train_images[1])..., 1, batch_size)
    y_batch = Flux.onehotbatch(train_labels[span], 0:9)
    # Copy each image into its own slot along the batch dimension.
    for (slot, img) in enumerate(train_images[span])
        x_batch[:, :, 1, slot] = Float32.(img)
    end
    return gpu(x_batch), gpu(y_batch)
end

minibatch (generic function with 2 methods)

We then create and train a simple CNN.

# Build a small convolutional classifier and move it through `gpu`.
# Three conv + 2×2 max-pool stages are followed by a flatten, a dense layer with
# 10 outputs and a softmax. The dense layer expects 288 features, which matches
# 3×3×32 for 28×28 single-channel inputs (28 → 14 → 7 → 3 after the three pools).
function CNN()
    layers = Chain(
        Conv((3, 3), 1 => 16, relu; pad = (1, 1)),
        MaxPool((2, 2)),
        Conv((3, 3), 16 => 32, relu; pad = (1, 1)),
        MaxPool((2, 2)),
        Conv((3, 3), 32 => 32, relu; pad = (1, 1)),
        MaxPool((2, 2)),
        # Flatten everything except the batch dimension (dimension 4).
        x -> reshape(x, :, size(x, 4)),
        Dense(288, 10),
        softmax,
    )
    return gpu(layers)
end

# Instantiate the network and collect its trainable parameters.
m = CNN();
θ = params(m);

# Training objective: cross-entropy between the model output and the targets `y`.
function loss(x, y)
    return Flux.crossentropy(m(x), y)
end

# Fraction of samples in the batch whose predicted class equals the target class.
function acc(x, y)
    predicted = cpu(Flux.onecold(m(x)))
    expected = cpu(Flux.onecold(y))
    return mean(predicted .== expected)
end

opt = ADAM()

ADAM(0.001, (0.9, 0.999), IdDict{Any,Any}())

Now for our training loop...

const EPOCHS = 5
const BATCH_SIZE = 32

# Train the model for EPOCHS passes over the training set, logging the mean
# per-batch loss and accuracy at the end of every epoch.
for epoch in 1:EPOCHS
    losses = 0.
    accuracies = 0.
    steps = 0

    # `i` is the index of the first sample of each batch. The last admissible start
    # is `N - BATCH_SIZE + 1`; stopping at `N - BATCH_SIZE` would silently skip the
    # final full batch whenever N is a multiple of BATCH_SIZE.
    for i in 1:BATCH_SIZE:(size(train_images, 1) - BATCH_SIZE + 1)
        local l  # assigned inside the gradient closure, read after it
        # Pass BATCH_SIZE explicitly so the batch length always matches the stride.
        x, y = minibatch(i, BATCH_SIZE)
        gs = gradient(θ) do
            l = loss(x, y)
            return l
        end
        # Accuracy is measured on this batch before the parameters are updated.
        a = acc(x, y)

        update!(opt, θ, gs)

        losses += (l |> cpu)
        accuracies += (a |> cpu)
        steps += 1
    end
    @info "Epoch $epoch end. Loss: $(losses / steps), Acc: $(accuracies / steps)"
end

Now that we have a "trained" model, let's create some adversarial examples using the FGSM method.

Let's begin with a single image

# Take the first training image as a batch of one and perturb it with FGSM.
x, y = minibatch(1, 1)
x_adv = FGSM(m, loss, x, y; ϵ = 0.07)

# Predicted class for the adversarial and the original input; we can see that the
# predicted labels are different.
# NOTE(review): `onecold` is called without a label set, so these are presumably
# 1-based class indices (digit + 1) rather than the digits themselves — confirm.
adversarial_pred = getindex(Flux.onecold(m(x_adv)))
original_pred = getindex(Flux.onecold(m(x)))

We can then visualise the resulting adversarial example in comparison to the original image. When using an ϵ value of 0.07, the difference is very slight, if noticeable at all.

# Two panels side by side: the original image on the left, the adversarial one on
# the right. Each heatmap shows the first channel of the first sample in the batch.
l = @layout [a b]
adv = heatmap(cpu(x_adv)[:, :, 1, 1])
org = heatmap(cpu(x)[:, :, 1, 1])
plot(org, adv, layout = l)

# Sanity check for the example: the perturbation should have changed the model's
# prediction. This is not guaranteed for every image / ϵ, so report what was seen.
@assert adversarial_pred != original_pred "FGSM did not change the prediction (both are $original_pred)"