
Artistic Style Transfer: "A Neural Algorithm of Artistic Style"

Using scipy.optimize.minimize

scipy.optimize.minimize(fun, x0, args=(), method=None, jac=None, hess=None, hessp=None, bounds=None, constraints=(), tol=None, callback=None, options=None)

fun: the objective function, i.e., the function to be minimized.

x0: the initial guess.

args: extra arguments passed to fun.

method: the optimization method; the implementation below uses L-BFGS-B.

options: a dict of solver options; its maxiter entry sets the maximum number of iterations.

Under the L-BFGS-B method (other methods accept different options):

jac=True indicates that fun returns the gradient vector grad with respect to the variables in addition to the loss.

bounds: the allowed range of each variable, given as a list of tuples.

Here is an example:

$$\min_x f(x) = \|x\|^2 \quad \text{s.t.}\; -5 < x_1 < 5,\ -12 < x_2 < 23$$

In the sketch below: greeting is the extra argument, and because jac=True, the objective returns the gradient vector 2*x in addition to the loss.
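The original post showed this example as a screenshot, which is not reproduced here; the following is a minimal sketch reconstructing it (the greeting string and the initial guess are assumptions):

import numpy as np
from scipy.optimize import minimize

def f(x, greeting):
    # `greeting` arrives via the `args` tuple
    print(greeting)
    loss = (x**2).sum()   # f(x) = ||x||^2
    grad = 2*x            # gradient of ||x||^2
    return loss, grad     # jac=True, so return (loss, grad) together

x0 = np.array([3.0, 4.0])        # initial guess
bounds = [(-5, 5), (-12, 23)]    # -5 < x1 < 5, -12 < x2 < 23
res = minimize(f, x0, args=("hello",), method="L-BFGS-B",
               jac=True, bounds=bounds, options={"maxiter": 100})
print(res.x)   # converges toward the zero vector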

Explanation of the Loss

$$L_{total}(\vec{p}, \vec{a}, \vec{x}) = \alpha L_{content}(\vec{p}, \vec{x}) + \beta L_{style}(\vec{a}, \vec{x})$$

Here $\vec{p}$, $\vec{a}$, and $\vec{x}$ denote the content photo, the artwork, and the generated image, respectively; $\alpha$ and $\beta$ weight the two loss terms.

$L_{style}$ penalizes differences in texture and color, expressed through Gram matrices.

$L_{content}$ penalizes the features directly, expressed through the feature maps themselves. Both are sketched below.
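To make the two terms concrete, here is a small NumPy sketch of the per-layer losses as defined in the paper (the function names are illustrative and do not appear in the source code below):

import numpy as np

def gram(feats):
    # feats: one layer's feature maps, shape (channels N, height, width)
    F = feats.reshape(feats.shape[0], -1)   # N filters x M spatial positions
    return F.dot(F.T)                       # (N, N) filter-correlation (Gram) matrix

def content_loss(F, P):
    # L_content: direct squared error between generated (F) and content (P) features
    return 0.5 * ((F - P)**2).sum()

def style_loss(F, A):
    # L_style for one layer: squared error between Gram matrices,
    # normalized by 1/(4 N^2 M^2) as in the paper
    N, M = F.shape[0], F[0].size
    return ((gram(F) - gram(A))**2).sum() / (4.0 * N**2 * M**2)

In the implementation below, _compute_style_grad and _compute_content_grad compute exactly these quantities plus their gradients, and the ratio argument plays the role of the weighting $\beta/\alpha$.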

For the remaining details, please refer to the paper.

Source Code

# system imports
import argparse
import logging
import os
import sys
import timeit

# library imports
import caffe
import numpy as np
import progressbar as pb
from scipy.fftpack import ifftn
from scipy.linalg.blas import sgemm
from scipy.misc import imsave
from scipy.optimize import minimize
from skimage import img_as_ubyte
from skimage.transform import rescale

# logging
LOG_FORMAT = "%(filename)s:%(funcName)s:%(asctime)s.%(msecs)03d -- %(message)s"

# numeric constants
INF = np.float32(np.inf)
STYLE_SCALE = 1.2

# weights for the individual models
# assume that corresponding layers' top blob matches its name
VGG19_WEIGHTS = {"content": {"conv4_2": 1},
                 "style": {"conv1_1": 0.2,
                           "conv2_1": 0.2,
                           "conv3_1": 0.2,
                           "conv4_1": 0.2,
                           "conv5_1": 0.2}}
VGG16_WEIGHTS = {"content": {"conv4_2": 1},
                 "style": {"conv1_1": 0.2,
                           "conv2_1": 0.2,
                           "conv3_1": 0.2,
                           "conv4_1": 0.2,
                           "conv5_1": 0.2}}
GOOGLENET_WEIGHTS = {"content": {"conv2/3x3": 2e-4,
                                 "inception_3a/output": 1-2e-4},
                     "style": {"conv1/7x7_s2": 0.2,
                               "conv2/3x3": 0.2,
                               "inception_3a/output": 0.2,
                               "inception_4a/output": 0.2,
                               "inception_5a/output": 0.2}}
CAFFENET_WEIGHTS = {"content": {"conv4": 1},
                    "style": {"conv1": 0.2,
                              "conv2": 0.2,
                              "conv3": 0.2,
                              "conv4": 0.2,
                              "conv5": 0.2}}

# argparse
parser = argparse.ArgumentParser(description="Transfer the style of one image to another.",
                                 usage="style.py -s <style_image> -c <content_image>")
parser.add_argument("-s", "--style-img", type=str, required=True, help="input style (art) image")
parser.add_argument("-c", "--content-img", type=str, required=True, help="input content image")
parser.add_argument("-g", "--gpu-id", default=0, type=int, required=False, help="GPU device number")
parser.add_argument("-m", "--model", default="vgg16", type=str, required=False, help="model to use")
parser.add_argument("-i", "--init", default="content", type=str, required=False, help="initialization strategy")
parser.add_argument("-r", "--ratio", default="1e4", type=str, required=False, help="style-to-content ratio")
parser.add_argument("-n", "--num-iters", default=512, type=int, required=False, help="L-BFGS iterations")
parser.add_argument("-l", "--length", default=512, type=float, required=False, help="maximum image length")
parser.add_argument("-v", "--verbose", action="store_true", required=False, help="print minimization outputs")
parser.add_argument("-o", "--output", default=None, required=False, help="output path")

## 5.1.6.2
def _compute_style_grad(F, G, G_style, layer):
    """
        Computes style gradient and loss from activation features.
    """

    # compute loss and gradient
    (Fl, Gl) = (F[layer], G[layer])
    c = Fl.shape[0]**-2 * Fl.shape[1]**-2
    El = Gl - G_style[layer]
    loss = c/4 * (El**2).sum()
    grad = c * sgemm(1.0, El, Fl) * (Fl>0)

    return loss, grad

## 5.1.6.3
def _compute_content_grad(F, F_content, layer):
    """
        Computes content gradient and loss from activation features.
    """

    # compute loss and gradient
    Fl = F[layer]
    El = Fl - F_content[layer]
    loss = (El**2).sum() / 2
    grad = El * (Fl>0)

    return loss, grad

## 5.1.2 Compute the content and style representations and return them.
def _compute_reprs(net_in, net, layers_style, layers_content, gram_scale=1):
    """
        Computes representation matrices for an image.
    """

    # input data and forward pass
    (repr_s, repr_c) = ({}, {})
    net.blobs["data"].data[0] = net_in
    net.forward()

    # loop through combined set of layers
    for layer in set(layers_style)|set(layers_content):
        F = net.blobs[layer].data[0].copy()
        F.shape = (F.shape[0], -1)
        repr_c[layer] = F
        if layer in layers_style:
            repr_s[layer] = sgemm(gram_scale, F, F.T)

    return repr_s, repr_c

## 5.1.6 The objective function: returns the loss and the gradient vector.
def style_optfn(x, net, weights, layers, reprs, ratio):
    """
        Style transfer optimization callback for scipy.optimize.minimize().

        :param numpy.ndarray x:
            Flattened data array.

        :param caffe.Net net:
            Network to use to generate gradients.

        :param dict weights:
            Weights to use in the network.

        :param list layers:
            Layers to use in the network.

        :param tuple reprs:
            Representation matrices packed in a tuple.

        :param float ratio:
            Style-to-content ratio.
    """

    # parse the input arguments
    layers_style = weights["style"].keys()
    layers_content = weights["content"].keys()
    net_in = x.reshape(net.blobs["data"].data.shape[1:])
    (G_style, F_content) = reprs
    ## 5.1.6.1 _compute_reprs again. The earlier calls computed the style and
    ## content representations separately; here, for the current guess
    ## (net_in = img0), content features are extracted on all layers
    ## (layers_style | layers_content) and style features on layers_style.
    (G, F) = _compute_reprs(net_in, net, layers_style, layers_content)

    # backprop by layer
    # initialize the loss and the diff of the last layer in `layers`
    loss = 0
    net.blobs[layers[-1]].diff[:] = 0  # in Caffe, blob.diff holds the backpropagated gradient; zero it out
    for i, layer in enumerate(reversed(layers)):
        next_layer = None if i == len(layers)-1 else layers[-i-2]
        # the loop walks back one layer at a time:
        # conv5_1 --> conv4_2 --> conv4_1 --> conv3_1 --> conv2_1 --> conv1_1 --> data layer;
        # the first iteration is conv5_1 (layer) --> conv4_2 (next_layer)
        grad = net.blobs[layer].diff[0]

        # style contribution to the loss
        if layer in layers_style:
            wl = weights["style"][layer]
            ## 5.1.6.2 _compute_style_grad computes the style contribution to the
            ## loss, and the derivative of the loss w.r.t. F (the current guess's
            ## features at this layer).
            (l, g) = _compute_style_grad(F, G, G_style, layer)
            loss += wl * l * ratio  # ratio balances the style and content contributions to the total loss
            grad += wl * g.reshape(grad.shape) * ratio  # update the gradient

        # content contribution to the loss
        if layer in layers_content:
            wl = weights["content"][layer]
            ## 5.1.6.3 _compute_content_grad computes the content contribution to
            ## the loss, and the derivative of the loss w.r.t. F.
            (l, g) = _compute_content_grad(F, F_content, layer)
            loss += wl * l
            grad += wl * g.reshape(grad.shape)

        # backpropagate the gradient
        net.backward(start=layer, end=next_layer)
        if next_layer is None:
            grad = net.blobs["data"].diff[0]
        else:
            grad = net.blobs[next_layer].diff[0]

    # flatten the gradient into one long vector
    grad = grad.flatten().astype(np.float64)

    # one optimization step is done; return the loss and the gradient vector
    return loss, grad

class StyleTransfer(object):
    ## 4.1 Creating an instance initializes the following attributes:
    ##   self.net          -- the network
    ##   self.transformer  -- the network's input transformer
    ##   self.weights      -- the layer weights
    ##   self.layers       -- the layers used
    ##   self.callback     -- the callback function
    ##   self.use_pbar     -- whether to show a progress bar

    def __init__(self, model_name, use_pbar=True):
        """
            Initialize the model used for style transfer.

            :param str model_name:
                Model to use.

            :param bool use_pbar:
                Use progressbar flag.
        """

        style_path = os.path.abspath(os.path.split(__file__)[0])
        base_path = os.path.join(style_path, "models", model_name)

        # vgg19
        if model_name == "vgg19":
            model_file = os.path.join(base_path, "VGG_ILSVRC_19_layers_deploy.prototxt")
            pretrained_file = os.path.join(base_path, "VGG_ILSVRC_19_layers.caffemodel")
            mean_file = os.path.join(base_path, "ilsvrc_2012_mean.npy")
            weights = VGG19_WEIGHTS

        # vgg16
        elif model_name == "vgg16":
            model_file = os.path.join(base_path, "VGG_ILSVRC_16_layers_deploy.prototxt")
            pretrained_file = os.path.join(base_path, "VGG_ILSVRC_16_layers.caffemodel")
            mean_file = os.path.join(base_path, "ilsvrc_2012_mean.npy")
            weights = VGG16_WEIGHTS

        # googlenet
        elif model_name == "googlenet":
            model_file = os.path.join(base_path, "deploy.prototxt")
            pretrained_file = os.path.join(base_path, "bvlc_googlenet.caffemodel")
            mean_file = os.path.join(base_path, "ilsvrc_2012_mean.npy")
            weights = GOOGLENET_WEIGHTS

        # caffenet
        elif model_name == "caffenet":
            model_file = os.path.join(base_path, "deploy.prototxt")
            pretrained_file = os.path.join(base_path, "bvlc_reference_caffenet.caffemodel")
            mean_file = os.path.join(base_path, "ilsvrc_2012_mean.npy")
            weights = CAFFENET_WEIGHTS

        else:
            assert False, "model not available"

        # add model and weights
        ## 4.1.1 load_model initializes self.net and self.transformer
        self.load_model(model_file, pretrained_file, mean_file)
        self.weights = weights.copy()
        self.layers = []
        for layer in self.net.blobs:
            if layer in self.weights["style"] or layer in self.weights["content"]:
                self.layers.append(layer)
        self.use_pbar = use_pbar

        # set the callback function
        # the callback is invoked once after each optimization step to advance the progress bar
        if self.use_pbar:
            def callback(xk):
                self.grad_iter += 1
                try:
                    self.pbar.update(self.grad_iter)
                except:
                    self.pbar.finished = True
                if self._callback is not None:
                    net_in = xk.reshape(self.net.blobs["data"].data.shape[1:])
                    self._callback(self.transformer.deprocess("data", net_in))
        else:
            def callback(xk):
                if self._callback is not None:
                    net_in = xk.reshape(self.net.blobs["data"].data.shape[1:])
                    self._callback(self.transformer.deprocess("data", net_in))
        self.callback = callback

    ## 4.1.1 Fairly basic; no further explanation needed.
    def load_model(self, model_file, pretrained_file, mean_file):
        """
            Loads specified model from caffe install (see caffe docs).

            :param str model_file:
                Path to model protobuf.

            :param str pretrained_file:
                Path to pretrained caffe model.

            :param str mean_file:
                Path to mean file.
        """

        # load net (suppressing stderr output)
        null_fds = os.open(os.devnull, os.O_RDWR)
        out_orig = os.dup(2)
        os.dup2(null_fds, 2)
        net = caffe.Net(model_file, pretrained_file, caffe.TEST)
        os.dup2(out_orig, 2)
        os.close(null_fds)

        # all models used are trained on imagenet data
        transformer = caffe.io.Transformer({"data": net.blobs["data"].data.shape})
        transformer.set_mean("data", np.load(mean_file).mean(1).mean(1))
        transformer.set_channel_swap("data", (2,1,0))
        transformer.set_transpose("data", (2,0,1))
        transformer.set_raw_scale("data", 255)

        # add net parameters
        self.net = net
        self.transformer = transformer

    ## 5.2 Fetch the generated image. It can be read from the 'data' blob
    ## because in 5.1.6 (style_optfn) the optimizer keeps updating the
    ## initial guess stored there during backpropagation.
    def get_generated(self):
        """
            Saves the generated image (net input, after optimization).

            :param str path:
                Output path.
        """

        data = self.net.blobs["data"].data
        img_out = self.transformer.deprocess("data", data)
        return img_out

    ## 5.1.1 Rescale the network input dimensions to fit `img`.
    def _rescale_net(self, img):
        """
            Rescales the network to fit a particular image.
        """

        # get new dimensions and rescale net + transformer
        new_dims = (1, img.shape[2]) + img.shape[:2]
        self.net.blobs["data"].reshape(*new_dims)
        self.transformer.inputs["data"] = new_dims

    def _make_noise_input(self, init):
        """
            Creates an initial input (generated) image.
        """

        # specify dimensions and create grid in Fourier domain
        dims = tuple(self.net.blobs["data"].data.shape[2:]) + \
               (self.net.blobs["data"].data.shape[1], )
        grid = np.mgrid[0:dims[0], 0:dims[1]]

        # create frequency representation for pink noise
        Sf = (grid[0] - (dims[0]-1)/2.0) ** 2 + \
             (grid[1] - (dims[1]-1)/2.0) ** 2
        Sf[np.where(Sf == 0)] = 1
        Sf = np.sqrt(Sf)
        Sf = np.dstack((Sf**int(init),)*dims[2])

        # apply ifft to create pink noise and normalize
        ifft_kernel = np.cos(2*np.pi*np.random.randn(*dims)) + \
                      1j*np.sin(2*np.pi*np.random.randn(*dims))
        img_noise = np.abs(ifftn(Sf * ifft_kernel))
        img_noise -= img_noise.min()
        img_noise /= img_noise.max()

        # preprocess the pink noise image
        x0 = self.transformer.preprocess("data", img_noise)

        return x0

    def _create_pbar(self, max_iter):
        """
            Creates a progress bar.
        """

        self.grad_iter = 0
        self.pbar = pb.ProgressBar()
        self.pbar.widgets = ["Optimizing: ", pb.Percentage(),
                             " ", pb.Bar(marker=pb.AnimatedMarker()),
                             " ", pb.ETA()]
        self.pbar.maxval = max_iter

    ## 5.1 The key function: transfer the style.
    def transfer_style(self, img_style, img_content, length=512, ratio=1e5,
                       n_iter=512, init="-1", verbose=False, callback=None):
        """
            Transfers the style of the artwork to the input image.

            :param numpy.ndarray img_style:
                A style image with the desired target style.

            :param numpy.ndarray img_content:
                A content image in floating point, RGB format.

            :param function callback:
                A callback function, which takes images at iterations.
        """

        # assume the network input is square (height == width)
        orig_dim = min(self.net.blobs["data"].shape[2:])

        # rescale the images: scale so the longer side reaches `length`,
        # while keeping the shorter side at least as large as the network's
        # original input dimension
        scale = max(length / float(max(img_style.shape[:2])),
                    orig_dim / float(min(img_style.shape[:2])))
        img_style = rescale(img_style, STYLE_SCALE*scale)
        scale = max(length / float(max(img_content.shape[:2])),
                    orig_dim / float(min(img_content.shape[:2])))
        img_content = rescale(img_content, scale)

        # compute the style representation
        ## 5.1.1 _rescale_net adjusts the network input shape to fit img_style
        self._rescale_net(img_style)
        layers = self.weights["style"].keys()
        net_in = self.transformer.preprocess("data", img_style)
        gram_scale = float(img_content.size)/img_style.size  # computed but never used (gram_scale=1 is passed below)
        ## 5.1.2 _compute_reprs computes and returns the style representation of img_style
        G_style = _compute_reprs(net_in, self.net, layers, [],
                                 gram_scale=1)[0]

        # compute the content representation
        ## 5.1.3 _rescale_net adjusts the network input shape to fit img_content;
        ## note that the input shape is never changed again after this --
        ## the img_content-sized shape is used for the rest of the run
        self._rescale_net(img_content)
        layers = self.weights["content"].keys()
        net_in = self.transformer.preprocess("data", img_content)
        ## 5.1.4 _compute_reprs computes and returns the content representation of img_content
        F_content = _compute_reprs(net_in, self.net, [], layers)[1]

        # generate the initial network input img0
        # the default is init = 'content'; see the argparse section at the top
        if isinstance(init, np.ndarray):  # init is an image array
            img0 = self.transformer.preprocess("data", init)
        elif init == "content":
            img0 = self.transformer.preprocess("data", img_content)
        elif init == "mixed":
            img0 = 0.95*self.transformer.preprocess("data", img_content) + \
                   0.05*self.transformer.preprocess("data", img_style)
        else:
            img0 = self._make_noise_input(init)  # a random-noise image; not used by default

        # compute the value range of each pixel
        data_min = -self.transformer.mean["data"][:,0,0]
        data_max = data_min + self.transformer.raw_scale["data"]
        data_bounds = [(data_min[0], data_max[0])]*(img0.size/3) + \
                      [(data_min[1], data_max[1])]*(img0.size/3) + \
                      [(data_min[2], data_max[2])]*(img0.size/3)

        # set the optimization parameters
        grad_method = "L-BFGS-B"
        reprs = (G_style, F_content)
        minfn_args = {
            "args": (self.net, self.weights, self.layers, reprs, ratio),
            "method": grad_method, "jac": True, "bounds": data_bounds,
            "options": {"maxcor": 8, "maxiter": n_iter, "disp": verbose}
        }

        # optimize
        self._callback = callback  # callback is an input argument, None by default
        minfn_args["callback"] = self.callback  # self.callback is a function; note the difference from the line above
        if self.use_pbar and not verbose:  # the default case
            ## 5.1.5 _create_pbar creates the progress bar
            self._create_pbar(n_iter)
            self.pbar.start()
            ## 5.1.6 The most important call of all -- see style_optfn above.
            ## minimize returns a result object; only its nit attribute
            ## (the number of iterations) is used here.
            res = minimize(style_optfn, img0.flatten(), **minfn_args).nit
            self.pbar.finish()
        else:
            res = minimize(style_optfn, img0.flatten(), **minfn_args).nit

        return res

def main(args):
    ## 1. logging
    level = logging.INFO if args.verbose else logging.DEBUG
    logging.basicConfig(format=LOG_FORMAT, datefmt="%H:%M:%S", level=level)
    logging.info("Starting style transfer.")

    ## 2. set GPU/CPU mode
    if args.gpu_id == -1:
        caffe.set_mode_cpu()
        logging.info("Running net on CPU.")
    else:
        caffe.set_device(args.gpu_id)
        caffe.set_mode_gpu()
        logging.info("Running net on GPU {0}.".format(args.gpu_id))

    ## 3. load the images, RGB format, values in (0, 1)
    img_style = caffe.io.load_image(args.style_img)
    img_content = caffe.io.load_image(args.content_img)
    logging.info("Successfully loaded images.")

    ## 4. create the StyleTransfer object
    use_pbar = not args.verbose
    ## 4.1 see the StyleTransfer class
    st = StyleTransfer(args.model.lower(), use_pbar=use_pbar)
    logging.info("Successfully loaded model {0}.".format(args.model))

    ## 5. perform the style transfer
    start = timeit.default_timer()
    ## 5.1 the key function: transfer_style
    n_iters = st.transfer_style(img_style, img_content, length=args.length,
                                init=args.init, ratio=np.float(args.ratio),
                                n_iter=args.num_iters, verbose=args.verbose)
    end = timeit.default_timer()
    logging.info("Ran {0} iterations in {1:.0f}s.".format(n_iters, end-start))
    ## 5.2 fetch the output image
    img_out = st.get_generated()

    ## 6. determine the output path
    if args.output is not None:
        out_path = args.output
    else:
        out_path_fmt = (os.path.splitext(os.path.split(args.content_img)[1])[0],
                        os.path.splitext(os.path.split(args.style_img)[1])[0],
                        args.model, args.init, args.ratio, args.num_iters)
        out_path = "outputs/{0}-{1}-{2}-{3}-{4}-{5}.jpg".format(*out_path_fmt)

    ## 7. save the image
    imsave(out_path, img_as_ubyte(img_out))
    logging.info("Output saved to {0}.".format(out_path))

if __name__ == "__main__":
    args = parser.parse_args()
    main(args)
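For reference, given the argparse definitions at the top of the script, a typical run would look like python style.py -s style.jpg -c content.jpg -m vgg16 -r 1e4 (the image filenames here are placeholders); the result is written to the outputs/ directory unless -o is given.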