i’ve played around with this too much yesterday. if you wanna stay on the cpu side of things i think it’s possible, but it’ll require a fast computer. i have a 2.3ghz quad core and managed to get up to a bit over 20 fps in full hd, a bit faster would be nice.
i made a test video. as input i used two scenes from a nasa video. as mask i used a single circle.
so, before i post the code, here are some important performance pointers:
- the contents of your mask determine which source image is read. the smoother your mask is, the faster the code will run. the impact of this is huge (e.g. going from 10 to 30 fps when switching from a random to a linear mask). i guess these completely random memory accesses cause a lot of cache misses.
- using setColor/getColor is indeed slow. work with the raw buffer data as much as possible.
- use arrays instead of vectors if possible
- parallelization doesn’t speed things up as much as expected. on my machine (i have 8 threads) using 3 threads yields maximum performance
all in all it was too fun to play with and i got a bit sidetracked, so i made a lot of changes:
- the effect looks quite bad when limiting the mask accuracy to brightness values. i’ve switched from the mask to a float buffer, and then linearly interpolate between neighbouring source frames. this gives very neat results, but dropped me down to ~10fps again.
- no idea how you assemble the mask. i’m using a circle that also shifts through time, and got rid of your x scanning offset.
- from your snippet i’m not sure what you use the mask transparency for. i’m not using it anymore.
alrighty, here’s the code:
#pragma once
#include "ofMain.h"
#define NUM_WORKERS 4
class MaskWorker;
class EffectWorker;
class ofApp : public ofBaseApp{
public:
void setup();
void update();
void draw();
void keyPressed(int key);
void keyReleased(int key);
void mouseMoved(int x, int y );
void mouseDragged(int x, int y, int button);
void mousePressed(int x, int y, int button);
void mouseReleased(int x, int y, int button);
void mouseEntered(int x, int y);
void mouseExited(int x, int y);
void windowResized(int w, int h);
void dragEvent(ofDragInfo dragInfo);
void gotMessage(ofMessage msg);
// not really used anymore,
// except to tell everyone what size the images are.
ofPixels mskPix;
// that's the new mask buffer.
float * mess;
// final output goes here
ofImage rsltImg;
// source images
vector<ofImage> allDayImagesFull;
// source buffer of the mask data
unsigned char** allDayData;
// workers to assemble the mask
vector<shared_ptr<MaskWorker>> maskWorkers;
// workers to assemble the result image
vector<shared_ptr<EffectWorker>> effectWorkers;
// repurposing this. it's now used for the global time offset
int imgScrollX;
// determines the size of the circle
float fxFactor;
float mouseX;
float mouseY;
};
#include "ofApp.h"
#include <chrono>
#include <condition_variable>
#include <cstdlib>
#include <mutex>
// this generates the mask
// it is split by lines (for two workers, worker 1 does upper half, worker 2 does the lower half)
class MaskWorker : public ofThread{
private:
ofApp & app;
int offset;
std::mutex m;
public:
MaskWorker( ofApp & app, int offset ) : app(app), offset(offset){
m.lock();
}
void threadedFunction(){
while(isThreadRunning()){
// Wait until main() sends data
m.lock();
unsigned char * maskData = app.mskPix.getData();
unsigned char * resultData = app.rsltImg.getPixels().getData();
int w = app.mskPix.getWidth();
int h = app.mskPix.getHeight();
float mx = app.mouseX;
float my = app.mouseY;
float of_width = ofGetWidth();
float of_height = ofGetHeight();
size_t dest_len = w*h;
size_t dest_start = w*h*offset/NUM_WORKERS;
size_t dest_end = w*h*(offset+1)/NUM_WORKERS;
for( ; dest_start < dest_end; dest_start++ ){
int x = (dest_start)%w;
int y = (dest_start)/w;
//app.mess[dest_start] = 1-ofClamp((fabsf(mx-x)+fabsf(my-y))/of_width/app.fxFactor,0,1);
app.mess[dest_start] = 1-ofClamp(ofDist(x,y,mx,my)/of_width/app.fxFactor,0,1);
}
m.unlock();
}
}
void work(){
m.unlock();
}
void wait(){
m.lock();
}
};
// Processes one chunk of the result image.
// The width*height pixels are split, in row-major order, into NUM_WORKERS
// contiguous chunks; the worker created with `offset` renders chunk number
// `offset` (for two workers: worker 0 does the upper half, worker 1 the lower
// half). Chunk borders are computed in whole pixels, so a chunk never starts
// in the middle of an RGB triple even when the pixel count is not divisible
// by NUM_WORKERS.
//
// Usage from the main thread: call work() to request one pass, then wait()
// to block until that pass has finished.
//
// The request/completion handshake uses a condition variable plus a flag.
// A bare mutex cannot be used as a signal here: after work() the main
// thread's wait() could grab the mutex before the worker did, so the frame
// would be displayed before it was rendered.
//
// NOTE(review): the code assumes 3 bytes per pixel in every buffer and at
// least two source images (with a single image the clamp range below becomes
// negative) - confirm against the loader.
class EffectWorker : public ofThread{
private:
    ofApp & app;
    int offset;

    std::mutex m;
    std::condition_variable cv;
    bool jobPending = false; // guarded by m: true from work() until the pass is done

public:
    EffectWorker( ofApp & app, int offset ) : app(app), offset(offset){
    }

    void threadedFunction(){
        while(isThreadRunning()){
            {
                // Sleep until work() posts a job. The wait is timed so the
                // loop condition is re-checked regularly and the thread can
                // leave after stopThread().
                std::unique_lock<std::mutex> lock(m);
                const bool haveJob = cv.wait_for(lock, std::chrono::milliseconds(50),
                                                 [this]{ return jobPending; });
                if(!haveJob) continue;
            }

            unsigned char * resultData = app.rsltImg.getPixels().getData();
            unsigned char ** frames = app.allDayData;
            const float * mask = app.mess; // each value = 0...1
            const int w = app.mskPix.getWidth();
            const int h = app.mskPix.getHeight();
            const int imgs = app.allDayImagesFull.size();
            const int scroll = app.imgScrollX;

            const size_t pixels = static_cast<size_t>(w)*h;
            const size_t firstPixel = pixels*offset/NUM_WORKERS;
            const size_t lastPixel = pixels*(offset+1)/NUM_WORKERS;

            for( size_t p = firstPixel; p < lastPixel; p++ ){
                const size_t i = p*3; // byte index of this pixel's first channel

                // instead of linear time, use a triangle ramp:
                // positions run over 0...2*imgs and the second half ramps
                // back down (mirrored below).
                const float position = 2*ofClamp(mask[p]*(imgs-1), 0, imgs-1.1);
                int a = static_cast<int>(std::floor(position));
                int b = a+1;
                const float alpha = position-a; // blend weight of frame b

                a = (a+scroll)%(2*imgs);
                b = (b+scroll)%(2*imgs);
                if(a>=imgs) a = 2*imgs-a-1;
                if(b>=imgs) b = 2*imgs-b-1;

                // potential for a big speed-up here: write alpha, srca and
                // srcb to separate arrays/textures and blend with simd or on
                // the graphics card.
                const unsigned char * srca = &frames[a][i];
                const unsigned char * srcb = &frames[b][i];
                for( int c = 0; c < 3; c++ ){
                    resultData[i+c] = static_cast<unsigned char>(srca[c]*(1-alpha)+srcb[c]*alpha);
                }
            }

            {
                std::lock_guard<std::mutex> lock(m);
                jobPending = false;
            }
            cv.notify_all();
        }
    }

    // Request one pass over this worker's chunk. Returns immediately.
    void work(){
        {
            std::lock_guard<std::mutex> lock(m);
            jobPending = true;
        }
        cv.notify_all();
    }

    // Block until the pass requested by work() has completed. Returns
    // immediately when no pass is pending.
    void wait(){
        std::unique_lock<std::mutex> lock(m);
        cv.wait(lock, [this]{ return !jobPending; });
    }
};
//--------------------------------------------------------------
void ofApp::setup(){
fxFactor = 1;
cout << "Loading images..." << endl;
for( int i = 0; i <= 299; i++){
cout << (i*100/255) << "%" << endl;
ofImage nextImage;
nextImage.setUseTexture(false);
allDayImagesFull.push_back(nextImage);
ofImage &img = allDayImagesFull.back();
img.load("img/" + ofToString(i+1, 5, '0') + ".png");
}
allDayData = new unsigned char * [allDayImagesFull.size()];
for( int i = 0; i < allDayImagesFull.size(); i++ ){
allDayData[i] = allDayImagesFull[i].getPixels().getData();
}
cout << "Loaded all images" << endl;
ofImage & first = allDayImagesFull.front();
rsltImg.allocate(first.getWidth(), first.getHeight(), OF_IMAGE_COLOR);
mskPix.allocate(first.getWidth(), first.getHeight(), OF_IMAGE_COLOR);
mess = new float[(int)first.getWidth()*(int)first.getHeight()];
// create some workers
for( int i = 0; i < NUM_WORKERS; i++ ){
effectWorkers.push_back(make_shared<EffectWorker>(*this, i));
effectWorkers.back()->startThread();
maskWorkers.push_back(make_shared<MaskWorker>(*this, i));
maskWorkers.back()->startThread();
}
}
//--------------------------------------------------------------
// Intentionally empty: the mask, the blending and the drawing are all done in draw().
void ofApp::update(){
}
//--------------------------------------------------------------
void ofApp::draw(){
bool exporting = false;
if(exporting){
mouseX = ofGetWidth()/2;
mouseY = ofGetHeight()/2;
fxFactor = 1.25;
}
else{
mouseX = ofGetMouseX();
mouseY = ofGetMouseY();
if(ofGetMousePressed()){
fxFactor = 10*ofGetMouseX()/(float)ofGetWidth();
}
}
imgScrollX ++;
imgScrollX %= (int)allDayImagesFull.front().getWidth();
// assemble the mask in parallel
for( shared_ptr<MaskWorker> worker : maskWorkers )
worker->work();
for( shared_ptr<MaskWorker> worker : maskWorkers )
worker->wait();
// start work on all threads, then wait for them
for( shared_ptr<EffectWorker> worker : effectWorkers )
worker->work();
for( shared_ptr<EffectWorker> worker : effectWorkers )
worker->wait();
rsltImg.update();
rsltImg.draw(0,0);
cout << ofGetFrameRate() << "/" << fxFactor << "/" << mouseX << "/" << mouseY << endl;
if(exporting){
rsltImg.save("out/" + ofToString(ofGetFrameNum(),5,'0') + ".tiff");
if(ofGetFrameNum()>2*allDayImagesFull.size()){
std::exit(0);
}
}
}
//--------------------------------------------------------------
// No-op: this app does not react to key presses.
void ofApp::keyPressed(int key){
}
//--------------------------------------------------------------
// No-op: this app does not react to key releases.
void ofApp::keyReleased(int key){
}
//--------------------------------------------------------------
// No-op: the mouse position is polled in draw() via ofGetMouseX()/ofGetMouseY().
void ofApp::mouseMoved(int x, int y ){
}
//--------------------------------------------------------------
// No-op: mouse position and button state are polled in draw().
void ofApp::mouseDragged(int x, int y, int button){
}
//--------------------------------------------------------------
// No-op: the button state is polled in draw() via ofGetMousePressed().
void ofApp::mousePressed(int x, int y, int button){
}
//--------------------------------------------------------------
// No-op: the button state is polled in draw() via ofGetMousePressed().
void ofApp::mouseReleased(int x, int y, int button){
}
//--------------------------------------------------------------
// No-op: nothing is done when this handler is called.
void ofApp::mouseEntered(int x, int y){
}
//--------------------------------------------------------------
// No-op: nothing is done when this handler is called.
void ofApp::mouseExited(int x, int y){
}
//--------------------------------------------------------------
// No-op: all buffers keep the size taken from the first source image in setup().
void ofApp::windowResized(int w, int h){
}
//--------------------------------------------------------------
// No-op: messages are ignored by this app.
void ofApp::gotMessage(ofMessage msg){
}
//--------------------------------------------------------------
// No-op: drag events are ignored by this app.
void ofApp::dragEvent(ofDragInfo dragInfo){
}