分享一个在caffe中实现的yolo层

原创
2019/10/15 09:46
阅读数 695

这个层最初是别人实现的，我把它的 CPU 版本移植到了 CC 框架，这样就可以在 caffe 中使用 YOLOv3；不过我感觉实际检测效果不如 darknet 好。


/// Logistic sigmoid: returns 1 / (1 + e^(-x)) converted back to Dtype.
template <typename Dtype>
inline Dtype sigmoid(Dtype x)
{
	// The double literal promotes the arithmetic to at least double precision
	// before the result is narrowed to Dtype on return.
	const auto denominator = 1. + exp(-x);
	return 1. / denominator;
}
/// One decoded detection candidate.
///
/// Every field is value-initialised so that a record which is only partly
/// filled in (e.g. objScore / classScore left untouched) can still be copied
/// into a container without reading indeterminate values.
template <typename Dtype>
class PredictionResult {
public:
	Dtype x{};           ///< Box centre, x coordinate.
	Dtype y{};           ///< Box centre, y coordinate.
	Dtype w{};           ///< Box width.
	Dtype h{};           ///< Box height.
	Dtype objScore{};    ///< Objectness score.
	Dtype classScore{};  ///< Score of the selected class.
	Dtype confidence{};  ///< Final confidence used for sorting and thresholding.
	int classType{};     ///< Zero-based class index.
};
/// Signed length of the 1-D overlap between two intervals, each given as a
/// centre and a width.
///
/// @param x1 centre of the first interval
/// @param w1 width of the first interval
/// @param x2 centre of the second interval
/// @param w2 width of the second interval
/// @return min(right edges) - max(left edges); negative when the intervals
///         do not touch.
template <typename Dtype>
Dtype overlap(Dtype x1, Dtype w1, Dtype x2, Dtype w2)
{
	// Intermediates are kept in Dtype (not float) so a double instantiation
	// does not silently lose precision.
	const Dtype left1 = x1 - w1 / 2;
	const Dtype left2 = x2 - w2 / 2;
	const Dtype left = left1 > left2 ? left1 : left2;
	const Dtype right1 = x1 + w1 / 2;
	const Dtype right2 = x2 + w2 / 2;
	const Dtype right = right1 < right2 ? right1 : right2;
	return right - left;
}
/// Intersection area of two boxes stored as [centre_x, centre_y, width, height].
///
/// The boxes are taken by const reference so that no vector is copied per
/// call, and the arithmetic stays in Dtype.
///
/// @return the overlapping area, or 0 when the boxes do not overlap on either
///         axis.
template <typename Dtype>
Dtype box_intersection(const std::vector<Dtype>& a, const std::vector<Dtype>& b)
{
	const Dtype w = overlap(a[0], a[2], b[0], b[2]);
	const Dtype h = overlap(a[1], a[3], b[1], b[3]);
	if (w < 0 || h < 0) return 0;
	return w * h;
}
/// Union area of two boxes stored as [centre_x, centre_y, width, height]:
/// area(a) + area(b) - intersection(a, b).
///
/// The boxes are taken by const reference so that no vector is copied per
/// call, and the arithmetic stays in Dtype.
template <typename Dtype>
Dtype box_union(const std::vector<Dtype>& a, const std::vector<Dtype>& b)
{
	const Dtype inter = box_intersection(a, b);
	return a[2] * a[3] + b[2] * b[3] - inter;
}
/// Intersection-over-union of two boxes stored as
/// [centre_x, centre_y, width, height].
///
/// The intersection is computed once and reused for the union term.
///
/// @return intersection / union, or 0 when the union area is not positive
///         (degenerate boxes), instead of dividing by zero.
template <typename Dtype>
Dtype box_iou(const std::vector<Dtype>& a, const std::vector<Dtype>& b)
{
	const Dtype inter = box_intersection(a, b);
	const Dtype uni = a[2] * a[3] + b[2] * b[3] - inter;
	if (!(uni > 0)) return 0;  // also catches NaN
	return inter / uni;
}
/// Plain bounding-box record with corner coordinates plus metadata.
///
/// All fields default to zero / false. The previous defaults (1..8) were
/// sequential numbers rather than meaningful values, and made `difficult`
/// default to true.
struct NormalizedBBox {
	float xmin = 0;          ///< Left edge.
	float ymin = 0;          ///< Top edge.
	float xmax = 0;          ///< Right edge.
	float ymax = 0;          ///< Bottom edge.
	int label = 0;           ///< Class label.
	bool difficult = false;  ///< "Difficult" annotation flag.
	float score = 0;         ///< Detection score.
	float size = 0;          ///< Cached box area.
};
/// Ordering predicate for sorting predictions by descending confidence:
/// true when box1 should come before box2.
bool BoxSortDecendScore(const PredictionResult<float>& box1, const PredictionResult<float>& box2) {
	return box2.confidence < box1.confidence;
}

/// Converts a centre/size box into corner form, clamps it to [0, 1] and
/// stores it in bbox together with its size.
///
/// NormalizedBBox is a plain struct with public fields and no set_*()
/// accessors, so the fields are assigned directly.
///
/// @param bbox destination box; xmin/ymin/xmax/ymax and size are overwritten
/// @param x    centre x
/// @param y    centre y
/// @param w    width
/// @param h    height
template <typename Dtype>
void setNormalizedBBox(NormalizedBBox& bbox, Dtype x, Dtype y, Dtype w, Dtype h)
{
	Dtype xmin = x - w / 2.0;
	Dtype xmax = x + w / 2.0;
	Dtype ymin = y - h / 2.0;
	Dtype ymax = y + h / 2.0;

	// Clamp every edge into the unit square.
	if (xmin < 0.0) {
		xmin = 0.0;
	}
	if (xmax > 1.0) {
		xmax = 1.0;
	}
	if (ymin < 0.0) {
		ymin = 0.0;
	}
	if (ymax > 1.0) {
		ymax = 1.0;
	}
	bbox.xmin = xmin;
	bbox.ymin = ymin;
	bbox.xmax = xmax;
	bbox.ymax = ymax;
	// NOTE(review): BBoxSize is not defined in this file; presumably it is the
	// SSD-Caffe bbox utility taking (bbox, normalized) - confirm it is in scope.
	const float bbox_size = BBoxSize(bbox, true);
	bbox.size = bbox_size;
}
template <typename Dtype>
void ApplyNms(vector<PredictionResult<Dtype>>& boxes, vector<int>& idxes, Dtype threshold) {
	map<int, int> idx_map;
	for (int i = 0; i < boxes.size() - 1; ++i) {
		if (idx_map.find(i) != idx_map.end()) {
			continue;
		}
		for (int j = i + 1; j < boxes.size(); ++j) {
			if (idx_map.find(j) != idx_map.end()) {
				continue;
			}
			vector<Dtype> Bbox1, Bbox2;
			Bbox1.push_back(boxes[i].x);
			Bbox1.push_back(boxes[i].y);
			Bbox1.push_back(boxes[i].w);
			Bbox1.push_back(boxes[i].h);

			Bbox2.push_back(boxes[j].x);
			Bbox2.push_back(boxes[j].y);
			Bbox2.push_back(boxes[j].w);
			Bbox2.push_back(boxes[j].h);

			Dtype iou = box_iou(Bbox1, Bbox2);
			if (iou >= threshold) {
				idx_map[j] = 1;
			}
			/*	NormalizedBBox Bbox1, Bbox2;
			setNormalizedBBox(Bbox1, boxes[i].x, boxes[i].y, boxes[i].w, boxes[i].h);
			setNormalizedBBox(Bbox2, boxes[j].x, boxes[j].y, boxes[j].w, boxes[j].h);
			float overlap = JaccardOverlap(Bbox1, Bbox2, true);
			if (overlap >= threshold) {
				idx_map[j] = 1;
			}*/
		}
	}
	for (int i = 0; i < boxes.size(); ++i) {
		if (idx_map.find(i) == idx_map.end()) {
			idxes.push_back(i);
		}
	}
}
/// Applies a softmax over input[0 .. classes) in place and records the
/// winning class in predict.
///
/// @param input   raw class scores; overwritten with the softmax probabilities
/// @param classes number of entries in input
/// @param predict receives classType (index of the first maximum) and
///                classScore (its probability)
template <typename Dtype>
void class_index_and_score(Dtype* input, int classes, PredictionResult<Dtype>& predict)
{
	// Largest raw score; subtracted before exponentiation.
	Dtype peak = input[0];
	for (int k = 1; k < classes; ++k) {
		peak = (input[k] > peak) ? input[k] : peak;
	}

	// Exponentiate in place while accumulating the normaliser.
	Dtype total = 0;
	for (int k = 0; k < classes; ++k) {
		input[k] = exp(input[k] - peak);
		total += input[k];
	}

	for (int k = 0; k < classes; ++k) {
		input[k] /= total;
	}

	// Strict comparison keeps the first index on ties.
	int best = 0;
	Dtype best_score = input[0];
	for (int k = 1; k < classes; ++k) {
		if (input[k] > best_score) {
			best_score = input[k];
			best = k;
		}
	}

	predict.classType = best;
	predict.classScore = best_score;
}
/// Decodes one box from strided raw values into b = [x, y, w, h].
///
/// The four raw values are read from x[index + k * stride] for k = 0..3:
///   b[0] = (i + raw0) / lw
///   b[1] = (j + raw1) / lh
///   b[2] = exp(raw2) * biases[2 * n]     / w
///   b[3] = exp(raw3) * biases[2 * n + 1] / h
///
/// @param b      output vector; cleared and filled with exactly 4 values
/// @param x      buffer holding the raw values
/// @param biases anchor sizes stored as (width, height) pairs; taken by const
///               reference so it is not copied on every call
/// @param n      anchor index into biases
/// @param index  offset of the first raw value in x
/// @param i      grid column of the cell
/// @param j      grid row of the cell
/// @param lw     grid width divisor for b[0]
/// @param lh     grid height divisor for b[1]
/// @param w      divisor applied to the anchor width
/// @param h      divisor applied to the anchor height
/// @param stride distance in x between consecutive raw values of one box
template <typename Dtype>
void get_region_box2(std::vector<Dtype> &b, Dtype* x, const std::vector<Dtype>& biases, int n, int index, int i, int j, int lw, int lh, int w, int h, int stride) {
	b.clear();
	b.reserve(4);
	b.push_back((i + (x[index + 0 * stride])) / lw);
	b.push_back((j + (x[index + 1 * stride])) / lh);
	b.push_back(exp(x[index + 2 * stride]) * biases[2 * n] / (w));
	b.push_back(exp(x[index + 3 * stride]) * biases[2 * n + 1] / (h));
}




class Yolov3DetectionOutput : public AbstractCustomLayer {
public:
	SETUP_LAYERFUNC(Yolov3DetectionOutput);

	virtual void setup(const char* name, const char* type, const char* param_str, int phase, Blob** bottom, int numBottom, Blob** top, int numTop) {
		//
		//CHECK(yolov3_detection_output_param.has_num_classes()) << "Must specify num_classes";
		side_ = bottom[0]->width();
		num_class_ = 20;
		num_ = 2;
		coords_ = 4;
		confidence_threshold_ = .01;
		nms_threshold_ = .45;
		mask_group_num_ = 3;

		biases_ = { 10,13,16,30,33,23,30,61,62,45,59,119,116,90,156,198,373,326 };
		/*for (int c = 0; c < 18; ++c) {
			biases_.push_back(biases[c]);
		}*/
		mask_ = { 6,7,8,3,4,5,0,1,2 };
		/*for (int c = 0; c < yolov3_detection_output_param.mask_size(); ++c) {
			mask_.push_back(yolov3_detection_output_param.mask(c));
		}*/
		anchors_scale_ = { 32,16,16 };
		/*for (int c = 0; c < yolov3_detection_output_param.anchors_scale_size(); ++c) {
			anchors_scale_.push_back(yolov3_detection_output_param.anchors_scale(c));
		}*/
		groups_num_ = 9 / mask_group_num_;
	}

	virtual void forward(Blob** bottom, int numBottom, Blob** top, int numTop) {
		const int num = bottom[0]->num();

		int len = 4 + num_class_ + 1;
		int stride = side_ * side_;



		int mask_offset = 0;
		vector<PredictionResult<float>> predicts;
		predicts.clear();
		int *class_score = new int[num_class_];

		for (int t = 0; t < 3; t++) {
			side_ = bottom[t]->width();
			int stride = side_ * side_;
			swap_ = newBlobByShape(bottom[t]->num(), bottom[t]->channel(), bottom[t]->height(), bottom[t]->width());
			swap_->ReshapeLike(*bottom[t]);
			float* swap_data = swap_->mutable_cpu_data();
			const float* input_data = bottom[t]->cpu_data();
			for (int b = 0; b < bottom[t]->num(); b++) {
				for (int s = 0; s < side_*side_; s++) {
					//LOG(INFO) << s;
					for (int n = 0; n < num_; n++) {
						//LOG(INFO) << bottom[t]->count(1);
						int index = n * len*stride + s + b * bottom[t]->count(1);
						vector<float> pred;

						for (int c = 0; c < len; ++c) {
							int index2 = c * stride + index;
							//LOG(INFO)<<index2;
							if (c == 2 || c == 3) {
								swap_data[index2] = (input_data[index2 + 0]);
							}
							else {
								if (c > 4) {
									//LOG(INFO) << c - 5;
									class_score[c - 5] = sigmoid(input_data[index2 + 0]);
								}
								else {
									swap_data[index2] = sigmoid(input_data[index2 + 0]);
								}
							}
						}
						int y2 = s / side_;
						int x2 = s % side_;
						//LOG(INFO) << x2 << "," << y2;
						float obj_score = swap_data[index + 4 * stride];
						//LOG(INFO) << obj_score;
						get_region_box2(pred, swap_data, biases_, mask_[n + mask_offset], index, x2, y2, side_, side_, side_*anchors_scale_[t], side_*anchors_scale_[t], stride);
						//LOG(INFO)<<anchors_scale_[t];
						//LOG(INFO) << pred[0] << "," << pred[1];
						//float maxmima_score = 0;
						PredictionResult<float> predict;
						for (int c = 0; c < num_class_; ++c) {
							class_score[c] *= obj_score;
							//LOG(INFO) << class_score[c];
							if (class_score[c] > confidence_threshold_)
							{
								//if(class_score[c]>maxmima_score)
								{
									//maxmima_score = class_score[c];
									predict.x = pred[0];
									predict.y = pred[1];
									predict.w = pred[2];
									predict.h = pred[3];
									predict.classType = c;
									predict.confidence = class_score[c];
									predicts.push_back(predict);
								}

								//LOG(INFO) << predict.x << "," << predict.y << "," << predict.w << "," << predict.h;
								//LOG(INFO) << predict.confidence;
							}
						}
						//if(maxmima_score> confidence_threshold_)
						//{
						//	predicts.push_back(predict);
						//}
					}
				}
			}
			mask_offset += groups_num_;

		}

		delete[] class_score;

		sort(predicts.begin(), predicts.end(), BoxSortDecendScore);
		vector<int> idxes;
		int num_kept = 0;
		if (predicts.size() > 0) {
			//LOG(INFO) << predicts.size();
			ApplyNms(predicts, idxes, nms_threshold_);
			num_kept = idxes.size();
			//LOG(INFO) << num_kept;
		}
		vector<int> top_shape(2, 1);
		top_shape.push_back(num_kept);
		top_shape.push_back(7);

		float* top_data;

		if (num_kept == 0) {
			//DLOG(INFO) << "Couldn't find any detections";
			top_shape[2] = swap_->num();
			top[0]->Reshape(top_shape[0], top_shape[1], top_shape[2], top_shape[3]);//不知道是否可行
			top_data = top[0]->mutable_cpu_data();
			caffe_set(top[0]->count(), float(-1), top_data);
			// Generate fake results per image.
			for (int i = 0; i < num; ++i) {
				top_data[0] = i;
				top_data += 7;
			}
		}
		else {
			top[0]->Reshape(top_shape[0], top_shape[1], top_shape[2], top_shape[3]);//不知道是否可行
			top_data = top[0]->mutable_cpu_data();
			for (int i = 0; i < num_kept; i++) {
				top_data[i * 7] = 0;                              //Image_Id
				top_data[i * 7 + 1] = predicts[idxes[i]].classType + 1; //label
				top_data[i * 7 + 2] = predicts[idxes[i]].confidence; //confidence
				float left = (predicts[idxes[i]].x - predicts[idxes[i]].w / 2.);
				float right = (predicts[idxes[i]].x + predicts[idxes[i]].w / 2.);
				float top = (predicts[idxes[i]].y - predicts[idxes[i]].h / 2.);
				float bot = (predicts[idxes[i]].y + predicts[idxes[i]].h / 2.);

				top_data[i * 7 + 3] = left;
				top_data[i * 7 + 4] = top;
				top_data[i * 7 + 5] = right;
				top_data[i * 7 + 6] = bot;
				std::cout << "Detection box" << "," << predicts[idxes[i]].classType << "," << predicts[idxes[i]].x << "," << predicts[idxes[i]].y << "," << predicts[idxes[i]].w << "," << predicts[idxes[i]].h;
			}

		}
	}

	virtual void backward(Blob** bottom, int numBottom, Blob** top, int numTop, const bool* propagate_down) {

	};

	virtual void reshape(Blob** bottom, int numBottom, Blob** top, int numTop) {
		//CHECK_EQ(bottom[0]->num(), 1);
		// num() and channels() are 1.
		//vector<int> top_shape(2, 1);
		// Since the number of bboxes to be kept is unknown before nms, we manually
		// set it to (fake) 1.
		//top_shape.push_back(1);
		// Each row is a 7 dimension vector, which stores
		// [image_id, label, confidence, x, y, w, h]
		//top_shape.push_back(7);
		top[0]->Reshape(2, 1, 1, 7);
	};


private:
	int side_;
	int num_class_;
	int num_;
	int coords_;

	int mask_group_num_;
	int groups_num_;

	float confidence_threshold_;
	float nms_threshold_;
	vector<float> biases_;
	vector<float> anchors_scale_;
	vector<float> mask_;
	Blob *swap_;
};```
展开阅读全文
打赏
0
0 收藏
分享
加载中
更多评论
打赏
0 评论
0 收藏
0
分享
返回顶部
顶部