// %mavenRepo snapshots https://oss.sonatype.org/content/repositories/snapshots/
%maven ai.djl:api:0.27.0
%maven ai.djl.paddlepaddle:paddlepaddle-model-zoo:0.27.0
%maven org.slf4j:slf4j-simple:1.7.36
// second engine to do preprocessing and postprocessing
%maven ai.djl.pytorch:pytorch-engine:0.27.0
import ai.djl.*;
import ai.djl.inference.Predictor;
import ai.djl.modality.Classifications;
import ai.djl.modality.cv.Image;
import ai.djl.modality.cv.ImageFactory;
import ai.djl.modality.cv.output.*;
import ai.djl.modality.cv.util.NDImageUtils;
import ai.djl.ndarray.*;
import ai.djl.ndarray.types.DataType;
import ai.djl.ndarray.types.Shape;
import ai.djl.repository.zoo.*;
import ai.djl.paddlepaddle.zoo.cv.objectdetection.PpWordDetectionTranslator;
import ai.djl.paddlepaddle.zoo.cv.imageclassification.PpWordRotateTranslator;
import ai.djl.paddlepaddle.zoo.cv.wordrecognition.PpWordRecognitionTranslator;
import ai.djl.translate.*;
import java.util.concurrent.ConcurrentHashMap;
圖片讀取
首先讓我們載入這次教程會用到的機票範例圖片:
// Fetch the sample flight-ticket image by URL and load it as a DJL Image.
String url = "https://resources.djl.ai/images/flight_ticket.jpg";
Image img = ImageFactory.getInstance().fromUrl(url);
// Trailing expression yields the wrapped image object; presumably the notebook renders it as the
// cell output — confirm against the kernel in use.
img.getWrappedImage();
var criteria1 = Criteria.builder()
.optEngine("PaddlePaddle")
.setTypes(Image.class, DetectedObjects.class)
.optModelUrls("https://resources.djl.ai/test-models/paddleOCR/mobile/det_db.zip")
.optTranslator(new PpWordDetectionTranslator(new ConcurrentHashMap<string, string="">()))
.build();
var detectionModel = criteria1.loadModel();
var detector = detectionModel.newPredictor();
接著我們檢測出圖片中的文字區塊。這個模型的原始輸出是標註了所有文字區域的點陣圖(Bitmap),我們可以利用 PpWordDetectionTranslator 函式將點陣圖的輸出轉成長方形的方框來裁剪圖片。
// Run the word detector on the loaded image.
DetectedObjects detectedObj = detector.predict(img);
// Draw the detected boxes on a duplicate so the boxes land on newImage rather than on img.
Image newImage = img.duplicate();
newImage.drawBoundingBoxes(detectedObj);
// Trailing expression yields the annotated image object.
newImage.getWrappedImage();
如上所示,所標註的文字區塊都非常窄,且沒有包住所有完整的文字區塊。讓我們嘗試使用 extendRect 函式來擴展文字框的長寬到需要的大小,再利用 getSubImage 裁剪並擷取出文字區塊。
/**
 * Crops a region out of {@code img} based on {@code box}.
 *
 * <p>The bounds of {@code box} are first enlarged with {@code extendRect}; the resulting values
 * are then multiplied by the image width/height (i.e. they are treated as fractions of the image
 * size) and truncated to integer pixel values before cropping.
 *
 * @param img the image to crop from
 * @param box the bounding box whose (enlarged) bounds select the crop region
 * @return the sub-image returned by {@code img.getSubImage} for the computed pixel rectangle
 */
Image getSubImage(Image img, BoundingBox box) {
    Rectangle bounds = box.getBounds();
    double[] region =
            extendRect(bounds.getX(), bounds.getY(), bounds.getWidth(), bounds.getHeight());

    int imageWidth = img.getWidth();
    int imageHeight = img.getHeight();

    // Scale each component to pixels; the (int) cast truncates toward zero.
    int pixelX = (int) (region[0] * imageWidth);
    int pixelY = (int) (region[1] * imageHeight);
    int pixelWidth = (int) (region[2] * imageWidth);
    int pixelHeight = (int) (region[3] * imageHeight);

    return img.getSubImage(pixelX, pixelY, pixelWidth, pixelHeight);
}
/**
 * Enlarges a rectangle around its centre and clamps it to the range [0, 1].
 *
 * <p>The longer side grows by twice the shorter side and the shorter side is tripled (when the
 * sides are equal the "else" rule applies: height grows by twice the width, width is tripled).
 * The top-left corner is clamped at 0 on each axis; if the enlarged rectangle would then extend
 * past 1, its width/height is cut so that it ends at 1. Clamping the corner at 0 does not shrink
 * the enlarged width/height.
 *
 * @param xmin left edge of the input rectangle
 * @param ymin top edge of the input rectangle
 * @param width width of the input rectangle
 * @param height height of the input rectangle
 * @return a four-element array {@code {x, y, width, height}} describing the enlarged rectangle
 */
double[] extendRect(double xmin, double ymin, double width, double height) {
    // The centre is taken from the input rectangle, before any enlargement.
    double midX = xmin + width / 2;
    double midY = ymin + height / 2;

    boolean widerThanTall = width > height;
    double grownWidth = widerThanTall ? width + height * 2.0 : width * 3.0;
    double grownHeight = widerThanTall ? height * 3.0 : height + width * 2.0;

    double left = midX - grownWidth / 2;
    if (left < 0) {
        left = 0;
    }
    double top = midY - grownHeight / 2;
    if (top < 0) {
        top = 0;
    }

    double clippedWidth = grownWidth;
    if (left + grownWidth > 1) {
        clippedWidth = 1 - left;
    }
    double clippedHeight = grownHeight;
    if (top + grownHeight > 1) {
        clippedHeight = 1 - top;
    }

    return new double[] {left, top, clippedWidth, clippedHeight};
}
讓我們輸出其中一個文字區塊
// All items reported by the detector, typed as DetectedObject so getBoundingBox() is available.
List<DetectedObjects.DetectedObject> boxes = detectedObj.items();
// Crop the (enlarged) region of the item at index 5; this throws if fewer than six items exist.
var sample = getSubImage(img, boxes.get(5).getBoundingBox());
// Trailing expression yields the cropped image object.
sample.getWrappedImage();
// Rotation classifier: PaddlePaddle engine, takes an Image and produces Classifications.
Criteria<Image, Classifications> criteria2 =
        Criteria.builder()
                .setTypes(Image.class, Classifications.class)
                .optEngine("PaddlePaddle")
                .optModelUrls("https://resources.djl.ai/test-models/paddleOCR/mobile/cls.zip")
                .optTranslator(new PpWordRotateTranslator())
                .build();
// Load the model described by the criteria and create a predictor from it.
ZooModel<Image, Classifications> rotateModel = criteria2.loadModel();
Predictor<Image, Classifications> rotateClassifier = rotateModel.newPredictor();
// Word recognizer: PaddlePaddle engine, takes an Image and produces a String.
Criteria<Image, String> criteria3 =
        Criteria.builder()
                .setTypes(Image.class, String.class)
                .optEngine("PaddlePaddle")
                .optModelUrls("https://resources.djl.ai/test-models/paddleOCR/mobile/rec_crnn.zip")
                .optTranslator(new PpWordRecognitionTranslator())
                .build();
// Load the model described by the criteria and create a predictor from it.
ZooModel<Image, String> recognitionModel = criteria3.loadModel();
Predictor<Image, String> recognizer = recognitionModel.newPredictor();
接著我們可以試著套用這兩個模型在先前剪裁好的文字區塊上
// Print the rotation classifier's prediction for the cropped sample.
System.out.println(rotateClassifier.predict(sample));
// Run text recognition on the same crop; the result is not stored (presumably shown as the
// cell's output value — confirm against the kernel in use).
recognizer.predict(sample);
最後我們把這些模型串連在一起並套用在整張圖片上看看結果會如何。DJL提供了豐富的影像工具包讓你可以從圖片中擷取出文字並且完美呈現
/**
 * Returns a copy of {@code image} rotated by one 90-degree step.
 *
 * <p>The image is converted to an NDArray, passed to {@code NDImageUtils.rotate90} with a step
 * count of 1, and converted back to an Image. The NDManager used for the conversion is closed
 * when the method returns.
 * NOTE(review): rotation direction is whatever rotate90 defines — confirm in the DJL docs.
 *
 * @param image the image to rotate
 * @return a new Image built from the rotated array
 */
Image rotateImg(Image image) {
    try (NDManager manager = NDManager.newBaseManager()) {
        NDArray source = image.toNDArray(manager);
        NDArray turned = NDImageUtils.rotate90(source, 1);
        ImageFactory factory = ImageFactory.getInstance();
        return factory.fromNDArray(turned);
    }
}
// Parallel lists that are handed to the DetectedObjects constructor below:
// recognized text, a placeholder probability, and the detector's bounding box.
List<String> names = new ArrayList<>();
List<Double> prob = new ArrayList<>();
List<BoundingBox> rect = new ArrayList<>();

for (var detected : boxes) {
    BoundingBox box = detected.getBoundingBox();
    Image subImg = getSubImage(img, box);

    // Crops that are clearly taller than wide (height/width > 1.5) are rotated once up front.
    if (subImg.getHeight() * 1.0 / subImg.getWidth() > 1.5) {
        subImg = rotateImg(subImg);
    }

    // Rotate again when the classifier's best class is "Rotate" with probability above 0.8.
    Classifications.Classification result = rotateClassifier.predict(subImg).best();
    if ("Rotate".equals(result.getClassName()) && result.getProbability() > 0.8) {
        subImg = rotateImg(subImg);
    }

    String name = recognizer.predict(subImg);
    names.add(name);
    // -1.0 is stored as the probability for every item (no real score is recorded here).
    prob.add(-1.0);
    // The box recorded is the detector's original box, not the enlarged crop region.
    rect.add(box);
}

// Draw the recognized text with its boxes onto newImage; trailing expression yields the image.
newImage.drawBoundingBoxes(new DetectedObjects(names, prob, rect));
newImage.getWrappedImage();