由爱的恩典机器照管一切
前几天一直在忙着写 iOS 课程的期末项目 Octor(一个笔记软件),还添加了 OCR 功能:
自定义拍照视图
CameraViewController 中实现了自定义的拍照视图。
class CameraViewController: UIViewController {
    // Vision request that locates text rectangles in the live camera feed.
    private var textDetectionRequest: VNDetectTextRectanglesRequest!
    private var captureSession: AVCaptureSession!
    private var stillImageOutput: AVCapturePhotoOutput!
    private var videoPreviewLayer: AVCaptureVideoPreviewLayer!
    private var takePhotoButton: UIButton!
    /// Receives the captured photo once the user confirms it.
    public weak var delegate: CameraPhotoDelegate?

    override func viewDidLoad() {
        super.viewDidLoad()
        self.navigationItem.title = "文字识别"
        setupBackButton()
        setupTextDetection()
        setupCamera()
        addTakePhotoButton()
    }

    override func viewWillDisappear(_ animated: Bool) {
        super.viewWillDisappear(animated)
        self.captureSession.stopRunning()
    }
    // ......
    /// Configures the capture session: camera input, still-photo output,
    /// video-data output (for live text detection) and the preview layer.
    func setupCamera() {
        captureSession = AVCaptureSession()
        captureSession.sessionPreset = .high
        guard let captureDevice = AVCaptureDevice.default(for: .video) else { return }
        guard let input = try? AVCaptureDeviceInput(device: captureDevice) else { return }
        if captureSession.canAddInput(input) {
            videoPreviewLayer = AVCaptureVideoPreviewLayer(session: captureSession)
            // Use bounds (this view's own coordinate space), not frame
            // (superview coordinates) — they only coincide by accident.
            videoPreviewLayer.frame = view.bounds
            // .resize stretches the video to fill the view. The text-box
            // drawing in handleDetection maps normalized Vision coordinates
            // straight onto the view, which is only aligned with this gravity.
            videoPreviewLayer.videoGravity = .resize
            view.layer.addSublayer(videoPreviewLayer)
            // add input
            captureSession.addInput(input)
            // add image output
            stillImageOutput = AVCapturePhotoOutput()
            if captureSession.canAddOutput(stillImageOutput) {
                captureSession.addOutput(stillImageOutput)
            }
            // add video data output
            let videoDataOutput = AVCaptureVideoDataOutput()
            // AVCaptureVideoDataOutput requires a *serial* dispatch queue so
            // frames arrive in order; the previous concurrent queue violated
            // the documented contract.
            videoDataOutput.setSampleBufferDelegate(
                self,
                queue: DispatchQueue(label: "Buffer Queue", qos: .userInteractive))
            if captureSession.canAddOutput(videoDataOutput) {
                captureSession.addOutput(videoDataOutput)
            }
            // startRunning() blocks until the session is up — Apple recommends
            // calling it off the main thread to avoid freezing the UI.
            DispatchQueue.global(qos: .userInitiated).async {
                self.captureSession.startRunning()
            }
        }
    }
    // ......
}
通过 Vision 框架检测摄像头拍摄到的文字,并根据文字大小和位置绘制出文字的边界。
/// Creates the Vision text-rectangle request; detection results are
/// delivered to `handleDetection(request:error:)`.
func setupTextDetection() {
    textDetectionRequest = VNDetectTextRectanglesRequest(completionHandler: handleDetection)
    // Ask Vision for per-character boxes so region bounds can be computed.
    // (No force-unwrap needed: the property was assigned on the line above.)
    textDetectionRequest.reportCharacterBoxes = true
}
/// Completion handler for the Vision request: draws a teal border around
/// every detected text region, on the main thread.
private func handleDetection(request: VNRequest, error: Error?) {
    guard let detectionResults = request.results else {
        return
    }
    // compactMap drops non-text observations. The previous
    // `map { $0 as? VNTextObservation }` produced an array of Optionals,
    // so the isEmpty check below could never fire on a non-empty result.
    let textResults = detectionResults.compactMap { $0 as? VNTextObservation }
    if textResults.isEmpty {
        return
    }
    DispatchQueue.main.async {
        // remove old rects (sublayers 0 and 1 are the preview layer and button)
        self.view.layer.sublayers?.removeSubrange(2...)
        let viewWidth = self.view.frame.size.width
        let viewHeight = self.view.frame.size.height
        for region in textResults {
            // Skip this region (don't abort the whole pass — the old code
            // `return`ed here and silently dropped the remaining regions).
            guard let boxes = region.characterBoxes else {
                continue
            }
            // Union of all character boxes in the region.
            // Vision coordinates are normalized with origin at bottom-left.
            var xMin = CGFloat.greatestFiniteMagnitude
            var xMax: CGFloat = 0
            var yMin = CGFloat.greatestFiniteMagnitude
            var yMax: CGFloat = 0
            for box in boxes {
                xMin = min(xMin, box.bottomLeft.x)
                xMax = max(xMax, box.bottomRight.x)
                yMin = min(yMin, box.bottomRight.y)
                yMax = max(yMax, box.topRight.y)
            }
            // Convert to UIKit coordinates (origin at top-left: flip y).
            let x = xMin * viewWidth
            let y = (1 - yMax) * viewHeight
            let width = (xMax - xMin) * viewWidth
            let height = (yMax - yMin) * viewHeight
            // draw a new rect for current region
            let layer = CALayer()
            layer.frame = CGRect(x: x, y: y, width: width, height: height)
            layer.borderWidth = 1
            layer.borderColor = UIColor.systemTeal.cgColor
            self.view.layer.addSublayer(layer)
        }
        // keep the shutter button above the freshly added box layers
        self.takePhotoButton.layer.zPosition = 1
    }
}
视图中有一个白色按钮,点击该按钮将会弹出选择框,显示当前拍摄的图片以及可选操作。
/// Adds the round white shutter button, centered horizontally and sitting
/// 1.5 diameters above the bottom edge.
func addTakePhotoButton() {
    let buttonDiameter: CGFloat = 50

    let button = UIButton()
    button.translatesAutoresizingMaskIntoConstraints = false
    button.backgroundColor = .white
    // Corner radius of half the side length makes the button a circle.
    button.layer.cornerRadius = buttonDiameter / 2
    button.clipsToBounds = true
    button.addTarget(self, action: #selector(didTapTakePhoto), for: .touchUpInside)
    self.view.addSubview(button)
    takePhotoButton = button

    // Activate all layout constraints in one batch.
    NSLayoutConstraint.activate([
        button.centerXAnchor.constraint(equalTo: self.view.centerXAnchor),
        button.bottomAnchor.constraint(equalTo: self.view.bottomAnchor, constant: -1.5 * buttonDiameter),
        button.widthAnchor.constraint(equalToConstant: buttonDiameter),
        button.heightAnchor.constraint(equalToConstant: buttonDiameter),
    ])
}
/// Target-action for the shutter button: captures one still JPEG frame.
/// The result arrives in `photoOutput(_:didFinishProcessingPhoto:error:)`.
/// (Parameter changed from `UIButton!` to `UIButton` — target-action
/// dispatch matches by selector name, so callers are unaffected.)
@objc func didTapTakePhoto(sender: UIButton) {
    let settings = AVCapturePhotoSettings(format: [AVVideoCodecKey: AVVideoCodecType.jpeg])
    self.stillImageOutput.capturePhoto(with: settings, delegate: self)
}
// MARK: - AVCapturePhotoCaptureDelegate
extension CameraViewController: AVCapturePhotoCaptureDelegate {
    /// Called once the still photo is processed: shows an alert containing
    /// the image with "recognize / retake / discard" choices.
    func photoOutput(_ output: AVCapturePhotoOutput, didFinishProcessingPhoto photo: AVCapturePhoto, error: Error?) {
        // Bail out on capture errors or undecodable data instead of crashing
        // through the old implicitly-unwrapped `UIImage!` (which also ignored
        // the delegate's `error` parameter entirely).
        guard error == nil,
              let imageData = photo.fileDataRepresentation(),
              let image = UIImage(data: imageData) else { return }
        let photoAlertController = UIAlertController(title: "当前图片", message: nil, preferredStyle: .alert)
        photoAlertController.addImage(image: image)
        photoAlertController.addAction(UIAlertAction(title: "识别文字", style: .default) { _ in
            // Hand the confirmed photo to the OCR flow, then return.
            self.delegate?.onCameraPhotoReady(image: image)
            self.navigationController?.popViewController(animated: true)
        })
        photoAlertController.addAction(UIAlertAction(title: "重新选择", style: .default, handler: nil))
        photoAlertController.addAction(UIAlertAction(title: "丢弃", style: .cancel) { _ in
            self.navigationController?.popViewController(animated: true)
        })
        present(photoAlertController, animated: true)
    }
}
图片在 addImage 函数中需要进行等比例缩小。
// MARK: - UIAlertController Extension
// MARK: - UIAlertController Extension
extension UIAlertController {
    /// Embeds `image` in the alert as a disabled action, scaled to fit a
    /// 245×300 box while preserving its aspect ratio.
    func addImage(image: UIImage) {
        let imageAction = UIAlertAction(title: "", style: .default)
        imageAction.isEnabled = false
        let maxSize = CGSize(width: 245, height: 300)
        // scale(maxSize:) can fail; skip the image instead of crashing on
        // the old implicitly-unwrapped optional.
        guard var scaledImage = image.scale(maxSize: maxSize) else {
            self.addAction(imageAction)
            return
        }
        if image.size.height > image.size.width {
            // Portrait images are narrower than the alert: shift right by
            // half the leftover width to center them.
            let left = (maxSize.width - scaledImage.size.width) / 2
            scaledImage = scaledImage.withAlignmentRectInsets(UIEdgeInsets(top: 0, left: -left, bottom: 0, right: 0))
        }
        // NOTE(review): "image" is a private UIAlertAction KVC key, not public
        // API — it may break in a future iOS release; verify before shipping.
        imageAction.setValue(scaledImage.withRenderingMode(.alwaysOriginal), forKey: "image")
        self.addAction(imageAction)
    }
}
// MARK: - UIImage Extension
// MARK: - UIImage Extension
extension UIImage {
    /// Returns a copy scaled (preserving aspect ratio) so that it fits
    /// entirely inside `maxSize`, or `nil` if rendering fails.
    func scale(maxSize: CGSize) -> UIImage? {
        // Use the smaller of the two ratios so BOTH dimensions fit inside
        // maxSize. The old code picked only the longer side's ratio, letting
        // the other dimension overflow: a square image targeted at 245×300
        // came out 300×300 (wider than the box).
        let ratio = min(maxSize.width / size.width, maxSize.height / size.height)
        let targetSize = CGSize(width: size.width * ratio, height: size.height * ratio)
        // draw a new image at the target size
        UIGraphicsBeginImageContext(targetSize)
        draw(in: CGRect(origin: .zero, size: targetSize))
        let scaledImage = UIGraphicsGetImageFromCurrentImageContext()
        UIGraphicsEndImageContext()
        return scaledImage
    }
}
文字识别
TextRecognizer 用于实现文字识别,使用了 Tesseract OCR iOS 框架。
/// Wraps Tesseract OCR (English + French) behind a single `recognize` call.
class TextRecognizer {
    // A plain optional instead of an implicitly-unwrapped one: the
    // G8Tesseract initializer can fail, and every use site below already
    // chains safely with `?.`.
    private let tesseract: G8Tesseract?

    init() {
        tesseract = G8Tesseract(language: "eng+fra")
        tesseract?.engineMode = .tesseractCubeCombined
        tesseract?.pageSegmentationMode = .auto
    }

    /// Runs OCR on `image` and returns the recognized text, or `nil` when
    /// the Tesseract engine could not be created.
    func recognize(_ image: UIImage) -> String? {
        // Tesseract handles large inputs poorly: cap the longer side at
        // 1000 points, then binarize to sharpen the text.
        let scaledImage = image.scale(maxDimension: 1000) ?? image
        let preprocessedImage = scaledImage.preprocess() ?? scaledImage
        tesseract?.image = preprocessedImage
        tesseract?.recognize()
        return tesseract?.recognizedText
    }
}
由于 Tesseract OCR 框架的限制,这里同样需要将图像等比例缩小。preprocess 函数通过 GPUImage 框架的 GPUImageAdaptiveThresholdFilter 组件提高图像的质量。
GPUImage 文档的解释:
GPUImageAdaptiveThresholdFilter: Determines the local luminance around a pixel, then turns the pixel black if it is below that local luminance and white if above. This can be useful for picking out text under varying lighting conditions.
// MARK: - UIImage Extension
// MARK: - UIImage Extension
extension UIImage {
    /// Returns a copy whose longer side equals `maxDimension`, preserving
    /// the width-to-height ratio. Note: smaller images are scaled up too.
    func scale(maxDimension: CGFloat) -> UIImage? {
        let targetSize: CGSize
        if size.width > size.height {
            // Landscape: pin the width, derive the height.
            targetSize = CGSize(width: maxDimension,
                                height: size.height / size.width * maxDimension)
        } else {
            // Portrait or square: pin the height, derive the width.
            targetSize = CGSize(width: size.width / size.height * maxDimension,
                                height: maxDimension)
        }
        // Render the resized image; defer guarantees the context is closed.
        UIGraphicsBeginImageContext(targetSize)
        defer { UIGraphicsEndImageContext() }
        draw(in: CGRect(origin: .zero, size: targetSize))
        return UIGraphicsGetImageFromCurrentImageContext()
    }

    /// Binarizes the image with GPUImage's adaptive threshold filter so text
    /// stands out under uneven lighting (improves Tesseract accuracy).
    func preprocess() -> UIImage? {
        let thresholdFilter = GPUImageAdaptiveThresholdFilter()
        thresholdFilter.blurRadiusInPixels = 15.0
        return thresholdFilter.image(byFilteringImage: self)
    }
}
由爱的恩典机器照管一切
文章开头的演示 GIF 里,扫描出来的文本是布劳提根的诗歌 All Watched Over by Machines of Loving Grace,它比上面的代码更有意思。
I like to think (and
the sooner the better!)
of a cybernetic meadow
where mammals and computers
live together in mutually
programming harmony
like pure water
touching clear sky.
I like to think
(right now, please!)
of a cybernetic forest
filled with pines and electronics
where deer stroll peacefully
past computers
as if they were flowers
with spinning blossoms.
I like to think
(it has to be!)
of a cybernetic ecology
where we are free of our labors
and joined back to nature,
returned to our mammal
brothers and sisters,
and all watched over
by machines of loving grace.