由爱的恩典机器照管一切

前几天一直在忙着写 iOS 课程的期末项目 Octor(一个笔记软件),还添加了 OCR 功能:

自定义拍照视图

CameraViewController 中实现了自定义的拍照视图。

class CameraViewController: UIViewController {

  /// Vision request used to locate text rectangles in the live camera feed.
  private var textDetectionRequest: VNDetectTextRectanglesRequest!

  private var captureSession: AVCaptureSession!
  private var stillImageOutput: AVCapturePhotoOutput!
  private var videoPreviewLayer: AVCaptureVideoPreviewLayer!

  private var takePhotoButton: UIButton!

  /// Receives the captured photo once the user confirms it.
  public weak var delegate: CameraPhotoDelegate?

  override func viewDidLoad() {
    super.viewDidLoad()
    self.navigationItem.title = "文字识别"
    setupBackButton()
    setupTextDetection()
    setupCamera()
    addTakePhotoButton()
  }

  override func viewWillDisappear(_ animated: Bool) {
    super.viewWillDisappear(animated)
    self.captureSession.stopRunning()
  }

  // ......

  /// Configures the capture session: camera input, still-photo output,
  /// per-frame video output (for live text detection) and the preview layer.
  func setupCamera() {
    captureSession = AVCaptureSession()
    captureSession.sessionPreset = .high

    guard let captureDevice = AVCaptureDevice.default(for: .video) else { return }
    guard let input = try? AVCaptureDeviceInput(device: captureDevice) else { return }

    if captureSession.canAddInput(input) {
      videoPreviewLayer = AVCaptureVideoPreviewLayer(session: captureSession)
      videoPreviewLayer.frame = view.frame
      // NOTE(review): .resize stretches the preview and distorts the aspect
      // ratio; .resizeAspectFill is usually intended — confirm before changing.
      videoPreviewLayer.videoGravity = .resize
      view.layer.addSublayer(videoPreviewLayer)
      // add input
      captureSession.addInput(input)
      // add image output
      stillImageOutput = AVCapturePhotoOutput()
      if captureSession.canAddOutput(stillImageOutput) {
        captureSession.addOutput(stillImageOutput)
      }
      // add video data output
      // AVCaptureVideoDataOutput requires a *serial* delegate queue; the
      // original concurrent queue could deliver frames out of order.
      let videoDataOutput = AVCaptureVideoDataOutput()
      videoDataOutput.setSampleBufferDelegate(self, queue: DispatchQueue(label: "Buffer Queue", qos: .userInteractive))
      if captureSession.canAddOutput(videoDataOutput) {
        captureSession.addOutput(videoDataOutput)
      }
      // startRunning() blocks until the session is up — keep it off the
      // main thread so the UI stays responsive while the camera starts.
      DispatchQueue.global(qos: .userInitiated).async { [weak self] in
        self?.captureSession.startRunning()
      }
    }
  }

  // ......
}

通过 Vision 框架检测摄像头拍摄到的文字,并根据文字大小和位置绘制出文字的边界。

/// Creates the Vision text-rectangle request used on each video frame.
/// `reportCharacterBoxes` asks Vision to also report per-character boxes,
/// which `handleDetection` uses to compute each region's bounding rect.
func setupTextDetection() {
  textDetectionRequest = VNDetectTextRectanglesRequest(completionHandler: handleDetection)
  // The property was assigned on the previous line; the IUO needs no
  // explicit force-unwrap here.
  textDetectionRequest.reportCharacterBoxes = true
}

/// Completion handler of the Vision request: converts each detected text
/// region from Vision's normalized (bottom-left origin) coordinates to view
/// coordinates and draws a teal border layer around it.
private func handleDetection(request: VNRequest, error: Error?) {
  // compactMap both casts and drops non-VNTextObservation results; the
  // original map produced [VNTextObservation?] with possible nil entries.
  guard let results = request.results else {
    return
  }
  let textResults = results.compactMap { $0 as? VNTextObservation }
  if textResults.isEmpty {
    return
  }
  DispatchQueue.main.async {
    // remove old rects
    self.view.layer.sublayers?.removeSubrange(2...)
    let viewWidth = self.view.frame.size.width
    let viewHeight = self.view.frame.size.height
    for region in textResults {
      // Skip regions without character boxes instead of aborting the whole
      // loop — the original `return` silently dropped all remaining regions.
      guard let boxes = region.characterBoxes else {
        continue
      }
      // iter all boxes in current region to find its bounding rect
      var xMin = CGFloat.greatestFiniteMagnitude
      var xMax: CGFloat = 0
      var yMin = CGFloat.greatestFiniteMagnitude
      var yMax: CGFloat = 0
      for box in boxes {
        xMin = min(xMin, box.bottomLeft.x)
        xMax = max(xMax, box.bottomRight.x)
        yMin = min(yMin, box.bottomRight.y)
        yMax = max(yMax, box.topRight.y)
      }
      // position and size of the rect for current region
      // (1 - yMax) flips Vision's bottom-left origin to UIKit's top-left.
      let x = xMin * viewWidth
      let y = (1 - yMax) * viewHeight
      let width = (xMax - xMin) * viewWidth
      let height = (yMax - yMin) * viewHeight
      // draw a new rect for current region
      let layer = CALayer()
      layer.frame = CGRect(x: x, y: y, width: width, height: height)
      layer.borderWidth = 1
      layer.borderColor = UIColor.systemTeal.cgColor
      self.view.layer.addSublayer(layer)
    }
    // set button to the front
    self.takePhotoButton.layer.zPosition = 1
  }
}

视图中有一个白色按钮,点击该按钮将会跳出选择框,显示图片以及选项。

/// Adds the round white shutter button, centered horizontally and floating
/// 1.5 diameters above the bottom edge of the view.
func addTakePhotoButton() {
  let diameter: CGFloat = 50
  takePhotoButton = UIButton()
  takePhotoButton.translatesAutoresizingMaskIntoConstraints = false
  takePhotoButton.backgroundColor = .white
  takePhotoButton.layer.cornerRadius = diameter / 2
  takePhotoButton.clipsToBounds = true
  takePhotoButton.addTarget(self, action: #selector(didTapTakePhoto), for: .touchUpInside)
  self.view.addSubview(takePhotoButton)
  // Auto Layout constraints, activated in one batch.
  NSLayoutConstraint.activate([
    takePhotoButton.centerXAnchor.constraint(equalTo: self.view.centerXAnchor),
    takePhotoButton.bottomAnchor.constraint(equalTo: self.view.bottomAnchor, constant: -1.5 * diameter),
    takePhotoButton.widthAnchor.constraint(equalToConstant: diameter),
    takePhotoButton.heightAnchor.constraint(equalToConstant: diameter),
  ])
}

/// Shutter-button action: captures one JPEG still from the photo output.
/// The result is delivered to `photoOutput(_:didFinishProcessingPhoto:error:)`.
@objc func didTapTakePhoto(sender: UIButton!) {
  let photoSettings = AVCapturePhotoSettings(format: [AVVideoCodecKey: AVVideoCodecType.jpeg])
  stillImageOutput.capturePhoto(with: photoSettings, delegate: self)
}
// MARK: - AVCapturePhotoCaptureDelegate

extension CameraViewController: AVCapturePhotoCaptureDelegate {

  /// Called once the still photo has been processed: presents an alert
  /// showing the image with three choices — recognize the text, retake,
  /// or discard and go back.
  func photoOutput(_ output: AVCapturePhotoOutput, didFinishProcessingPhoto photo: AVCapturePhoto, error: Error?) {
    // Decode defensively; the original IUO (`UIImage!`) would crash on
    // undecodable data.
    guard let imageData = photo.fileDataRepresentation(),
          let image = UIImage(data: imageData) else { return }

    let photoAlertController = UIAlertController(title: "当前图片", message: nil, preferredStyle: .alert)
    photoAlertController.addImage(image: image)
    photoAlertController.addAction(UIAlertAction(title: "识别文字", style: .default) { (alert) -> Void in
      // Hand the photo to the delegate for OCR, then return to the notes view.
      self.delegate?.onCameraPhotoReady(image: image)
      self.navigationController?.popViewController(animated: true)
    })
    photoAlertController.addAction(UIAlertAction(title: "重新选择", style: .default, handler: nil))
    photoAlertController.addAction(UIAlertAction(title: "丢弃", style: .cancel) { (alert) -> Void in
      self.navigationController?.popViewController(animated: true)
    })

    present(photoAlertController, animated: true)
  }

}

图片在 addImage 函数中需要进行等比例缩小。

// MARK: - UIAlertController Extension

extension UIAlertController {

  /// Embeds `image` into the alert as a disabled action row,
  /// scaled down to fit a 245×300 area.
  func addImage(image: UIImage) {
    let imageAction = UIAlertAction(title: "", style: .default)
    imageAction.isEnabled = false

    let maxSize = CGSize(width: 245, height: 300)
    // Bail out instead of crashing if scaling fails; the original bound
    // the result to an implicitly-unwrapped UIImage!.
    guard var scaledImage = image.scale(maxSize: maxSize) else { return }

    if image.size.height > image.size.width {
      // Portrait images end up narrower than maxSize.width; shift the
      // alignment insets so the image appears horizontally centered.
      let left = (maxSize.width - scaledImage.size.width) / 2
      scaledImage = scaledImage.withAlignmentRectInsets(UIEdgeInsets(top: 0, left: -left, bottom: 0, right: 0))
    }

    // NOTE(review): setting the "image" key on UIAlertAction is an
    // undocumented KVC path and may break in a future iOS release.
    imageAction.setValue(scaledImage.withRenderingMode(.alwaysOriginal), forKey: "image")
    self.addAction(imageAction)
  }

}

// MARK: - UIImage Extension

extension UIImage {

  /// Returns a copy of the image scaled (aspect ratio preserved)
  /// so that it fits entirely inside `maxSize`.
  func scale(maxSize: CGSize) -> UIImage? {
    // Use the smaller of the two per-axis ratios so BOTH dimensions fit.
    // Picking a single axis by orientation let the other axis overflow:
    // e.g. a 280×300 source against 245×300 kept a width of 280 > 245.
    let ratio = min(maxSize.width / size.width, maxSize.height / size.height)
    let targetSize = CGSize(width: size.width * ratio, height: size.height * ratio)
    // draw a new image
    UIGraphicsBeginImageContext(targetSize)
    // defer guarantees the context is closed on every exit path.
    defer { UIGraphicsEndImageContext() }
    draw(in: CGRect(origin: .zero, size: targetSize))
    return UIGraphicsGetImageFromCurrentImageContext()
  }

}

文字识别

TextRecognizer 用于实现文字识别,使用了 Tesseract OCR iOS 框架。

/// Thin wrapper around Tesseract OCR (English + French) exposing a single
/// `recognize` call: scale down, binarize, then run recognition.
class TextRecognizer {

  private var tesseract: G8Tesseract!

  init() {
    tesseract = G8Tesseract(language: "eng+fra")
    tesseract?.engineMode = .tesseractCubeCombined
    tesseract?.pageSegmentationMode = .auto
  }

  /// Runs OCR on `image` and returns the recognized text,
  /// or nil when the Tesseract engine failed to initialize.
  func recognize(_ image: UIImage) -> String? {
    guard let engine = tesseract else { return nil }
    // Shrink first (Tesseract struggles with very large inputs),
    // then apply the adaptive-threshold preprocessing; each step
    // falls back to its input if it fails.
    let downscaled = image.scale(maxDimension: 1000) ?? image
    engine.image = downscaled.preprocess() ?? downscaled
    engine.recognize()
    return engine.recognizedText
  }

}

由于 Tesseract OCR 框架的限制,这里同样需要将图像等比例缩小。preprocess 函数通过 GPUImage 框架的 GPUImageAdaptiveThresholdFilter 组件提高图像的质量。

GPUImage 文档的解释:

GPUImageAdaptiveThresholdFilter: Determines the local luminance around a pixel, then turns the pixel black if it is below that local luminance and white if above. This can be useful for picking out text under varying lighting conditions.

// MARK: - UIImage Extension

extension UIImage {

  /// Returns a copy whose longer side equals `maxDimension`,
  /// keeping the width-to-height ratio constant.
  func scale(maxDimension: CGFloat) -> UIImage? {
    let aspect = size.width / size.height
    let targetSize: CGSize
    if size.width > size.height {
      // Landscape: pin width to maxDimension, derive height.
      targetSize = CGSize(width: maxDimension, height: aspect > 0 ? maxDimension / aspect : maxDimension)
    } else {
      // Portrait or square: pin height to maxDimension, derive width.
      targetSize = CGSize(width: maxDimension * aspect, height: maxDimension)
    }
    // draw a new image; defer closes the context on every exit path
    UIGraphicsBeginImageContext(targetSize)
    defer { UIGraphicsEndImageContext() }
    draw(in: CGRect(origin: .zero, size: targetSize))
    return UIGraphicsGetImageFromCurrentImageContext()
  }

  /// Binarizes the image with GPUImage's adaptive threshold filter —
  /// per the GPUImage docs, this helps pick out text under varying lighting.
  func preprocess() -> UIImage? {
    let thresholdFilter = GPUImageAdaptiveThresholdFilter()
    thresholdFilter.blurRadiusInPixels = 15.0
    return thresholdFilter.image(byFilteringImage: self)
  }

}

由爱的恩典机器照管一切

文章开头的演示 GIF 里,扫描出来的文本是布劳提根的诗歌 All Watched Over by Machines of Loving Grace,它比上面的代码更有意思。

I like to think (and
the sooner the better!)
of a cybernetic meadow
where mammals and computers
live together in mutually
programming harmony
like pure water
touching clear sky.

I like to think
  (right now, please!)
of a cybernetic forest
filled with pines and electronics
where deer stroll peacefully
past computers
as if they were flowers
with spinning blossoms.

I like to think
  (it has to be!)
of a cybernetic ecology
where we are free of our labors
and joined back to nature,
returned to our mammal
brothers and sisters,
and all watched over
by machines of loving grace.

Updated: