@comment{Learning laparoscopic video shot classification for gynecological surgery}
@article{Petscharnig2017,
  author    = {Petscharnig, Stefan and Schoeffmann, Klaus},
  title     = {Learning laparoscopic video shot classification for gynecological surgery},
  journal   = {Multimedia Tools and Applications},
  year      = {2017},
  month     = apr,
  pages     = {1--19},
  issn      = {1573-7721},
  doi       = {10.1007/s11042-017-4699-5},
  publisher = {Springer},
  address   = {Berlin, Heidelberg, New York},
  language  = {EN},
  keywords  = {Video classification, Deep learning, Convolutional Neural Network},
  abstract  = {Videos of endoscopic surgery are used for education of medical experts, analysis in medical research, and documentation for everyday clinical life. Hand-crafted image descriptors lack the capabilities of a semantic classification of surgical actions and video shots of anatomical structures. In this work, we investigate how well single-frame convolutional neural networks (CNN) for semantic shot classification in gynecologic surgery work. Together with medical experts, we manually annotate hours of raw endoscopic gynecologic surgery videos showing endometriosis treatment and myoma resection of over 100 patients. The cleaned ground truth dataset comprises 9 h of annotated video material (from 111 different recordings). We use the well-known CNN architectures AlexNet and GoogLeNet and train these architectures for both, surgical actions and anatomy, from scratch. Furthermore, we extract high-level features from AlexNet with weights from a pre-trained model from the Caffe model zoo and feed them to an SVM classifier. Our evaluation shows that we reach an average recall of .697 and .515 for classification of anatomical structures and surgical actions respectively using off-the-shelf CNN features. Using GoogLeNet, we achieve a mean recall of .782 and .617 for classification of anatomical structures and surgical actions respectively. With AlexNet the achieved recall is .615 for anatomical structures and .469 for surgical action classification respectively. The main conclusion of our work is that advances in general image classification methods transfer to the domain of endoscopic surgery videos in gynecology. This is relevant as this domain is different from natural images, e.g. it is distinguished by smoke, reflections, or a limited amount of colors.},
}
@comment{Powered by bibtexbrowser (with ITEC extensions)}