Add link metadata fetching package

This commit is contained in:
Ken-Håvard Lieng 2017-07-17 23:11:36 +02:00
parent a4a4588ae6
commit d22758227d
22 changed files with 27836 additions and 0 deletions

View file

@ -0,0 +1,50 @@
// Copyright 2015 The Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package linkify
import "unicode"
var (
trigger [256]bool
unreserved [256]bool
subdelims [256]bool
emailcs [256]bool
basicPunct [256]bool
)
func init() {
for _, b := range "-._~" {
unreserved[b] = true
}
for _, b := range "!$&'()*+,;=" {
subdelims[b] = true
}
for _, b := range "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!#$%&'*+/=?^_`{|}~-" {
emailcs[b] = true
}
for _, b := range ".,?!;:" {
basicPunct[b] = true
}
}
func isAllowedInEmail(r rune) bool {
return r < 0x7f && emailcs[r]
}
func isLetterOrDigit(r rune) bool {
return unicode.In(r, unicode.Letter, unicode.Digit)
}
func isPunctOrSpaceOrControl(r rune) bool {
return r == '<' || r == '>' || unicode.In(r, unicode.Punct, unicode.Space, unicode.Cc)
}
func isUnreserved(r rune) bool {
return (r < 0x7f && unreserved[r]) || isLetterOrDigit(r)
}
func isSubDelimiter(r rune) bool {
return r < 0x7f && subdelims[r]
}

View file

@ -0,0 +1,93 @@
// Copyright 2015 The Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package linkify
import (
"unicode"
"unicode/utf8"
)
func findEmailStart(s string, start int) (_ int, _ bool) {
end := start
allowDot := false
for end >= 0 {
b := s[end]
switch {
case emailcs[b]:
allowDot = true
case b == '.':
if !allowDot {
return
}
allowDot = false
default:
if end == start {
return
}
if s[end+1] == '.' {
return
}
r, _ := utf8.DecodeLastRuneInString(s[:end+1])
if r == utf8.RuneError {
return
}
if !unicode.IsSpace(r) {
return
}
return end + 1, true
}
end--
}
if end < start && s[end+1] == '.' {
return
}
return end + 1, true
}
func findEmailEnd(s string, start int) (_ int, _ bool) {
end := start
allowDot := false
loop:
for end < len(s) {
b := s[end]
switch {
case emailcs[b]:
allowDot = true
case b == '.':
if !allowDot {
return
}
allowDot = false
case b == '@':
break loop
default:
return
}
end++
}
if end >= len(s)-5 {
return
}
if end > start && s[end-1] == '.' {
return
}
var dot int
var ok bool
end, dot, ok = findHostnameEnd(s, end+1)
if !ok || dot == -1 {
return
}
if dot+5 <= len(s) && s[dot+1:dot+5] == "xn--" {
return end, true
}
if length := match(s[dot+1:]); dot+length+1 != end {
return
}
return end, true
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,463 @@
// Copyright 2015 The Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package linkify provides a way to find links in plain text.
package linkify
import (
"unicode/utf8"
"github.com/golang-commonmark/markdown/byteutil"
)
// Link represents a link found in a string with a schema and a position in the string.
type Link struct {
Scheme string
Start, End int
}
func max(a, b int) int {
if a >= b {
return a
}
return b
}
// Links returns links found in s.
func Links(s string) (links []Link) {
for i := 0; i < len(s)-2; i++ {
switch s[i] {
case '.': // IP address or domain name
if i == 0 {
continue // . at the start of a line
}
if length := match(s[i+1:]); length > 0 {
pos := i + 1 + length
switch s[pos-1] {
case '.': // IP address
if pos >= len(s) {
continue // . at the end of line
}
if !byteutil.IsDigit(s[i-1]) {
i = pos
continue // . should be preceded by a digit
}
if !byteutil.IsDigit(s[pos]) {
i = pos
continue // . should be followed by a digit
}
// find the start of the IP address
j := i - 2
m := max(0, j-3)
for j >= m && byteutil.IsDigit(s[j]) {
j--
}
if i-2-j > 2 {
i = pos + 1
continue // at most 3 digits
}
start := 0
if j >= 0 {
r, rlen := utf8.DecodeLastRuneInString(s[:j+1])
if !isPunctOrSpaceOrControl(r) {
i = pos + 1
continue
}
switch r {
case '.', ':', '/', '\\', '-', '_':
i = pos + 1
continue
}
start = j + 2 - rlen
}
length, ok := skipIPv4(s[start:])
if !ok {
i = pos + 1
continue
}
end := start + length
if end == len(s) {
links = append(links, Link{
Scheme: "",
Start: start,
End: end,
})
return
}
r, _ := utf8.DecodeRuneInString(s[end:])
if !isPunctOrSpaceOrControl(r) {
continue
}
end = skipPort(s, end)
end = skipPath(s, end)
end = skipQuery(s, end)
end = skipFragment(s, end)
end = unskipPunct(s, end)
if end < len(s) {
r, _ = utf8.DecodeRuneInString(s[end:])
if !isPunctOrSpaceOrControl(r) || r == '%' {
continue
}
}
links = append(links, Link{
Scheme: "",
Start: start,
End: end,
})
i = end
default: // domain name
r, _ := utf8.DecodeLastRuneInString(s[:i])
if !isLetterOrDigit(r) {
continue // should be preceded by a letter or a digit
}
if pos == len(s) {
start, ok := findHostnameStart(s, i)
if !ok {
continue
}
links = append(links, Link{
Scheme: "",
Start: start,
End: pos,
})
return
}
if s[i+1:pos] != "xn--" {
r, _ = utf8.DecodeRuneInString(s[pos:])
if isLetterOrDigit(r) {
continue // should not be followed by a letter or a digit
}
}
end, dot, ok := findHostnameEnd(s, pos)
if !ok {
continue
}
dot = max(dot, i)
if !(dot+5 <= len(s) && s[dot+1:dot+5] == "xn--") {
if length := match(s[dot+1:]); dot+length+1 != end {
continue
}
}
start, ok := findHostnameStart(s, i)
if !ok {
continue
}
end = skipPort(s, end)
end = skipPath(s, end)
end = skipQuery(s, end)
end = skipFragment(s, end)
end = unskipPunct(s, end)
if end < len(s) {
r, _ = utf8.DecodeRuneInString(s[end:])
if !isPunctOrSpaceOrControl(r) || r == '%' {
continue // should be followed by punctuation or space
}
}
links = append(links, Link{
Scheme: "",
Start: start,
End: end,
})
i = end
}
}
case '/': // schema-less link
if s[i+1] != '/' {
continue
}
if i > 0 {
if s[i-1] == ':' {
i++
continue // should not be preceded by a colon
}
r, _ := utf8.DecodeLastRuneInString(s[:i])
if !isPunctOrSpaceOrControl(r) {
i++
continue // should be preceded by punctuation or space
}
}
r, _ := utf8.DecodeRuneInString(s[i+2:])
if !isLetterOrDigit(r) {
i++
continue // should be followed by a letter or a digit
}
start := i
end, dot, ok := findHostnameEnd(s, i+2)
if !ok {
continue
}
if s[i+2:end] != "localhost" {
if dot == -1 {
continue // no dot
}
if length, ok := skipIPv4(s[i+2:]); !ok || i+2+length != end {
if length := match(s[dot+1:]); dot+length+1 != end {
continue
}
}
}
end = skipPort(s, end)
end = skipPath(s, end)
end = skipQuery(s, end)
end = skipFragment(s, end)
end = unskipPunct(s, end)
if end < len(s) {
r, _ = utf8.DecodeRuneInString(s[end:])
if !isPunctOrSpaceOrControl(r) || r == '%' {
continue // should be followed by punctuation or space
}
}
links = append(links, Link{
Scheme: "//",
Start: start,
End: end,
})
i = end
case ':': // http, https, ftp, mailto or localhost
if i < 3 { // at least ftp:
continue
}
if i >= 9 && s[i-1] == 't' && s[i-9:i] == "localhost" {
j := i - 9
if !byteutil.IsDigit(s[j+10]) {
continue
}
if j > 0 {
r, _ := utf8.DecodeLastRuneInString(s[:j])
if !isPunctOrSpaceOrControl(r) {
i++
continue // should be preceded by punctuation or space
}
}
start := j
pos := j + 9
end := skipPort(s, pos)
if end == pos {
continue // invalid port
}
end = skipPath(s, end)
end = skipQuery(s, end)
end = skipFragment(s, end)
end = unskipPunct(s, end)
if end < len(s) {
r, _ := utf8.DecodeRuneInString(s[end:])
if !isPunctOrSpaceOrControl(r) || r == '%' {
i++
continue // should be followed by punctuation or space
}
}
links = append(links, Link{
Scheme: "",
Start: start,
End: end,
})
i = end
break
}
j := i - 1
var start int
var schema string
switch byteutil.ByteToLower(s[j]) {
case 'o': // mailto
if j < 5 {
continue // too short for mailto
}
if len(s)-j < 8 {
continue // insufficient length after
}
if byteutil.ToLower(s[j-5:j+2]) != "mailto:" {
continue
}
r, _ := utf8.DecodeLastRuneInString(s[:j-5])
if isLetterOrDigit(r) {
continue // should not be preceded by a letter or a digit
}
r, _ = utf8.DecodeRuneInString(s[j+2:])
if !isAllowedInEmail(r) {
continue // should be followed by a valid e-mail character
}
start = j - 5
end, ok := findEmailEnd(s, j+2)
if !ok {
continue
}
links = append(links, Link{
Scheme: "mailto:",
Start: start,
End: end,
})
i = end
continue // continue processing
case 'p': // http or ftp
if len(s)-j < 8 {
continue // insufficient length after
}
switch byteutil.ByteToLower(s[j-2]) {
case 'f':
if byteutil.ToLower(s[j-2:j+4]) != "ftp://" {
continue
}
start = j - 2
schema = "ftp:"
case 't':
if j < 3 {
continue
}
if byteutil.ToLower(s[j-3:j+4]) != "http://" {
continue
}
start = j - 3
schema = "http:"
default:
continue
}
case 's': // https
if j < 4 {
continue // too short for https
}
if len(s)-j < 8 {
continue // insufficient length after
}
start = j - 4
if byteutil.ToLower(s[start:j+4]) != "https://" {
continue
}
schema = "https:"
default:
continue
}
// http, https or ftp
if start > 0 {
r, _ := utf8.DecodeLastRuneInString(s[:start])
if !isPunctOrSpaceOrControl(r) {
continue // should be preceded by punctuation or space
}
}
r, _ := utf8.DecodeRuneInString(s[j+4:])
if !isLetterOrDigit(r) {
continue // should be followed by a letter or a digit
}
end, dot, ok := findHostnameEnd(s, j+4)
if !ok {
continue
}
if s[j+4:end] != "localhost" {
if dot == -1 {
continue // no dot
}
if length, ok := skipIPv4(s[j+4:]); !ok || j+4+length != end {
if !(dot+5 <= len(s) && s[dot+1:dot+5] == "xn--") {
if length := match(s[dot+1:]); dot+length+1 != end {
continue
}
}
}
}
end = skipPort(s, end)
end = skipPath(s, end)
end = skipQuery(s, end)
end = skipFragment(s, end)
end = unskipPunct(s, end)
if end < len(s) {
r, _ = utf8.DecodeRuneInString(s[end:])
if !isPunctOrSpaceOrControl(r) || r == '%' {
continue // should be followed by punctuation or space
}
}
links = append(links, Link{
Scheme: schema,
Start: start,
End: end,
})
i = end
case '@': // schema-less e-mail
if i == 0 {
continue // @ at the start of a line
}
if len(s)-i < 5 {
continue // insufficient length after
}
r, _ := utf8.DecodeLastRuneInString(s[:i])
if !isAllowedInEmail(r) {
continue // should be preceded by a valid e-mail character
}
r, _ = utf8.DecodeRuneInString(s[i+1:])
if !isLetterOrDigit(r) {
continue // should be followed by a letter or a digit
}
start, ok := findEmailStart(s, i-1)
if !ok {
continue
}
end, dot, ok := findHostnameEnd(s, i+1)
if !ok {
continue
}
if dot == -1 {
continue // no dot
}
if !(dot+5 <= len(s) && s[dot+1:dot+5] == "xn--") {
if length := match(s[dot+1:]); dot+length+1 != end {
continue
}
}
links = append(links, Link{
Scheme: "mailto:",
Start: start,
End: end,
})
i = end
}
}
return
}

View file

@ -0,0 +1,412 @@
// Copyright 2015 The Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package linkify
import (
"unicode/utf8"
"github.com/golang-commonmark/markdown/byteutil"
)
func atoi3(s string, start int) (int, bool) {
n := 0
var i int
for i = start; i < len(s) && byteutil.IsDigit(s[i]); i++ {
n = n*10 + int(s[i]-'0')
if n > 255 {
return 0, false
}
}
if i == start {
return 0, false
}
return i, true
}
func skipIPv4(s string) (_ int, _ bool) {
j := 0
for i := 0; i < 4; i++ {
if j >= len(s) {
return
}
if i > 0 {
if s[j] != '.' {
return
}
j++
}
if n, ok := atoi3(s, j); !ok {
return
} else {
j = n
}
}
return j, true
}
func atoi5(s string, start int) (int, bool) {
n := 0
var i int
for i = start; i < len(s) && byteutil.IsDigit(s[i]); i++ {
n = n*10 + int(s[i]-'0')
if n > 65535 {
return 0, false
}
}
if i == start || n == 0 {
return 0, false
}
return i, true
}
func skipPort(s string, start int) int {
if start >= len(s) || s[start] != ':' {
return start
}
end, ok := atoi5(s, start+1)
if !ok {
return start
}
return end
}
func skipPath(s string, start int) int {
if start >= len(s) || s[start] != '/' {
return start // skip empty path
}
var stack []rune
var notClosedIndex int
var nHyphen int
end := start + 1
loop:
for end < len(s) {
r, rlen := utf8.DecodeRuneInString(s[end:])
if r == utf8.RuneError {
nHyphen = 0
break
}
switch {
case isUnreserved(r):
if r == '-' {
nHyphen++
if nHyphen > 1 {
break loop
}
} else {
nHyphen = 0
}
case isSubDelimiter(r) || r == '[' || r == ']':
nHyphen = 0
switch r {
case '[', '(':
if len(stack) == 0 {
notClosedIndex = end
}
stack = append(stack, r)
case ']', ')':
opening := '['
if r == ')' {
opening = '('
}
if len(stack) == 0 || stack[len(stack)-1] != opening {
break loop
}
stack = stack[:len(stack)-1]
}
case r == '/' || r == ':' || r == '@':
nHyphen = 0
case r == '%':
nHyphen = 0
if end+2 >= len(s) {
break loop
}
if !(byteutil.IsHexDigit(s[end+1]) &&
byteutil.IsHexDigit(s[end+2])) {
break loop
}
end += 2
default:
nHyphen = 0
if r != ' ' || len(stack) == 0 {
break loop
}
}
end += rlen
}
if len(stack) > 0 {
return notClosedIndex
}
if nHyphen > 0 {
return end - nHyphen + 1
}
return end
}
func skipQuery(s string, start int) int {
if start >= len(s) || s[start] != '?' {
return start
}
var stack []rune
var notClosedIndex int
var nHyphen int
end := start + 1
loop:
for end < len(s) {
r, rlen := utf8.DecodeRuneInString(s[end:])
if r == utf8.RuneError {
nHyphen = 0
break
}
switch {
case isUnreserved(r):
if r == '-' {
nHyphen++
if nHyphen > 1 {
break loop
}
} else {
nHyphen = 0
}
case isSubDelimiter(r) || r == '[' || r == ']':
nHyphen = 0
switch r {
case '[', '(':
if len(stack) == 0 {
notClosedIndex = end
}
stack = append(stack, r)
case ']', ')':
opening := '['
if r == ')' {
opening = '('
}
if len(stack) == 0 || stack[len(stack)-1] != opening {
break loop
}
stack = stack[:len(stack)-1]
}
case r == '?' || r == '/' || r == ':' || r == '@':
nHyphen = 0
case r == '%':
nHyphen = 0
if end+2 >= len(s) {
break loop
}
if !(byteutil.IsHexDigit(s[end+1]) &&
byteutil.IsHexDigit(s[end+2])) {
break loop
}
end += 2
default:
nHyphen = 0
if r != ' ' || len(stack) == 0 {
break loop
}
}
end += rlen
}
if len(stack) > 0 {
return notClosedIndex
}
if nHyphen > 0 {
return end - nHyphen + 1
}
return end
}
func skipFragment(s string, start int) int {
if start >= len(s) || s[start] != '#' {
return start
}
var stack []rune
var notClosedIndex int
var nHyphen int
end := start + 1
loop:
for end < len(s) {
r, rlen := utf8.DecodeRuneInString(s[end:])
if r == utf8.RuneError {
nHyphen = 0
break
}
switch {
case isUnreserved(r):
if r == '-' {
nHyphen++
if nHyphen > 1 {
break loop
}
} else {
nHyphen = 0
}
case isSubDelimiter(r) || r == '[' || r == ']':
nHyphen = 0
switch r {
case '[', '(':
if len(stack) == 0 {
notClosedIndex = end
}
stack = append(stack, r)
case ']', ')':
opening := '['
if r == ')' {
opening = '('
}
if len(stack) == 0 || stack[len(stack)-1] != opening {
break loop
}
stack = stack[:len(stack)-1]
}
case r == '?' || r == '/' || r == ':' || r == '@':
nHyphen = 0
case r == '%':
nHyphen = 0
if end+2 >= len(s) {
break loop
}
if !(byteutil.IsHexDigit(s[end+1]) &&
byteutil.IsHexDigit(s[end+2])) {
break loop
}
end += 2
default:
nHyphen = 0
if r != ' ' || len(stack) == 0 {
break loop
}
}
end += rlen
}
if len(stack) > 0 {
return notClosedIndex
}
if nHyphen > 0 {
return end - nHyphen + 1
}
return end
}
func unskipPunct(s string, start int) int {
end := start - 1
if end < 0 || end >= len(s) || !basicPunct[s[end]] {
return start
}
return end
}
func findHostnameStart(s string, start int) (_ int, _ bool) {
end := start
lastDot := true
nHyphen := 0
loop:
for end > 0 {
r, rlen := utf8.DecodeLastRuneInString(s[:end])
if r == utf8.RuneError {
return
}
switch {
case isLetterOrDigit(r):
lastDot = false
nHyphen = 0
case r == '.':
if nHyphen > 0 {
return
}
lastDot = true
case r == '-':
if end == start {
return
}
if lastDot {
return
}
nHyphen++
if nHyphen == 3 {
return
}
case r == ':' || r == '/' || r == '\\' || r == '_':
return
case isPunctOrSpaceOrControl(r):
break loop
default:
return
}
end -= rlen
}
if lastDot || nHyphen > 0 {
return
}
return end, true
}
func findHostnameEnd(s string, start int) (_ int, _ int, _ bool) {
end := start
lastDot := false
lastDotPos := -1
nHyphen := 0
loop:
for end < len(s) {
r, rlen := utf8.DecodeRuneInString(s[end:])
if r == utf8.RuneError {
return
}
switch {
case isLetterOrDigit(r):
lastDot = false
nHyphen = 0
case r == '.':
if nHyphen > 0 {
return
}
if lastDot {
break loop
}
lastDot = true
lastDotPos = end
nHyphen = 0
case r == '-':
lastDot = false
if end == start {
return
}
if lastDot {
return
}
nHyphen++
if nHyphen == 3 {
break loop
}
case r == '\\' || r == '_':
return
case isPunctOrSpaceOrControl(r):
break loop
default:
return
}
end += rlen
}
if nHyphen > 0 {
end -= nHyphen
} else if lastDot {
if s[end-1] == '.' {
end--
}
lastDotPos = end - 1
for lastDotPos >= start && s[lastDotPos] != '.' {
lastDotPos--
}
if lastDotPos < start {
lastDotPos = -1
}
}
return end, lastDotPos, true
}