1
|
|
|
package main |
2
|
|
|
|
3
|
|
|
import ( |
4
|
|
|
"encoding/xml" |
5
|
|
|
"fmt" |
6
|
|
|
"net/http" |
7
|
|
|
"net/url" |
8
|
|
|
"os" |
9
|
|
|
"path" |
10
|
|
|
"time" |
11
|
|
|
) |
12
|
|
|
|
13
|
|
|
type SitemapIndex struct { |
|
|
|
|
14
|
|
|
XMLName xml.Name `xml:"sitemapindex"` |
15
|
|
|
XMLNs string `xml:"xmlns,attr"` |
16
|
|
|
Sitemap []Sitemap `xml:"sitemap"` |
17
|
|
|
} |
18
|
|
|
// Sitemap is a single <sitemap> entry of a sitemap index.
type Sitemap struct {
	// Loc is the text of the <loc> child element. The rest of this file
	// parses it as a URL and fetches it over HTTP.
	Loc string `xml:"loc"`

	// LastMod is the text of the <lastmod> child element. It is left out of
	// the encoded output when empty.
	LastMod string `xml:"lastmod,omitempty"`
}
22
|
|
|
type SitemapValidation struct { |
|
|
|
|
23
|
|
|
IsValid bool |
24
|
|
|
Sitemap Sitemap |
25
|
|
|
} |
26
|
|
|
|
27
|
|
|
func (s Sitemap) findFileName() string { |
28
|
|
|
u, _ := url.Parse(s.Loc) |
29
|
|
|
|
30
|
|
|
dir := path.Dir(u.Path) |
31
|
|
|
|
32
|
|
|
if dir=="/" { |
33
|
|
|
dir="." |
34
|
|
|
} |
35
|
|
|
|
36
|
|
|
filename := u.Path[len(dir):] |
37
|
|
|
|
38
|
|
|
if _, err := os.Stat(dir); os.IsNotExist(err) != false { |
39
|
|
|
os.MkdirAll(dir, 0777) |
40
|
|
|
} |
41
|
|
|
filename = dir + string(os.PathSeparator) + filename |
42
|
|
|
return filename |
43
|
|
|
} |
44
|
|
|
func (si *SitemapIndex) validate() SitemapIndex { |
45
|
|
|
validatedSitemapChannel := make(chan SitemapValidation) |
46
|
|
|
|
47
|
|
|
for _, sitemap := range (*si).Sitemap { |
48
|
|
|
go func(s Sitemap){ |
49
|
|
|
s.validate(validatedSitemapChannel) |
50
|
|
|
}(sitemap) |
51
|
|
|
} |
52
|
|
|
|
53
|
|
|
newSitemapIndex := SitemapIndex{ |
54
|
|
|
XMLNs: si.XMLNs, |
55
|
|
|
} |
56
|
|
|
|
57
|
|
|
for i:=0;i<len((*si).Sitemap);i++ { |
58
|
|
|
validatedSitemap := <-validatedSitemapChannel |
59
|
|
|
if validatedSitemap.IsValid { |
60
|
|
|
newSitemapIndex.Sitemap = append(newSitemapIndex.Sitemap, validatedSitemap.Sitemap) |
61
|
|
|
}else{ |
62
|
|
|
fmt.Printf("Url is dead: %s\n",validatedSitemap.Sitemap.Loc) |
63
|
|
|
} |
64
|
|
|
} |
65
|
|
|
|
66
|
|
|
close(validatedSitemapChannel) |
67
|
|
|
|
68
|
|
|
return newSitemapIndex |
69
|
|
|
} |
70
|
|
|
|
71
|
|
|
func (s *Sitemap) validate(sitemapChannel chan SitemapValidation) { |
72
|
|
|
|
73
|
|
|
resp,err := http.Get((*s).Loc) |
74
|
|
|
if err!=nil { |
75
|
|
|
fmt.Println(err.Error) |
|
|
|
|
76
|
|
|
return |
77
|
|
|
} |
78
|
|
|
|
79
|
|
|
validateSitemap := SitemapValidation { |
80
|
|
|
Sitemap: (*s), |
81
|
|
|
IsValid: true, |
82
|
|
|
} |
83
|
|
|
|
84
|
|
|
if resp.StatusCode != 200 { |
85
|
|
|
validateSitemap.IsValid = false; |
86
|
|
|
} |
87
|
|
|
sitemapChannel <- validateSitemap |
88
|
|
|
|
89
|
|
|
return |
90
|
|
|
} |
91
|
|
|
|
92
|
|
|
func (si *SitemapIndex) saveToFile(filename string) error { |
93
|
|
|
m, err := xml.Marshal((*si)) |
94
|
|
|
if err != nil { |
95
|
|
|
return err |
96
|
|
|
} |
97
|
|
|
|
98
|
|
|
file, err := os.OpenFile(filename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0777) |
99
|
|
|
file.Write([]byte(xml.Header)) |
100
|
|
|
file.Write(m) |
101
|
|
|
file.Close() |
102
|
|
|
return err |
103
|
|
|
} |
104
|
|
|
|
105
|
|
|
func batchProcess(uri string) { |
106
|
|
|
resp, err := http.Get(uri) |
107
|
|
|
if err != nil { |
108
|
|
|
fmt.Printf("Url cannot fetched: %s\n", uri) |
109
|
|
|
fmt.Println(err) |
110
|
|
|
os.Exit(1) |
111
|
|
|
} |
112
|
|
|
|
113
|
|
|
rawXMLData := readXMLFromResponse(resp) |
114
|
|
|
|
115
|
|
|
sitemapIndex := newSitemapIndexFromXML(rawXMLData) |
116
|
|
|
sitemapIndexValidate(sitemapIndex) |
117
|
|
|
} |
118
|
|
|
|
119
|
|
|
func sitemapIndexValidate(sitemapIndex SitemapIndex) { |
120
|
|
|
newSitemapIndex := sitemapIndex.validate() |
121
|
|
|
|
122
|
|
|
for _, sitemap := range newSitemapIndex.Sitemap { |
123
|
|
|
filename := sitemap.findFileName() |
124
|
|
|
if Verbose {fmt.Printf("Filename is %s\n",filename)} |
125
|
|
|
singleProcess(sitemap.Loc, filename) |
126
|
|
|
time.Sleep(time.Second * 2) |
127
|
|
|
} |
128
|
|
|
|
129
|
|
|
newSitemapIndex.saveToFile(OutputFileName) |
130
|
|
|
|
131
|
|
|
} |
132
|
|
|
|
133
|
|
|
func newSitemapIndexFromXML(rawXMLData []byte) SitemapIndex { |
134
|
|
|
sm := SitemapIndex{} |
135
|
|
|
err := xml.Unmarshal(rawXMLData, &sm) |
136
|
|
|
|
137
|
|
|
if err != nil { |
138
|
|
|
fmt.Printf("Sitemap index cannot parsed. Because: %s", err) |
139
|
|
|
return SitemapIndex{} |
140
|
|
|
} |
141
|
|
|
return sm |
142
|
|
|
} |
143
|
|
|
|