1 | package main |
||
2 | |||
3 | import ( |
||
4 | "encoding/xml" |
||
5 | "fmt" |
||
6 | "net/http" |
||
7 | "net/url" |
||
8 | "os" |
||
9 | "path" |
||
10 | "time" |
||
11 | ) |
||
12 | |||
13 | type SitemapIndex struct { |
||
14 | XMLName xml.Name `xml:"sitemapindex"` |
||
15 | XMLNs string `xml:"xmlns,attr"` |
||
16 | Sitemap []Sitemap `xml:"sitemap"` |
||
17 | } |
||
// Sitemap is a single <sitemap> entry of a sitemap index.
type Sitemap struct {
	// Loc is the content of the <loc> element.
	Loc string `xml:"loc"`
	// LastMod is the content of the <lastmod> element; it is left out of
	// the marshalled XML when empty.
	LastMod string `xml:"lastmod,omitempty"`
}
||
22 | type SitemapValidation struct { |
||
23 | IsValid bool |
||
24 | Sitemap Sitemap |
||
25 | } |
||
26 | |||
27 | func (s Sitemap) findFileName() string { |
||
28 | u, _ := url.Parse(s.Loc) |
||
29 | |||
30 | dir := path.Dir(u.Path) |
||
31 | |||
32 | if dir=="/" { |
||
33 | dir="." |
||
34 | } |
||
35 | |||
36 | filename := u.Path[len(dir):] |
||
37 | |||
38 | if _, err := os.Stat(dir); os.IsNotExist(err) != false { |
||
39 | os.MkdirAll(dir, 0777) |
||
40 | } |
||
41 | filename = dir + string(os.PathSeparator) + filename |
||
42 | return filename |
||
43 | } |
||
44 | func (si *SitemapIndex) validate() SitemapIndex { |
||
45 | validatedSitemapChannel := make(chan SitemapValidation) |
||
46 | |||
47 | for _, sitemap := range (*si).Sitemap { |
||
48 | go func(s Sitemap){ |
||
49 | s.validate(validatedSitemapChannel) |
||
50 | }(sitemap) |
||
51 | } |
||
52 | |||
53 | newSitemapIndex := SitemapIndex{ |
||
54 | XMLNs: si.XMLNs, |
||
55 | } |
||
56 | |||
57 | for i:=0;i<len((*si).Sitemap);i++ { |
||
58 | validatedSitemap := <-validatedSitemapChannel |
||
59 | if validatedSitemap.IsValid { |
||
60 | newSitemapIndex.Sitemap = append(newSitemapIndex.Sitemap, validatedSitemap.Sitemap) |
||
61 | }else{ |
||
62 | fmt.Printf("Url is dead: %s\n",validatedSitemap.Sitemap.Loc) |
||
63 | } |
||
64 | } |
||
65 | |||
66 | close(validatedSitemapChannel) |
||
67 | |||
68 | return newSitemapIndex |
||
69 | } |
||
70 | |||
71 | func (s *Sitemap) validate(sitemapChannel chan SitemapValidation) { |
||
72 | |||
73 | resp,err := http.Get((*s).Loc) |
||
74 | if err!=nil { |
||
75 | fmt.Println(err.Error) |
||
0 ignored issues
–
show
introduced
by
![]() |
|||
76 | return |
||
77 | } |
||
78 | |||
79 | validateSitemap := SitemapValidation { |
||
80 | Sitemap: (*s), |
||
81 | IsValid: true, |
||
82 | } |
||
83 | |||
84 | if resp.StatusCode != 200 { |
||
85 | validateSitemap.IsValid = false; |
||
86 | } |
||
87 | sitemapChannel <- validateSitemap |
||
88 | |||
89 | return |
||
90 | } |
||
91 | |||
92 | func (si *SitemapIndex) saveToFile(filename string) error { |
||
93 | m, err := xml.Marshal((*si)) |
||
94 | if err != nil { |
||
95 | return err |
||
96 | } |
||
97 | |||
98 | file, err := os.OpenFile(filename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0777) |
||
99 | file.Write([]byte(xml.Header)) |
||
100 | file.Write(m) |
||
101 | file.Close() |
||
102 | return err |
||
103 | } |
||
104 | |||
105 | func batchProcess(uri string) { |
||
106 | resp, err := http.Get(uri) |
||
107 | if err != nil { |
||
108 | fmt.Printf("Url cannot fetched: %s\n", uri) |
||
109 | fmt.Println(err) |
||
110 | os.Exit(1) |
||
111 | } |
||
112 | |||
113 | rawXMLData := readXMLFromResponse(resp) |
||
114 | |||
115 | sitemapIndex := newSitemapIndexFromXML(rawXMLData) |
||
116 | sitemapIndexValidate(sitemapIndex) |
||
117 | } |
||
118 | |||
119 | func sitemapIndexValidate(sitemapIndex SitemapIndex) { |
||
120 | newSitemapIndex := sitemapIndex.validate() |
||
121 | |||
122 | for _, sitemap := range newSitemapIndex.Sitemap { |
||
123 | filename := sitemap.findFileName() |
||
124 | if Verbose {fmt.Printf("Filename is %s\n",filename)} |
||
125 | singleProcess(sitemap.Loc, filename) |
||
126 | time.Sleep(time.Second * 2) |
||
127 | } |
||
128 | |||
129 | newSitemapIndex.saveToFile(OutputFileName) |
||
130 | |||
131 | } |
||
132 | |||
133 | func newSitemapIndexFromXML(rawXMLData []byte) SitemapIndex { |
||
134 | sm := SitemapIndex{} |
||
135 | err := xml.Unmarshal(rawXMLData, &sm) |
||
136 | |||
137 | if err != nil { |
||
138 | fmt.Printf("Sitemap index cannot parsed. Because: %s", err) |
||
139 | return SitemapIndex{} |
||
140 | } |
||
141 | return sm |
||
142 | } |
||
143 |